Added filtering script and fixed bugs

1eb490b5 · Wijfjes, Raul · 0ba88762 · 1eb490b5 · 1eb490b5 · 1eb490b5
Commit 1eb490b5 authored 5 years ago by Wijfjes, Raul
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,12 +6,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## Unreleased

+## [0.3.0] - 2019-12-16
+### Added
+- Script to easily filter Hecaton output (nextflow/hecaton_filter_size_cutoff.nf)
+
 ### Changed
 - Added docker pull option in documentation
 - Corrected link to old repository in documentation

-## [0.2.2] - 2019-09-20
+### Fixed
+- Bug during merging in which homozygous insertions are reported for a sample, even if its VCF gives a reference or no call at that position   
+- Typo in hecaton.nf and hecaton_no_align.nf which caused speedseq -m parameter to be equal to the number of threads

+## [0.2.2] - 2019-09-20
 ### Added
 - Test to check if running Hecaton with pre-generated alignments works correctly

@@ -47,7 +54,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 - Initial release of Hecaton

-[Unreleased]: https://git.wur.nl/bioinformatics/hecaton/compare/v0.2.1...master
+[Unreleased]: https://git.wur.nl/bioinformatics/hecaton/compare/v0.3.0...master
+[0.3.0]: https://git.wur.nl/bioinformatics/hecaton/tags/v0.3.0
+[0.2.2]: https://git.wur.nl/bioinformatics/hecaton/tags/v0.2.2
 [0.2.1]: https://git.wur.nl/bioinformatics/hecaton/tags/v0.2.1
 [0.2.0]: https://git.wur.nl/bioinformatics/hecaton/tags/v0.2.0
 [0.1.0]: https://git.wur.nl/bioinformatics/hecaton/tags/v0.1.0

--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -71,7 +71,7 @@ RUN source activate hecaton_py2 && \
 # RUN git clone https://git.wur.nl/wijfj001/hecaton.git && cd hecaton && \
 #	git checkout e85bba0c && cd .. && \
 RUN git clone https://git.wur.nl/bioinformatics/hecaton.git && \
-	echo "8a8bba7a" && \
+	echo "0ba88762" && \
 	chmod +x hecaton/scripts/collapse/* && \
 	chmod +x hecaton/scripts/convert/* && \
 	chmod +x hecaton/scripts/filter/* && \

--- a/nextflow/hecaton.nf
+++ b/nextflow/hecaton.nf
@@ -186,7 +186,7 @@ process call_lumpy {
 	-D ${discordants_alignment_file} \
 	-S ${splitters_alignment_file} \
 	-R ${genome_file} \
-	-m ${task.cpus} &&
+	-m 1 &&
 	vcf_to_bedpe.py -i ${prefix}.sv.vcf.gz \
 	-o ${prefix}.bedpe -t LUMPY &&
 	process_simple_cnvs.py -i ${prefix}.sv.vcf.gz -o ${prefix}_post_processed.bedpe -t LUMPY &&

--- a/nextflow/hecaton_filter_size_cutoff.nf
+++ b/nextflow/hecaton_filter_size_cutoff.nf
+/*
+ *Run the filtering on size and quality score step of Hecaton
+ *
+ *@authors
+ *Raúl Wijfjes <raul.wijfjes@wur.nl>
+ *
+ *Date last modified: 29-11-2019
+ */
+
+params.probability_files = ""
+params.cutoff = 0.7
+params.output_dir = ""
+
+/*
+ * Input parameters validation and setup
+ */
+
+if (! params.output_dir ) {
+    println "Missing output directory parameter"
+    exit 1
+}
+if (! params.probability_files ) {
+    println "Missing probability files parameter"
+    exit 1
+}
+probability_files = Channel.fromPath(params.probability_files)
+			     .map { file -> tuple(getSampleId(file), file)}
+			     .groupTuple()
+
+def getSampleId(file) {
+	file.getBaseName()
+}
+
+/*
+ * Filter calls according to size and cutoff
+ */
+
+process filter_calls_cutoff {
+	publishDir "${params.output_dir}", mode: 'move' 
+	tag "Filter calls of ${prefix}. Cutoff: ${params.cutoff}" 
+
+	input:
+	set val(prefix), file(probability_file) from probability_files
+
+	output:
+	set val(prefix), file("${prefix}_filtered_cutoff_${params.cutoff}.bedpe")
+
+	script:
+	"""
+	source activate hecaton_py3
+    filter_calls_by_query.py \
+    -i ${probability_file} \
+    -q \"SIZE >= 50 & PREDICTION_1 >= ${params.cutoff}\" \
+    -o ${prefix}_filtered_cutoff_${params.cutoff}.bedpe
+    source deactivate
+	"""
+}
--- a/nextflow/hecaton_no_align.nf
+++ b/nextflow/hecaton_no_align.nf
@@ -151,7 +151,7 @@ process call_lumpy {
 	-x ${genome_N_file} \
 	-B ${alignment_file} \
 	-R ${genome_file} \
-	-m ${task.cpus} &&
+	-m 1 &&
 	vcf_to_bedpe.py -i ${prefix}.sv.vcf.gz \
 	-o ${prefix}.bedpe -t LUMPY &&
 	process_simple_cnvs.py -i ${prefix}.sv.vcf.gz -o ${prefix}_post_processed.bedpe -t LUMPY &&

--- a/scripts/genotype/merge_vcf_files.py
+++ b/scripts/genotype/merge_vcf_files.py
@@ -96,12 +96,20 @@ def obtain_sites_and_genotypes(input_fns, heterozygous=True):
                dhffc = record.samples[0]["DHFFC"]
                # set genotypes based on format
                gt = [".", "."]
+                non_calls = [(None, None), (0, 0)]
+                called_gt = record.samples[0]["GT"]
                if not heterozygous:
                    gt[0] = "1"
                    gt[1] = "1"
                else:
                    if sv_type == "INS":
-                        gt[1] = "1"
+                        # do not call INS if single sample shows reference only or no call
+                        if called_gt in non_calls:
+                            gt[0] = called_gt[0]
+                            gt[1] = called_gt[1]
+                        else: 
+                            # call can be only be ./1, based on available evidence
+                            gt[1] = "1"                        
                    elif sv_type == "DEL":
                        if dhfc >= 0 and dhfc < 0.25:
                            gt[0] = "1"