From 1eb490b5e430c37fc6e32d08dc7bae7428b473b6 Mon Sep 17 00:00:00 2001
From: Raul Wijfjes <raul.wijfjes@wur.nl>
Date: Mon, 16 Dec 2019 15:46:33 +0100
Subject: [PATCH] Added filtering script and fixed bugs

---
 CHANGELOG.md                           | 13 +++++-
 docker/Dockerfile                      |  2 +-
 nextflow/hecaton.nf                    |  2 +-
 nextflow/hecaton_filter_size_cutoff.nf | 57 ++++++++++++++++++++++++++
 nextflow/hecaton_no_align.nf           |  2 +-
 scripts/genotype/merge_vcf_files.py    | 10 ++++-
 6 files changed, 80 insertions(+), 6 deletions(-)
 create mode 100644 nextflow/hecaton_filter_size_cutoff.nf

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d894836..842f0cb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,12 +6,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## Unreleased
 
+## [0.3.0] - 2019-12-16
+### Added
+- Script to easily filter Hecaton output (nextflow/hecaton_filter_size_cutoff.nf)
+
 ### Changed
 - Added docker pull option in documentation
 - Corrected link to old repository in documentation
 
-## [0.2.2] - 2019-09-20
+### Fixed
+- Bug during merging in which homozygous insertions are reported for a sample, even if its VCF gives a reference or no call at that position   
+- Typo in hecaton.nf and hecaton_no_align.nf which caused speedseq -m parameter to be equal to the number of threads
 
+## [0.2.2] - 2019-09-20
 ### Added
 - Test to check if running Hecaton with pre-generated alignments works correctly
 
@@ -47,7 +54,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 - Initial release of Hecaton
 
-[Unreleased]: https://git.wur.nl/bioinformatics/hecaton/compare/v0.2.1...master
+[Unreleased]: https://git.wur.nl/bioinformatics/hecaton/compare/v0.3.0...master
+[0.3.0]: https://git.wur.nl/bioinformatics/hecaton/tags/v0.3.0
+[0.2.2]: https://git.wur.nl/bioinformatics/hecaton/tags/v0.2.2
 [0.2.1]: https://git.wur.nl/bioinformatics/hecaton/tags/v0.2.1
 [0.2.0]: https://git.wur.nl/bioinformatics/hecaton/tags/v0.2.0
 [0.1.0]: https://git.wur.nl/bioinformatics/hecaton/tags/v0.1.0
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 021b7cc..957afb0 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -71,7 +71,7 @@ RUN source activate hecaton_py2 && \
 # RUN git clone https://git.wur.nl/wijfj001/hecaton.git && cd hecaton && \
 #	git checkout e85bba0c && cd .. && \
 RUN git clone https://git.wur.nl/bioinformatics/hecaton.git && \
-	echo "8a8bba7a" && \
+	echo "0ba88762" && \
 	chmod +x hecaton/scripts/collapse/* && \
 	chmod +x hecaton/scripts/convert/* && \
 	chmod +x hecaton/scripts/filter/* && \
diff --git a/nextflow/hecaton.nf b/nextflow/hecaton.nf
index 0312a41..edda155 100644
--- a/nextflow/hecaton.nf
+++ b/nextflow/hecaton.nf
@@ -186,7 +186,7 @@ process call_lumpy {
 	-D ${discordants_alignment_file} \
 	-S ${splitters_alignment_file} \
 	-R ${genome_file} \
-	-m ${task.cpus} &&
+	-m 1 &&
 	vcf_to_bedpe.py -i ${prefix}.sv.vcf.gz \
 	-o ${prefix}.bedpe -t LUMPY &&
 	process_simple_cnvs.py -i ${prefix}.sv.vcf.gz -o ${prefix}_post_processed.bedpe -t LUMPY &&
diff --git a/nextflow/hecaton_filter_size_cutoff.nf b/nextflow/hecaton_filter_size_cutoff.nf
new file mode 100644
index 0000000..e54b92b
--- /dev/null
+++ b/nextflow/hecaton_filter_size_cutoff.nf
@@ -0,0 +1,57 @@
+/*
+ *Run the filtering on size and quality score step of Hecaton
+ *
+ *@authors
+ *Raúl Wijfjes <raul.wijfjes@wur.nl>
+ *
+ *Date last modified: 29-11-2019
+ */
+
+params.probability_files = ""
+params.cutoff = 0.7
+params.output_dir = ""
+
+/*
+ * Input parameters validation and setup
+ */
+
+if (! params.output_dir ) {
+    println "Missing output directory parameter"
+    exit 1
+}
+if (! params.probability_files ) {
+    println "Missing probability files parameter"
+    exit 1
+}
+probability_files = Channel.fromPath(params.probability_files)
+			     .map { file -> tuple(getSampleId(file), file)}
+			     .groupTuple()
+
+def getSampleId(file) {
+	file.getBaseName()
+}
+
+/*
+ * Filter calls according to size and cutoff
+ */
+
+process filter_calls_cutoff {
+	publishDir "${params.output_dir}", mode: 'move' 
+	tag "Filter calls of ${prefix}. Cutoff: ${params.cutoff}" 
+
+	input:
+	set val(prefix), file(probability_file) from probability_files
+
+	output:
+	set val(prefix), file("${prefix}_filtered_cutoff_${params.cutoff}.bedpe")
+
+	script:
+	"""
+	source activate hecaton_py3
+    filter_calls_by_query.py \
+    -i ${probability_file} \
+    -q \"SIZE >= 50 & PREDICTION_1 >= ${params.cutoff}\" \
+    -o ${prefix}_filtered_cutoff_${params.cutoff}.bedpe
+    source deactivate
+	"""
+}
diff --git a/nextflow/hecaton_no_align.nf b/nextflow/hecaton_no_align.nf
index 145ff95..2c6a2b1 100644
--- a/nextflow/hecaton_no_align.nf
+++ b/nextflow/hecaton_no_align.nf
@@ -151,7 +151,7 @@ process call_lumpy {
 	-x ${genome_N_file} \
 	-B ${alignment_file} \
 	-R ${genome_file} \
-	-m ${task.cpus} &&
+	-m 1 &&
 	vcf_to_bedpe.py -i ${prefix}.sv.vcf.gz \
 	-o ${prefix}.bedpe -t LUMPY &&
 	process_simple_cnvs.py -i ${prefix}.sv.vcf.gz -o ${prefix}_post_processed.bedpe -t LUMPY &&
diff --git a/scripts/genotype/merge_vcf_files.py b/scripts/genotype/merge_vcf_files.py
index 331b026..5123847 100755
--- a/scripts/genotype/merge_vcf_files.py
+++ b/scripts/genotype/merge_vcf_files.py
@@ -96,12 +96,20 @@ def obtain_sites_and_genotypes(input_fns, heterozygous=True):
                 dhffc = record.samples[0]["DHFFC"]
                 # set genotypes based on format
                 gt = [".", "."]
+                non_calls = [(None, None), (0, 0)]
+                called_gt = record.samples[0]["GT"]
                 if not heterozygous:
                     gt[0] = "1"
                     gt[1] = "1"
                 else:
                     if sv_type == "INS":
-                        gt[1] = "1"
+                        # do not call INS if single sample shows reference only or no call
+                        if called_gt in non_calls:
+                            gt[0] = called_gt[0]
+                            gt[1] = called_gt[1]
+                        else: 
+                            # call can be only be ./1, based on available evidence
+                            gt[1] = "1"                        
                     elif sv_type == "DEL":
                         if dhfc >= 0 and dhfc < 0.25:
                             gt[0] = "1"
-- 
GitLab