From 1eb490b5e430c37fc6e32d08dc7bae7428b473b6 Mon Sep 17 00:00:00 2001 From: Raul Wijfjes <raul.wijfjes@wur.nl> Date: Mon, 16 Dec 2019 15:46:33 +0100 Subject: [PATCH] Added filtering script and fixed bugs --- CHANGELOG.md | 13 +++++- docker/Dockerfile | 2 +- nextflow/hecaton.nf | 2 +- nextflow/hecaton_filter_size_cutoff.nf | 57 ++++++++++++++++++++++++++ nextflow/hecaton_no_align.nf | 2 +- scripts/genotype/merge_vcf_files.py | 10 ++++- 6 files changed, 80 insertions(+), 6 deletions(-) create mode 100644 nextflow/hecaton_filter_size_cutoff.nf diff --git a/CHANGELOG.md b/CHANGELOG.md index d894836..842f0cb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,12 +6,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +## [0.3.0] - 2019-12-16 +### Added +- Script to easily filter Hecaton output (nextflow/hecaton_filter_size_cutoff.nf) + ### Changed - Added docker pull option in documentation - Corrected link to old repository in documentation -## [0.2.2] - 2019-09-20 +### Fixed +- Bug during merging in which homozygous insertions are reported for a sample, even if its VCF gives a reference or no call at that position +- Typo in hecaton.nf and hecaton_no_align.nf which caused speedseq -m parameter to be equal to the number of threads +## [0.2.2] - 2019-09-20 ### Added - Test to check if running Hecaton with pre-generated alignments works correctly @@ -47,7 +54,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Initial release of Hecaton -[Unreleased]: https://git.wur.nl/bioinformatics/hecaton/compare/v0.2.1...master +[Unreleased]: https://git.wur.nl/bioinformatics/hecaton/compare/v0.3.0...master +[0.3.0]: https://git.wur.nl/bioinformatics/hecaton/tags/v0.3.0 +[0.2.2]: https://git.wur.nl/bioinformatics/hecaton/tags/v0.2.2 [0.2.1]: https://git.wur.nl/bioinformatics/hecaton/tags/v0.2.1 [0.2.0]: https://git.wur.nl/bioinformatics/hecaton/tags/v0.2.0 [0.1.0]: https://git.wur.nl/bioinformatics/hecaton/tags/v0.1.0 diff --git a/docker/Dockerfile b/docker/Dockerfile index 021b7cc..957afb0 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -71,7 +71,7 @@ RUN source activate hecaton_py2 && \ # RUN git clone https://git.wur.nl/wijfj001/hecaton.git && cd hecaton && \ # git checkout e85bba0c && cd .. && \ RUN git clone https://git.wur.nl/bioinformatics/hecaton.git && \ - echo "8a8bba7a" && \ + echo "0ba88762" && \ chmod +x hecaton/scripts/collapse/* && \ chmod +x hecaton/scripts/convert/* && \ chmod +x hecaton/scripts/filter/* && \ diff --git a/nextflow/hecaton.nf b/nextflow/hecaton.nf index 0312a41..edda155 100644 --- a/nextflow/hecaton.nf +++ b/nextflow/hecaton.nf @@ -186,7 +186,7 @@ process call_lumpy { -D ${discordants_alignment_file} \ -S ${splitters_alignment_file} \ -R ${genome_file} \ - -m ${task.cpus} && + -m 1 && vcf_to_bedpe.py -i ${prefix}.sv.vcf.gz \ -o ${prefix}.bedpe -t LUMPY && process_simple_cnvs.py -i ${prefix}.sv.vcf.gz -o ${prefix}_post_processed.bedpe -t LUMPY && diff --git a/nextflow/hecaton_filter_size_cutoff.nf b/nextflow/hecaton_filter_size_cutoff.nf new file mode 100644 index 0000000..e54b92b --- /dev/null +++ b/nextflow/hecaton_filter_size_cutoff.nf @@ -0,0 +1,57 @@ +/* + *Run the filtering on size and quality score step of Hecaton + * + *@authors + *Raúl Wijfjes <raul.wijfjes@wur.nl> + * + *Date last modified: 29-11-2019 + */ + +params.probability_files = "" +params.cutoff = 0.7 +params.output_dir = "" + +/* + * Input parameters validation and setup + */ + +if (! params.output_dir ) { + println "Missing output directory parameter" + exit 1 +} +if (! params.probability_files ) { + println "Missing probability files parameter" + exit 1 +} +probability_files = Channel.fromPath(params.probability_files) + .map { file -> tuple(getSampleId(file), file)} + .groupTuple() + +def getSampleId(file) { + file.getBaseName() +} + +/* + * Filter calls according to size and cutoff + */ + +process filter_calls_cutoff { + publishDir "${params.output_dir}", mode: 'move' + tag "Filter calls of ${prefix}. Cutoff: ${params.cutoff}" + + input: + set val(prefix), file(probability_file) from probability_files + + output: + set val(prefix), file("${prefix}_filtered_cutoff_${params.cutoff}.bedpe") + + script: + """ + source activate hecaton_py3 + filter_calls_by_query.py \ + -i ${probability_file} \ + -q \"SIZE >= 50 & PREDICTION_1 >= ${params.cutoff}\" \ + -o ${prefix}_filtered_cutoff_${params.cutoff}.bedpe + source deactivate + """ +} diff --git a/nextflow/hecaton_no_align.nf b/nextflow/hecaton_no_align.nf index 145ff95..2c6a2b1 100644 --- a/nextflow/hecaton_no_align.nf +++ b/nextflow/hecaton_no_align.nf @@ -151,7 +151,7 @@ process call_lumpy { -x ${genome_N_file} \ -B ${alignment_file} \ -R ${genome_file} \ - -m ${task.cpus} && + -m 1 && vcf_to_bedpe.py -i ${prefix}.sv.vcf.gz \ -o ${prefix}.bedpe -t LUMPY && process_simple_cnvs.py -i ${prefix}.sv.vcf.gz -o ${prefix}_post_processed.bedpe -t LUMPY && diff --git a/scripts/genotype/merge_vcf_files.py b/scripts/genotype/merge_vcf_files.py index 331b026..5123847 100755 --- a/scripts/genotype/merge_vcf_files.py +++ b/scripts/genotype/merge_vcf_files.py @@ -96,12 +96,20 @@ def obtain_sites_and_genotypes(input_fns, heterozygous=True): dhffc = record.samples[0]["DHFFC"] # set genotypes based on format gt = [".", "."] + non_calls = [(None, None), (0, 0)] + called_gt = record.samples[0]["GT"] if not heterozygous: gt[0] = "1" gt[1] = "1" else: if sv_type == "INS": - gt[1] = "1" + # do not call INS if single sample shows reference only or no call + if called_gt in non_calls: + gt[0] = called_gt[0] + gt[1] = called_gt[1] + else: + # call can be only be ./1, based on available evidence + gt[1] = "1" elif sv_type == "DEL": if dhfc >= 0 and dhfc < 0.25: gt[0] = "1" -- GitLab