Added the option to exclude certain chromosomes

6a857caa · Wijfjes, Raul · d48739a1 · 6a857caa · 6a857caa · 6a857caa
Commit 6a857caa authored 5 years ago by Wijfjes, Raul
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [0.2.0] - 2019-04-25
 ### Added
 - Hecaton can now be run with pre-generated BWA mem alignments
+- Certain chromosomal regions can now be excluded from CNV calling

 ## [0.1.0] - 2019-03-16
 ### Added

--- a/docker/environment_py2.yml
+++ b/docker/environment_py2.yml
@@ -4,7 +4,6 @@ name: hecaton_py2
 channels:
  - bioconda
  - conda-forge
-  - defaults
 dependencies:
  # python
  - python=2.7

--- a/docker/environment_py3.yml
+++ b/docker/environment_py3.yml
@@ -4,7 +4,6 @@ name: hecaton_py3
 channels:
  - bioconda
  - conda-forge
-  - r
  - defaults
 dependencies:
  # python

--- a/nextflow/hecaton.nf
+++ b/nextflow/hecaton.nf
@@ -9,6 +9,7 @@

 params.genome_file = ""
 params.reads = "prefix_{1,2}.fastq"
+params.alignment_exclude = 'NO_EXCLUDE'
 params.model_file = ""
 params.cutoff = 0.7
 params.extra_filtering = false
@@ -36,6 +37,7 @@ def helpMessage() {
    --output_dir: Output directory to which all results will be written
    
    Optional arguments:
+    --alignment_exclude: BED file containing chromosomal regions that will be excluded from CNV calling
    -w: Working directory to which intermediate results will be written. Default: work
    -c: Config file specifying the number of CPU cores and memory that will be assigned to Hecaton
    """.stripIndent()
@@ -80,6 +82,7 @@ genome_N_file = file(params.genome_file + ".N.bed")
 genome_bed = file(params.genome_file + ".genome")
 manta_config_file = file(params.manta_config)
 model_file = file(params.model_file)
+exclude_file = file(params.alignment_exclude)

 /*
 * Align reads to reference genome using speedseq
@@ -98,6 +101,7 @@ process align_reads {
 	file genome_bwa_bwt_file
 	file genome_bwa_pac_file
 	file genome_bwa_sa_file
+	file exclude_file

 	output:
 	set val(prefix), file("${prefix}.bam") into bwa_bams
@@ -108,12 +112,32 @@ process align_reads {
 	set val(prefix), file("${prefix}.splitters.bam.bai") into bwa_splitters_bam_indices

 	script:
-	"""
-	source activate hecaton_py3
-	speedseq align -t ${task.cpus} -o ${prefix} -R \"@RG\tID:${prefix}\tSM:${prefix}\tLB:${prefix}\" \
-	${genome_file} ${reads[0]} ${reads[1]}
-	source deactivate
-	"""
+	if( exclude_file.name == 'NO_EXCLUDE' ) 
+		"""
+		source activate hecaton_py3
+		speedseq align -t ${task.cpus} -o ${prefix} -R \"@RG\tID:${prefix}\tSM:${prefix}\tLB:${prefix}\" \
+		${genome_file} ${reads[0]} ${reads[1]}
+		source deactivate
+		"""
+	else
+		"""
+		source activate hecaton_py3
+		speedseq align -t ${task.cpus} -o ${prefix} -R \"@RG\tID:${prefix}\tSM:${prefix}\tLB:${prefix}\" \
+		${genome_file} ${reads[0]} ${reads[1]} && \
+		samtools view -h -b -L ${exclude_file} ${prefix}.bam > ${prefix}_excluded.bam && \
+		samtools index ${prefix}_excluded.bam && \
+		mv ${prefix}_excluded.bam ${prefix}.bam && \
+		mv ${prefix}_excluded.bam.bai ${prefix}.bam.bai && \
+		samtools view -h -b -L ${exclude_file} ${prefix}.discordants.bam > ${prefix}.discordants_excluded.bam && \
+		samtools index ${prefix}.discordants_excluded.bam && \
+		mv ${prefix}.discordants_excluded.bam ${prefix}.discordants.bam && \
+		mv ${prefix}.discordants_excluded.bam.bai ${prefix}.discordants.bam.bai && \
+		samtools view -h -b -L ${exclude_file} ${prefix}.splitters.bam > ${prefix}.splitters_excluded.bam && \
+		samtools index ${prefix}.splitters_excluded.bam && \
+		mv ${prefix}.splitters_excluded.bam ${prefix}.splitters.bam && \
+		mv ${prefix}.splitters_excluded.bam.bai ${prefix}.splitters.bam.bai && \
+		source deactivate
+		"""
 }

 /*

--- a/nextflow/hecaton_no_align.nf
+++ b/nextflow/hecaton_no_align.nf
@@ -9,6 +9,7 @@

 params.genome_file = ""
 params.bwa_bams = "*.bam"
+params.alignment_exclude = 'NO_EXCLUDE'
 params.model_file = ""
 params.cutoff = 0.7
 params.extra_filtering = false
@@ -36,6 +37,7 @@ def helpMessage() {
    --output_dir: Output directory to which all results will be written
    
    Optional arguments:
+    --alignment_exclude: BED file containing chromosomal regions that will be excluded from CNV calling
    -w: Working directory to which intermediate results will be written. Default: work
    -c: Config file specifying the number of CPU cores and memory that will be assigned to Hecaton
    """.stripIndent()
@@ -80,13 +82,50 @@ genome_file = file(params.genome_file)
 genome_index_file = file(params.genome_file + ".fai")
 genome_N_file = file(params.genome_file + ".N.bed")
 genome_bed = file(params.genome_file + ".genome")
+manta_config_file = file(params.manta_config)
 model_file = file(params.model_file)
+exclude_file = file(params.alignment_exclude)
+
+/*
+ * Exclude chromosomal regions
+ */
+
+process exclude_reads {
+	tag "Excluding reads"
+
+	input:
+	set val(prefix), file(alignment_file), file(alignment_file_index) from bwa_bams
+	file exclude_file
+
+	output:
+	set val(prefix), file("${prefix}.bam"), file("${prefix}.bam.bai") into bwa_bams_excluded
+
+	script:
+	if( exclude_file.name != 'NO_EXCLUDE' ) 
+		"""
+		export PS1=\${PS1:-''}
+		source ~/.bashrc
+		conda activate hecaton_py3
+		samtools view -h -b -L ${exclude_file} ${alignment_file} > ${prefix}_excluded.bam && \
+		samtools index ${prefix}_excluded.bam && \
+		mv ${prefix}_excluded.bam ${prefix}.bam && \
+		mv ${prefix}_excluded.bam.bai ${prefix}.bam.bai && \
+		conda deactivate
+		"""
+	else
+		"""
+		export PS1=\${PS1:-''}
+		source ~/.bashrc
+		mv ${alignment_file} ${prefix}.bam
+		mv ${alignment_file_index} ${prefix}.bam.bai
+		"""
+}

 /*
 * Create channels for all callers
 */

-bwa_bams.into{delly_bams; gridss_bams; lumpy_bams; manta_bams; duphold_bams}
+bwa_bams_excluded.into{delly_bams; gridss_bams; lumpy_bams; manta_bams; duphold_bams}

 /*
 * Call CNVs with LUMPY
@@ -94,6 +133,7 @@ bwa_bams.into{delly_bams; gridss_bams; lumpy_bams; manta_bams; duphold_bams}

 process call_lumpy {
 	label "multithreading"
+	label "lumpy"
 	publishDir "${params.output_dir}/lumpy_calls", mode: 'copy'
 	tag "LUMPY calling: ${prefix}"

@@ -110,7 +150,9 @@ process call_lumpy {

 	script:
 	"""
-	source activate hecaton_py3 &&
+	export PS1=\${PS1:-''}
+	source ~/.bashrc
+	conda activate hecaton_py3 &&
 	speedseq sv -t ${task.cpus} -o ${prefix} \
 	-x ${genome_N_file} \
 	-B ${alignment_file} \
@@ -127,7 +169,7 @@ process call_lumpy {
 	-b ${prefix}_post_processed.bedpe \
 	-f ${genome_index_file} \
 	-o ${prefix}_post_processed_collapsed.bedpe
-	source deactivate
+	conda deactivate
 	"""
 }

@@ -138,6 +180,7 @@ process call_lumpy {
 process call_delly {
 	publishDir "${params.output_dir}/delly_calls", mode: 'copy'
 	tag "DELLY calling: ${prefix}"
+	label "delly"

 	input:
 	set val(prefix), file(alignment_file), file(alignment_file_index) from delly_bams
@@ -152,7 +195,9 @@ process call_delly {

 	script:
 	"""
-	source activate hecaton_py3
+	export PS1=\${PS1:-''}
+	source ~/.bashrc
+	conda activate hecaton_py3
 	delly call -g ${genome_file} -o ${prefix}_delly_cnvs.bcf ${alignment_file} &&
 	bcftools view ${prefix}_delly_cnvs.bcf > ${prefix}_delly_cnvs.vcf &&
 	bgzip -f ${prefix}_delly_cnvs.vcf &&
@@ -169,7 +214,7 @@ process call_delly {
 	-b ${prefix}_post_processed.bedpe \
 	-f ${genome_index_file} \
 	-o ${prefix}_post_processed_collapsed.bedpe
-	source deactivate
+	conda deactivate
 	"""
 }

@@ -179,6 +224,7 @@ process call_delly {

 process call_gridss {
 	label "multithreading"
+	label "gridss"
 	publishDir "${params.output_dir}/gridss_calls", mode: 'copy'
 	tag "GRIDSS calling: ${prefix}"

@@ -192,7 +238,9 @@ process call_gridss {

 	script:
 	"""
-	source activate hecaton_py3
+	export PS1=\${PS1:-''}
+	source ~/.bashrc
+	conda activate hecaton_py3
 	java -ea -Xmx31g \
 	-Dreference_fasta=${genome_file} \
 	-Dsamjdk.create_index=true \
@@ -217,7 +265,7 @@ process call_gridss {
 	-b ${prefix}_post_processed.bedpe \
 	-f ${genome_index_file} \
 	-o ${prefix}_post_processed_collapsed.bedpe
-	source deactivate
+	conda deactivate
 	"""
 }

@@ -229,6 +277,7 @@ process call_manta {
 	label "multithreading"
 	publishDir "${params.output_dir}/manta_calls", mode: 'copy'
 	tag "Manta calling: ${prefix}"
+	label "manta"

 	input:
 	set val(prefix), file(alignment_file), file(alignment_file_index) from manta_bams
@@ -243,7 +292,9 @@ process call_manta {

 	script:
 	"""
-	source activate hecaton_py2
+	export PS1=\${PS1:-''}
+	source ~/.bashrc
+	conda activate hecaton_py2
 	configManta.py --tumorBam ${alignment_file} \
 	--config ${manta_config_file} \
 	--referenceFasta ${genome_file} --runDir ${prefix}_rundir &&
@@ -252,6 +303,8 @@ process call_manta {
 	mv results/variants/tumorSV.vcf.gz ../${prefix}_tumorSV.vcf.gz &&
 	mv results/variants/tumorSV.vcf.gz.tbi ../${prefix}_tumorSV.vcf.gz.tbi &&
 	cd .. &&
+	conda deactivate
+	conda activate hecaton_py3
 	vcf_to_bedpe.py -i ${prefix}_tumorSV.vcf.gz \
 	-o ${prefix}.bedpe -t Manta &&
 	process_simple_cnvs.py -i ${prefix}_tumorSV.vcf.gz \
@@ -264,7 +317,7 @@ process call_manta {
 	-b ${prefix}_post_processed.bedpe \
 	-f ${genome_index_file} \
 	-o ${prefix}_post_processed_collapsed.bedpe
-	source deactivate
+	conda deactivate
 	"""
 }

@@ -287,7 +340,9 @@ process intersecting_calls {

 	script:
 	"""
-	source activate hecaton_py3
+	export PS1=\${PS1:-''}
+	source ~/.bashrc
+	conda activate hecaton_py3
 	intersecting_bedpe_intervals.py \
 	-a lumpy_file.bedpe \
 	-b manta_file.bedpe \
@@ -303,7 +358,7 @@ process intersecting_calls {
 	-b gridss_file.bedpe \
 	-f ${genome_index_file} \
 	-o ${prefix}_intersected.bedpe
-	source deactivate
+	conda deactivate
 	"""
 }

@@ -324,11 +379,13 @@ process prepare_features {

 	script:
 	"""
-	source activate hecaton_py3
+	export PS1=\${PS1:-''}
+	source ~/.bashrc
+	conda activate hecaton_py3
 	intersect_bedpe_to_feature_bedpe.py -i ${intersection_file} \
 	-o ${prefix}_features.bedpe
 	split_feature_set_non_insertions.py -f ${prefix}_features.bedpe -i ${prefix}_insertion_features.bedpe -n ${prefix}_non_insertion_features.bedpe
-	source deactivate
+	conda deactivate
 	"""
 }

@@ -349,7 +406,9 @@ process apply_random_forest {

 	script:
 	"""
-	source activate hecaton_py3
+	export PS1=\${PS1:-''}
+	source ~/.bashrc
+	conda activate hecaton_py3
 	predict_cnvs_model_insertions_probabilities.py \
 	-i ${insertion_file} \
    -m ${model_file} \
@@ -362,7 +421,7 @@ process apply_random_forest {
 	-i ${prefix}_insertion_probabilities.bedpe \
 	-n ${prefix}_non_insertion_probabilities.bedpe \
 	-o ${prefix}_probabilities_unfiltered.bedpe
-	source deactivate
+	conda deactivate
 	"""
 }

@@ -382,12 +441,14 @@ process filter_calls_cutoff {

 	script:
 	"""
-	source activate hecaton_py3
+	export PS1=\${PS1:-''}
+	source ~/.bashrc
+	conda activate hecaton_py3
    filter_calls_by_query.py \
    -i ${probability_file} \
    -q \"SIZE >= 50 & PREDICTION_1 >= ${params.cutoff}\" \
    -o ${prefix}_probabilities_filtered_cutoff_${params.cutoff}.bedpe
-    source deactivate
+    conda deactivate
 	"""
 }

@@ -412,7 +473,9 @@ process filter_calls_median_depth {

 	script:
 	"""
-	source activate hecaton_py3
+	export PS1=\${PS1:-''}
+	source ~/.bashrc
+	conda activate hecaton_py3
 	bedpe_to_vcf.py \
 	-i ${probability_file} \
 	-o ${prefix}.vcf \
@@ -430,7 +493,7 @@ process filter_calls_median_depth {
 	-i ${prefix}_probabilities_duphold.bedpe \
 	-q \"(Chrom_norm_depth < 4 & GC_norm_depth < 4 & Flank_norm_depth < 4) & ((Chrom_norm_depth < 0.7 & GC_norm_depth < 0.7 & Flank_norm_depth < 0.7 & TYPE == 'DEL') | (Chrom_norm_depth > 1.3 & GC_norm_depth > 1.3 & Flank_norm_depth > 1.3 & (TYPE == 'DUP:TANDEM' | TYPE == 'DUP:DISPERSED')) | TYPE == 'INS')\" \
 	-o ${prefix}_probabilities_filtered_cutoff_${params.cutoff}_depth.bedpe
-	source deactivate
+	conda deactivate
 	"""
 }

@@ -453,7 +516,9 @@ process filter_calls_flanking_Ns {

 	script:
 	"""
-	source activate hecaton_py3
+	export PS1=\${PS1:-''}
+	source ~/.bashrc
+	conda activate hecaton_py3
 	breakpoint_slop.py -i ${probability_file} \
 	-o tmp_${prefix}_slop_200.bedpe -s 200 && \
 	bedtools flank -i tmp_${prefix}_slop_200.bedpe \
@@ -471,6 +536,6 @@ process filter_calls_flanking_Ns {
 	-i ${prefix}_probabilities_flanking_Ns.bedpe \
 	-q \"Flanking_Ns < 0.1\" \
 	-o ${prefix}_probabilities_filtered_cutoff_${params.cutoff}_depth_flanking_Ns.bedpe \
-	source deactivate
+	conda deactivate
 	"""
 }