Commit da7a0666 authored by Aflitos, Saulo Alves's avatar Aflitos, Saulo Alves
Browse files

dedup

parent f7a0f9be
......@@ -7,3 +7,9 @@
*.png
*.html
*.out
*/db
*/in
*/output
*/map
*.fa
scons/
#in fasta in csv
#sa1/sa.fa sa1/sa.csv
#sa2/sa.fa sa2/sa.csv
sa3/sa.fa sa3/sa.csv
sa1/sa.fa sa1/sa.csv
sa2/sa.fa sa2/sa.csv
#sh1/sh.fa sh1/sh.csv
#sh2/sh.fa sh2/sh.csv
sh3/sh.fa sh3/sh.csv
sh1/sh.fa sh1/sh.csv
sh2/sh.fa sh2/sh.csv
#sl1/sl.fa sl1/sl.csv
sl2/sl.fa sl2/sl.csv
sl1/sl.fa sl1/sl.csv
#sp1/sp.fa sp1/sp.csv
#sp2/sp.fa sp2/sp.csv
sp1/sp.fa sp1/sp.csv
sp2/sp.fa sp2/sp.csv
sp3/sp.fa sp3/sp.csv
heinz/heinz.fa heinz/heinz.csv
pimp/heinz.fa pimp/pimp.csv
#group name platform library name library size library size min library size max file names
30507AAXX_6 illumina pe 160 144 176 /home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_250/30507AAXX_6_r.fastq;/home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_250/30507AAXX_6_f.fastq
30507AAXX_7 illumina pe 160 144 176 /home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_250/30507AAXX_7_r.fastq;/home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_250/30507AAXX_7_f.fastq
30507AAXX_8 illumina pe 160 144 176 /home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_250/30507AAXX_8_r.fastq;/home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_250/30507AAXX_8_f.fastq
314TYAAXX_1 illumina pe 160 144 176 /home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_250/314TYAAXX_1_f.fastq;/home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_250/314TYAAXX_1_r.fastq
314TYAAXX_2 illumina pe 160 144 176 /home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_250/314TYAAXX_2_f.fastq;/home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_250/314TYAAXX_2_r.fastq
314TYAAXX_3 illumina pe 160 144 176 /home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_250/314TYAAXX_3_f.fastq;/home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_250/314TYAAXX_3_r.fastq
314TYAAXX_4 illumina pe 160 144 176 /home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_250/314TYAAXX_4_f.fastq;/home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_250/314TYAAXX_4_r.fastq
30507AAXX_1 illumina pe 215 193 237 /home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_300/30507AAXX_1_r.fastq;/home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_300/30507AAXX_1_f.fastq
30507AAXX_2 illumina pe 215 193 237 /home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_300/30507AAXX_2_r.fastq;/home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_300/30507AAXX_2_f.fastq
30507AAXX_3 illumina pe 215 193 237 /home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_300/30507AAXX_3_r.fastq;/home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_300/30507AAXX_3_f.fastq
30507AAXX_4 illumina pe 215 193 237 /home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_300/30507AAXX_4_r.fastq;/home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_300/30507AAXX_4_f.fastq
314TYAAXX_6 illumina pe 215 193 237 /home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_300/314TYAAXX_6_f.fastq;/home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_300/314TYAAXX_6_r.fastq
314TYAAXX_7 illumina pe 215 193 237 /home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_300/314TYAAXX_7_f.fastq;/home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_300/314TYAAXX_7_r.fastq
314TYAAXX_8 illumina pe 215 193 237 /home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_300/314TYAAXX_8_f.fastq;/home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_300/314TYAAXX_8_r.fastq
#sample_R illumina pe 500 450 550 /home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_500/sample_R1_001.64.fastq;/home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/Heinz_Illumina_pairedend_500/sample_R2_001.64.fastq
#3018DAAXX_1 illumina mp 1640 1440 1840 /home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/MP_2000/3018DAAXX_1_r.fastq;/home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/MP_2000/3018DAAXX_1_f.fastq
#3018DAAXX_2 illumina mp 2430 2180 2680 /home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/MP_3000/3018DAAXX_2_f.fastq;/home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/MP_3000/3018DAAXX_2_r.fastq
#3018DAAXX_3 illumina mp 2850 2550 3150 /home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/MP_4000/3018DAAXX_3_r.fastq;/home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/MP_4000/3018DAAXX_3_f.fastq
#3018DAAXX_4 illumina mp 3000 2500 3500 /home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/MP_5000/3018DAAXX_4_r.fastq;/home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/MP_5000/3018DAAXX_4_f.fastq
#30THBAAXX_1 illumina mp 1640 1440 1840 /home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/MP_2000/30THBAAXX_1_f.fastq;/home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/MP_2000/30THBAAXX_1_r.fastq
#30THBAAXX_2 illumina mp 2430 2180 2680 /home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/MP_3000/30THBAAXX_2_r.fastq;/home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/MP_3000/30THBAAXX_2_f.fastq
#30THBAAXX_3 illumina mp 2850 2550 3150 /home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/MP_4000/30THBAAXX_3_f.fastq;/home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/MP_4000/30THBAAXX_3_r.fastq
#30THBAAXX_4 illumina mp 3000 2500 3500 /home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/MP_5000/30THBAAXX_4_f.fastq;/home/assembly/tomato150/dev150/assemblies/mapping/heinz/in/MP_5000/30THBAAXX_4_r.fastq
($num,$den,$min,$max,$av,$var)=(0,0,10000000,0,0,0);
while ($cov=<STDIN>) {
if ( $cov >= 1 ) {
$num += $cov;
if ( $cov > $max ) { $max = $cov; };
if ( $cov < $min ) { $min = $cov; };
if ( $den == 0 ) { $av=$cov; $var=0; }
else {
$av = (($av*$den)+$cov)/($den+1);
$var += ($cov-$av)**2;
$dev = sqrt($var/$den);
if ( $den % 10000 == 0 ) {
print "$den\t$av\t$var\t$dev\n";
}
};
$den++;
}
}
$cov=$num/$den;
$dev=sqrt($var/$den);
$mx=$cov+(2*$dev);
print "Length = $den\n";
print "Bases = $num\n";
print "Mean Coverage = $cov\n";
print "Min Coverage = $min\n";
print "Max Coverage = $max\n";
print "Var Coverage = $var\n";
print "Dev Coverage = $dev\n";
print "MX Coverage = $mx\n";
./SConstruct.run --incsv=all3.csv --threads=10 -j 5 heinz
#http://samtools.sourceforge.net/mpileup.shtml
For variant calling by Samtools, we used following commands.
samtools calmd | samtools mpileup | bcftools view: to create raw variant bcf files
Options "-AEur" used for calmd.
generates extra tags (MD tags) with high sensitivity (-Ar), capping it (I dont know what it means) and exporting as uncompresse (-u) bam.
Options "-d 99999 -L 99999 -Dugf" were used for mpileup.
For each position read at most 99999 reads (-d), dont skip indels (-L 99999), output per sample read depth (-D), export as uncompressed BCF (-ug)
Options "-bvcg" were used for view.
output as BCF (-b) showing only variant sites (-vc) for each sample (-g)
vcfutils.pl varFilter: to filter and create variant vcf files
Options "-W 5 -Q 20 -d 4 -D xxx" were used. Upper depth (-D) of each sample was set based on depth distribution of the sample.
Filter adjacent gaps with a window size of 5 (-W) minimum RMS quality of 20 (-Q, i dont know what it means), minimum depth of 4 (-d) and a custom maximum depth depending on the actual depth (i guess average)
/usr/share/samtools/vcfutils.pl
INBAM1=output/heinz.fa.bam.filtered.bam
RUNNAME=output/heinz.fa.bam.filtered.bam
REF=heinz.fa
samtools rmdup $INBAM1 $RUNNAME.dedup.bam
samtools calmd -AEur $RUNNAME.dedup.bam $REF | samtools mpileup -d 99999 -L 99999 -Dugf $REF - | bcftools view -bvcg - > $RUNNAME.dedup.bam.bcf
bcftools view $RUNNAME.dedup.bam.bcf | /usr/share/samtools/vcfutils.pl varFilter -W 5 -Q 20 -d 4 -D100 > $RUNNAME.dedup.bam.bcf.vcf
samtools mpileup $RUNNAME.dedup.bam | awk '{print $4}' | perl mean_coverage.pl
cat output/heinz.fa.bam.filtered.bam.dedup.bam.bcf.vcf | grep -v \# | cut -f 8 | cut -d ';' -f 1 | grep -v INDEL | cut -d '=' -f 2 | perl -ne 'BEGIN{$v=0;$c=0;$min=10000000;$max=0;$av=0;$var=0;} END{ $a=$v/$c; $dev=sqrt($var/$c); $mx=$a+(2*$dev); print "$c\t$v\t$a\t$min\t$max\t$av\t$var\t$dev\t$mx\n"; } $c+=1; $v+=$_; if ($_ < $min) { $min = $_; }; if ($_ > $max) { $max = $_; }; if ($c==1) { $av = $_; $var=0; } else { $av=(($av*$c)+$_)/($c+1); $var+=($_-$av)**2; $dev=sqrt($var/$c); print "$av\t$var\t$dev\n";}'
heinz dedup : BAM : Length = 725,678,368
heinz dedup : BAM : Bases = 15,270,968,181
heinz dedup : BAM : Mean Coverage = 21.04
heinz dedup : BAM : Min Coverage = 1
heinz dedup : BAM : Max Coverage = 8,010
heinz dedup : BAM : Var Coverage = 934,577,172,304.24
heinz dedup : BAM : Dev Coverage = 35.89
heinz dedup : BAM : MX Coverage = 92.82
heinz dedup : VCF : POSITION = 370,742
heinz dedup : VCF : SUM COV = 7,425,739
heinz dedup : VCF : AVG COV = 20.03
heinz dedup : VCF : MIN COV = 4
heinz dedup : VCF : MAX COV = 100
heinz dedup : VCF : VAR COV = 75,974,225.29
heinz dedup : VCF : STDDEV COV = 14.31
heinz dedup : VCF : 2DEV COV = 48.66
RF_001 : BAM : Length = 733,900,334
RF_001 : BAM : Bases = 28,464,215,562
RF_001 : BAM : Mean Coverage = 38.78
RF_001 : BAM : Min Coverage = 1
RF_001 : BAM : Max Coverage = 7,861
RF_001 : BAM : Var Coverage = 1,355,682,620,730.03
RF_001 : BAM : Dev Coverage = 42.98
RF_001 : BAM : MX Coverage = 124.74
RF_001 : VCF : POSITION = 237,532
RF_001 : VCF : SUM COV = 10,093,075
RF_001 : VCF : AVG COV = 42.49
RF_001 : VCF : MIN COV = 4
RF_001 : VCF : MAX COV = 105
RF_001 : VCF : VAR COV = 97,797,311.67
RF_001 : VCF : STDDEV COV = 20.29
RF_001 : VCF : 2DEV COV = 82.58
samtools mpileup ~/tomato150/Disk02_bam_ass_nobackup/reseq/mapped/Heinz/RF_001_SZAXPI008746-45.bam | awk '{print $4}' | perl mean_coverage.pl | tee RF_001.bam.cov_stats
gunzip -c ~/tomato150/programs/snpeff/newest/datain/RF_001_SZAXPI008746-45.vcf.gz | grep -v \# | cut -f 8 | cut -d ';' -f 1 | grep -v INDEL | cut -d '=' -f 2 | perl -ne 'BEGIN{$v=0;$c=0;$min=10000000;$max=0;$av=0;$var=0;} END{ $a=$v/$c; $dev=sqrt($var/$c); $mx=$a+(2*$dev); print "$c\t$v\t$a\t$min\t$max\t$av\t$var\t$dev\t$mx\n"; } $c+=1; $v+=$_; if ($_ < $min) { $min = $_; }; if ($_ > $max) { $max = $_; }; if ($c==1) { $av = $_; $var=0; } else { $av=(($av*$c)+$_)/($c+1); $var+=($_-$av)**2; $dev=sqrt($var/$c); print "$av\t$var\t$dev\n";}'
wc -l *.vcf
410.804 heinz.fa.bam.filtered.bam.dedup.bam.bcf.vcf
200.015 heinz.fa.bam.filtered.bam.dedup.bam.bcf.vcf.qual20.vcf
610.819 total
=========================================================================================
($num,$den,$min,$max,$av,$var)=(0,0,10000000,0,0,0);
while ($cov=<STDIN>) {
if ( $cov >= 1 ) {
$c += 1;
$num += $cov;
if ( $cov > $max ) { $max = $cov; };
if ( $cov < $min ) { $min = $cov; };
if ( $c == 0 ) { $av=$cov; $var=0; }
else {
$av = (($av*$den)+$cov)/($den+1);
$var += ($cov-$av)**2;
$dev = sqrt($var/$den);
if ( $den % 10000 == 0 ) {
print "$den\t$av\t$var\t$dev\n";
}
};
$den++;
}
}
$cov=$num/$den;
$dev=sqrt($var/$c);
$mx=$cov+(2*$dev);
print "Length = $den\n";
print "Bases = $num\n";
print "Mean Coverage = $cov\n";
print "Min Coverage = $min\n";
print "Max Coverage = $max\n";
print "Var Coverage = $var\n";
print "Dev Coverage = $dev\n";
print "MX Coverage = $mx\n";
#!/bin/bash
set -xeu
WID=800
HEI=600
NUM=9
F1=$1
F2=$2
if [[ ! -f "white.png" ]]; then
convert -size ${WID}x${HEI} xc:white white.png
fi
function do_report {
reportfolderOut=$1
reportfolder=${reportfolderOut}_fastqc
if [[ ! -f "$reportfolder/Images/kmer_profiles.png" ]]; then
#kmer_profiles.png
cp white.png $reportfolder/Images/kmer_profiles.png
fi
images=`find $reportfolder/Images -type f -name '*.png' | sort | tr '\n' ' '`
#echo " IMAGES $images"
IMC=`find $reportfolder/Images -type f -name '*.png' | wc -l`
if [[ "$IMC" -ne "$NUM" ]]; then
echo "wrong number of images $IMC (should be $NUM)"
exit 1
fi
if [[ -f "$reportfolderOut.png" ]]; then
#rm $reportfolder.png
echo -n
fi
WIDT=$(($WID*1 ))
HEIT=$(($HEI*$NUM))
CMD="montage -verbose -label '%f' -font Helvetica -pointsize 14 -background 'white' -fill 'black' -define png:size=${WIDT}x${HEIT} -geometry ${WID}x${HEI}+2+2 -tile 1x$NUM -auto-orient $images $reportfolderOut.png"
echo $CMD
eval $CMD
}
N1=${F1%%.*}
N2=${F2%%.*}
O1=$N1.png
O2=$N2.png
do_report $N1
do_report $N2
LCN=`python -c "import os,sys; print os.path.commonprefix([sys.argv[1], sys.argv[2]])" $N1 $N2`.png
LCN=`basename $LCN`
WIDT=$(($WID*2 ))
HEIT=$(($HEI*$NUM))
CMD="montage -verbose -label '%f' -font Helvetica -pointsize 14 -background 'white' -fill 'black' -define png:size=${WIDT}x${HEIT} -geometry ${WID}x${HEIT}+2+2 -tile 2x1 -auto-orient $O1 $O2 $LCN"
echo " LCN $LCN"
echo " width $WID height $HEI num $NUM images 2"
echo " width total $WIDT height total $HEIT"
#echo " CMD $CMD"
eval $CMD
../heinz/mean_coverage.pl
\ No newline at end of file
#group name platform library name library size library size min library size max file names
C3VVEACXX illumina pe 160 144 176 /home/assembly/tomato150/dev150/assemblies/mapping/pimp/in/pimpinellifolium_NoIndex_L003_R1_001.fastq.gz;/home/assembly/tomato150/dev150/assemblies/mapping/pimp/in/pimpinellifolium_NoIndex_L003_R2_001.fastq.gz
./SConstruct.run --incsv=all3.csv --threads=10 -j 5 pimp
INBAM1=/home/assembly/tomato150/dev150/assemblies/mapping/pimp/output/C3VVEACXX.bam.filtered.bam
RUNNAME=/home/assembly/tomato150/dev150/assemblies/mapping/pimp/output/C3VVEACXX.bam.filtered.bam
REF=heinz.fa
samtools rmdup $INBAM1 $RUNNAME.dedup.bam
samtools calmd -AEur $RUNNAME.dedup.bam $REF | samtools mpileup -d 99999 -L 99999 -Dugf $REF - | bcftools view -bvcg - > $RUNNAME.dedup.bam.bcf
bcftools view $RUNNAME.dedup.bam.bcf | /usr/share/samtools/vcfutils.pl varFilter -W 5 -Q 20 -d 4 -D100 > $RUNNAME.dedup.bam.bcf.vcf
samtools mpileup $RUNNAME.dedup.bam | awk '{print $4}' | perl mean_coverage.pl
cat $RUNNAME.dedup.bam.bcf.vcf | \
grep -v \# | \
cut -f 8 | \
cut -d ';' -f 1 | \
grep -v INDEL | \
cut -d '=' -f 2 | \
perl -ne 'BEGIN{$v=0;$c=0;$min=10000000;$max=0;$av=0;$var=0;} END{ $a=$v/$c; $dev=sqrt($var/$c); $mx=$a+(2*$dev); print "$c\t$v\t$a\t$min\t$max\t$av\t$var\t$dev\t$mx\n"; } $c+=1; $v+=$_; if ($_ < $min) { $min = $_; }; if ($_ > $max) { $max = $_; }; if ($c==1) { $av = $_; $var=0; } else { $av=(($av*$c)+$_)/($c+1); $var+=($_-$av)**2; $dev=sqrt($var/$c); print "$av\t$var\t$dev\n";}' | \
tee $RUNNAME.dedup.bam.bcf.vcf.stats
samtools index $RUNNAME.dedup.bam
samtools depth $RUNNAME.dedup.bam > $RUNNAME.dedup.bam.cov
samtools idxstats $RUNNAME.dedup.bam > $RUNNAME.dedup.bam.idxflags
samtools flagstat $RUNNAME.dedup.bam > $RUNNAME.dedup.bam.stats
../heinz/snpcalling.txt
\ No newline at end of file
......@@ -9,6 +9,8 @@ import glob
import atexit
from pprint import pprint as pp
#./SConstruct.runReal --incsv=all3.csv --threads=20 -j 5 all
dry_run = False
#dry_run = True
......@@ -255,7 +257,7 @@ def addCommand(name, cmd, TARGET=None, SOURCE=[], DATA={}, DEPS=[], REQS=[], PRE
valStr = " ".join([str(x) for x in val])
valindex = data.index(val)
data[valindex] = valStr
print " CONVERTING %s FROM %s TO %s" % (key, val, valStr)
print " CONVERTING FROM %s TO %s" % (val, valStr)
#print "checking target"
......@@ -462,13 +464,13 @@ def main():
os.makedirs(out_folder)
projsetup = {
'proj_name' : proj_name,
'in_fasta' : in_fasta,
'in_csv' : in_csv,
'base_folder': base_folder,
'db_folder' : db_folder,
'map_folder' : map_folder,
'out_folder' : out_folder
'proj_name' : proj_name,
'in_fasta' : in_fasta,
'in_csv' : in_csv,
'base_folder': base_folder,
'db_folder' : db_folder,
'map_folder' : map_folder,
'out_folder' : out_folder
}
pp(projsetup)
setup.append(projsetup)
......@@ -488,8 +490,8 @@ def main():
print "target %s: %s" % (projsetup['proj_name'].upper(), projsetup['outfile'])
print "\n\n\n"
def project( setup ):
def project( setup ):
datas = {}
##########################
......@@ -608,10 +610,10 @@ def process454(data, setup):
out_fastq = out_prefix + '.fastq'
out_fastq_0 = out_fastq.replace('.fastq', '_0.fastq')
out_fastq_f = out_fastq.replace('.fastq', '_1.fastq')
out_fastq_r = out_fastq.replace('.fastq', '_2.fastq')
out_fastq_n = out_fastq.replace('.fastq', '_n.fastq')
out_fastq_0 = out_fastq.replace('.fastq', '_0.fastq')
data[comm_name]['file_names' ] = [out_fastq_f, out_fastq_r]
data[comm_name]['file_names_454' ] = infile
......@@ -758,16 +760,22 @@ def run_bwa(data, setup):
CMD_MAPPER1 = "%(bwa_exe)s aln -t %(threads)d %(db_name)s %(fastq1cmd)s > %(TARGET)s"
CMD_MAPPER2 = "%(bwa_exe)s aln -t %(threads)d %(db_name)s %(fastq2cmd)s > %(TARGET)s"
CMD_ALIGNER = "%(bwa_exe)s sampe -o 20 -P -f %(TARGET)s "
CMD_ALIGNER = "%(bwa_exe)s sampe -o 20 -P "
#CMD_ALIGNER = "%(bwa_exe)s sampe -o 20 -P -f %(TARGET)s "
if 'gsize' in info:
CMD_ALIGNER += " -a %(gmax)s "
CMD_ALIGNER += "%(db_name)s %(sai1)s %(sai2)s %(fastq1cmd)s %(fastq2cmd)s"
CMD_CONVERT = "%(samtools_exe)s view -S -b -q 30 -1 %(sam)s > %(TARGET)s" # TARGET = bam_tmp
CMD_ALIGNER += " | %(samtools_exe)s view -Su -b -q 30 -1 -"
#CMD_CONVERT = "%(samtools_exe)s view -S -b -q 30 -1 %(sam)s > %(TARGET)s" # TARGET = bam_tmp
#CMD_MV1 = "mv %(bam_tmp_rnd)s %(bam_tmp)s"
CMD_SORT = "%(samtools_exe)s sort -m 53687091200 -o -l 1 -@ 20 -m 2G %(bam_tmp)s %(TARGET)s > %(TARGET)s"
CMD_ALIGNER += " | %(samtools_exe)s sort -m 53687091200 -o -l 1 -@ 20 -m 2G - %(TARGET)s > %(TARGET)s"
#CMD_SORT = "%(samtools_exe)s sort -m 53687091200 -o -l 1 -@ 20 -m 2G %(bam_tmp)s %(TARGET)s > %(TARGET)s"
#samtools sort -m 53687091200 -o -l 1 -@ 20 -m 2G 136.bam /run/shm/bwa_scons_mapping_13117_J9DFw5 > /run/shm/bwa_scons_mapping_13117_J9DFw5
#CMD_MV2 = "mv %(bam_pfx_rnd_bam)s %(bam)s"
CMD_INDEX = '%(samtools_exe)s index %(bam)s'
CMD_COV = '%(samtools_exe)s depth %(bam)s > %(TARGET)s'
CMD_STATS = '%(samtools_exe)s idxstats %(bam)s > %(TARGET)s'
......@@ -777,19 +785,21 @@ def run_bwa(data, setup):
addCommand('map' , CMD_MAPPER1, TARGET = info['sai1'] , SOURCE = info['fastq1'] , DATA = info, REQS = [ info['bwt_index'] ] , USETMP=True )
addCommand('map' , CMD_MAPPER2, TARGET = info['sai2'] , SOURCE = info['fastq2'] , DATA = info, REQS = [ info['bwt_index'] ] , USETMP=True )
addCommand('align' , CMD_ALIGNER, TARGET = info['sam'] , SOURCE = [ info['fastq1'], info['fastq2'] ] , DATA = info, DEPS = [ bwt_index, info['sai1'], info['sai2'] ], USETMP=True )
#addCommand('align' , CMD_ALIGNER, TARGET = info['sam'] , SOURCE = [ info['fastq1'], info['fastq2'] ] , DATA = info, DEPS = [ bwt_index, info['sai1'], info['sai2'] ], USETMP=True )
addCommand('align' , CMD_ALIGNER, TARGET = info['bam'] , SOURCE = [ info['fastq1'], info['fastq2'] ] , DATA = info, DEPS = [ bwt_index, info['sai1'], info['sai2'] ], USETMP=True )
CMD_CONVERT_tmp = False
if (not os.path.exists(info['bam'])) or ( os.path.getsize(info['bam']) == 0 ):
CMD_CONVERT_tmp = True
addCommand('convert' , CMD_CONVERT, TARGET = info['bam_tmp'] , SOURCE = info['sam' ] , DATA = info , USETMP=True )
#addCommand('convert' , CMD_CONVERT, TARGET = info['bam_tmp'] , SOURCE = info['sam' ] , DATA = info , USETMP=True )
#addCommand('moving' , CMD_MV1 , TARGET = info['bam_tmp' ] , SOURCE = info['bam_tmp_rnd'] , DATA = info)
CMD_SORT_tmp = False
if (not os.path.exists(info['bam'])) or ( os.path.getsize(info['bam']) == 0 ):
CMD_SORT_tmp = True
addCommand('sort' , CMD_SORT , TARGET = info['bam'] , SOURCE = info['bam_tmp' ] , DATA = info , USETMP=True)
#CMD_SORT_tmp = False
#if (not os.path.exists(info['bam'])) or ( os.path.getsize(info['bam']) == 0 ):
# CMD_SORT_tmp = True
#addCommand('sort' , CMD_SORT , TARGET = info['bam'] , SOURCE = info['bam_tmp' ] , DATA = info , USETMP=True)
#addCommand('moving' , move , TARGET = info['bam'] , SOURCE = info['bam_pfx_rnd_bam'] , DATA = info)
addCommand('index' , CMD_INDEX , TARGET = info['bai'] , SOURCE = info['bam'] , DATA = info )
......@@ -825,29 +835,30 @@ def run_bwa(data, setup):
addCommand('preoking' , CMD_PRE_OK , TARGET = pre_ok , SOURCE = oks, DATA = data )
##########################
## IF NOT TO MERGE, STOPS HERE
##########################
setup['data'] = data
if not do_merge:
print "FINISHED", setup['proj_name']
env.Alias( setup['proj_name'], pre_ok )
setup['bamdata'] = ""
setup['outfile'] = pre_ok
return [ pre_ok ]
RND_NAME1 = None
RND_NAME2 = None
bam_tmp = '%s.tmp.bam' % bam_name
bam_sht = '%s' % bam_name
bam_out = '%s.bam' % bam_name
if ( not do_merge ) or ( len(bams) == 1 ):
bam_name = bams[0]
RND_NAME1 = ""
RND_NAME2 = ""
bam_tmp = '%s.tmp.bam' % bam_name
bam_sht = '%s' % bam_name
bam_out = '%s' % bam_name
else:
RND_NAME1 = tempfile.mktemp(dir=TMP_FOLDER, prefix='bwa_', suffix='.bam')
RND_NAME2 = tempfile.mktemp(dir=TMP_FOLDER, prefix='bwa_', suffix='.bam')
bam_tmp = '%s.tmp.bam' % bam_name
bam_sht = '%s' % bam_name
bam_out = '%s.bam' % bam_name
##########################
## MERGE BAMS
##########################
bam_tmp = '%s.tmp.bam' % bam_name
bam_sht = '%s' % bam_name
bam_out = '%s.bam' % bam_name
RND_NAME1 = tempfile.mktemp(dir=TMP_FOLDER, prefix='bwa_', suffix='.bam')
RND_NAME2 = tempfile.mktemp(dir=TMP_FOLDER, prefix='bwa_', suffix='.bam')
bam_data = {
'samtools_exe': SAMTOOLS_EXE,
......@@ -864,6 +875,7 @@ def run_bwa(data, setup):
'stat' : bam_out + '.stats',
'bok' : bam_out + '.ok',
'flag' : bam_out + '.flagstats',
'filter' : bam_out + '.filtered.bam',
#'fix' : bam_out + '.filtered.bam.fixmate.bam',
#'dedup' : bam_out + '.filtered.bam.dedup.bam',
......@@ -872,6 +884,16 @@ def run_bwa(data, setup):
'fstat' : bam_out + '.filtered.bam.stats',
'fflag' : bam_out + '.filtered.bam.flagstats',
'fok' : bam_out + '.filtered.bam.ok',
'dedup' : bam_out + '.filtered.bam.dedup.bam',
#'fix' : bam_out + '.filtered.bam.fixmate.bam',
#'dedup' : bam_out + '.filtered.bam.dedup.bam',
'dindex' : bam_out + '.filtered.bam.dedup.bam.bai',
'dcov' : bam_out + '.filtered.bam.dedup.bam.cov',
'dstat' : bam_out + '.filtered.bam.dedup.bam.stats',
'dflag' : bam_out + '.filtered.bam.dedup.bam.flagstats',
'dok' : bam_out + '.filtered.bam.dedup.bam.ok',
'bams' : " ".join(bams)
}
......@@ -896,6 +918,26 @@ def run_bwa(data, setup):
##########################
## IF NOT TO MERGE, STOPS HERE
##########################
setup['data'] = data
if ( not do_merge ) or ( len(bams) == 1 ):
bam_data['bok'] = info['ok']
filterbam(bam_data)
print "FINISHED", setup['proj_name']
env.Alias( setup['proj_name'], bam_data['dok'] )
setup['bamdata'] = bam_data
setup['outfile'] = bam_data['dok']
return [ bam_data['dok'] ]
##########################
## MERGE BAMS
##########################
# attatch RG tag (source), min compression
CMD_MERGE = '%(samtools_exe)s merge -r -1 %(TARGET)s %(bams)s'
#CMD_MV1 = "mv %(bamrnd1)s %(bam)s"
......@@ -904,15 +946,6 @@ def run_bwa(data, setup):
CMD_STAT = '%(samtools_exe)s idxstats %(bam)s > %(TARGET)s'
CMD_FLAG = '%(samtools_exe)s flagstat %(bam)s > %(TARGET)s'
CMD_BOK = touch
CMD_FILTER = '%(samtools_exe)s view %(sam_filter)s -b %(bam)s > %(TARGET)s'
#CMD_MV2 = 'mv %(bamrnd2)s %(filter)s'
#CMD_FIX = '%(samtools_exe)s fixmate %(filter)s %(fix)s'
#CMD_DEDUP = '%(samtools_exe)s rmdup -S %(filter)s %(dedup)s'
CMD_FINDEX = '%(samtools_exe)s index %(filter)s'
CMD_FCOV = '%(samtools_exe)s depth %(filter)s > %(TARGET)s'
CMD_FSTAT = '%(samtools_exe)s idxstats %(filter)s > %(TARGET)s'
CMD_FFLAG = '%(samtools_exe)s flagstat %(filter)s > %(TARGET)s'
CMD_FOK = touch
CMD_MERGE_tmp = False
......@@ -930,6 +963,26 @@ def run_bwa(data, setup):
bdeps = [ bam_data['index'], bam_data['cov'], bam_data['stat'], bam_data['flag'] ]
addCommand('bam ok' , CMD_BOK , TARGET = bam_data['bok'] , SOURCE = bdeps )
filterbam(bam_data)
if __name__ == 'SCons.Script':
print "FINISHED", setup['proj_name']
env.Alias( setup['proj_name'], bam_data['dok'] )
setup['bamdata'] = bam_data
setup['outfile'] = bam_data['dok']
return [ bam_data['dok'] ]
def filterbam( bam_data ):
CMD_FILTER = '%(samtools_exe)s view %(sam_filter)s -b %(bam)s > %(TARGET)s'
#CMD_MV2 = 'mv %(bamrnd2)s %(filter)s'
#CMD_FIX = '%(samtools_exe)s fixmate %(filter)s %(fix)s'
#CMD_DEDUP = '%(samtools_exe)s rmdup -S %(filter)s %(dedup)s'
CMD_FINDEX = '%(samtools_exe)s index %(filter)s'
CMD_FCOV = '%(samtools_exe)s depth %(filter)s > %(TARGET)s'
CMD_FSTAT = '%(samtools_exe)s idxstats %(filter)s > %(TARGET)s'
CMD_FFLAG = '%(samtools_exe)s flagstat %(filter)s > %(TARGET)s'
CMD_FOK = touch
CMD_FILTER_tmp = False
if ( not os.path.exists( bam_data['filter'] )) or ( os.path.getsize(bam_data['filter']) == 0 ):
CMD_FILTER_tmp = True
......@@ -951,13 +1004,35 @@ def run_bwa(data, setup):
] , DATA = bam_data )
if __name__ == 'SCons.Script':
print "FINISHED", setup['proj_name']
env.Alias( setup['proj_name'], bam_data['fok'] )
setup['bamdata'] = bam_data
setup['outfile'] = bam_data['fok']
return [ bam_data['fok'] ]
CMD_DEDUP = '%(samtools_exe)s rmdup %(filter)s %(TARGET)s'
#CMD_MV2 = 'mv %(bamrnd2)s %(filter)s'
#CMD_FIX = '%(samtools_exe)s fixmate %(filter)s %(fix)s'
#CMD_DEDUP = '%(samtools_exe)s rmdup -S %(filter)s %(dedup)s'
CMD_DINDEX = '%(samtools_exe)s index %(filter)s'
CMD_DCOV = '%(samtools_exe)s depth %(filter)s > %(TARGET)s'
CMD_DSTAT = '%(samtools_exe)s idxstats %(filter)s > %(TARGET)s'
CMD_DFLAG = '%(samtools_exe)s flagstat %(filter)s > %(TARGET)s'
CMD_DOK = touch
CMD_DEDUP_tmp = False
if ( not os.path.exists( bam_data['dedup'] )) or ( os.path.getsize(bam_data['dedup']) == 0 ):
CMD_DEDUP_tmp = True
addCommand('deduping' , CMD_DEDUP , TARGET = bam_data['dedup'] , SOURCE = bam_data['filter'] , DATA = bam_data, DEPS = [ bam_data['fok'] ], USETMP=CMD_DEDUP_tmp )
#addCommand('moving' , CMD_MV2 , TARGET = bam_data['filter'] , SOURCE = bam_data['bamrnd2'], DATA = bam_data )
addCommand('Dindexing' , CMD_DINDEX, TARGET = bam_data['dindex'] , SOURCE = bam_data['dedup'] , DATA = bam_data )
addCommand('Dcoverage' , CMD_DCOV , TARGET = bam_data['dcov'] , SOURCE = bam_data['dedup'] , DATA = bam_data, DEPS = [ bam_data['dindex'] ], USETMP=True )
addCommand('Dstatistics' , CMD_DSTAT , TARGET = bam_data['dstat'] , SOURCE = bam_data['dedup'] , DATA = bam_data, DEPS = [ bam_data['dindex'] ], USETMP=True )
addCommand('Dflag stats' , CMD_DFLAG , TARGET = bam_data['dflag'] , SOURCE = bam_data['dedup'] , DATA = bam_data, DEPS = [ bam_data['dindex'] ], USETMP=True )
addCommand('Dok' , CMD_DOK , TARGET = bam_data['dok'] , SOURCE = [
bam_data['dindex' ],
bam_data['dcov' ],
bam_data['dstat' ],
bam_data['dflag' ],
bam_data['fok']
] , DATA = bam_data )
def bf_to_str(bf):
......