Commit 9ff205ce authored by Aflitos, Saulo Alves

first
#!/bin/bash
set -xeu
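# Purpose (inferred from the pipeline below): list the read IDs that
# run_assembly_removeDuplicateReads.pl discards, by diffing the full ID list
# of a FASTQ pair against the IDs the filter keeps.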
FASTQ1=$1
FASTQ2=$2
FASTQ2RC=$FASTQ2.rc.fastq
echo "FASTQ 1 $FASTQ1"
echo "FASTQ 2 $FASTQ2"
echo "FASTQ RC $FASTQ2RC"
echo
ALL=out/all
VALID=out/valid
INVALID=out/invalid
mkdir -p out # the redirects below fail if the output dir does not exist
if [[ ! -f "$FASTQ1" ]]; then
echo "NO INPUT FASTQ 1"
exit 1
fi
if [[ ! -f "$FASTQ2" ]]; then
echo "NO INPUT FASTQ 2"
exit 1
fi
if [[ ! -f "$ALL" ]]; then
echo "GETTING ALL"
grep "^@" $FASTQ1 $FASTQ2 | sed 's#/[1,2]##g' | sort > $ALL
fi
#echo "GETTING RC"
#if [[ ! -f "FASTQ2RC" ]]; then
# cat $FASTQ2 | fastqreversecomp.py > $FASTQ2RC
#fi
echo "FILTERING"
if [[ ! -f "$VALID" ]]; then
scripts/run_assembly_removeDuplicateReads.pl $FASTQ1 $FASTQ2 | grep "^@" | sed 's#/[1,2]##g' | sort > $VALID
fi
echo "GENERATING INVALID LIST"
sort $ALL $VALID | uniq -u > $INVALID
echo
echo "ALL : "`wc -l $ALL`
echo "VALID : "`wc -l $VALID`
echo "INVALID : "`wc -l $INVALID`
echo
echo "DONE"
#!/bin/bash
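# Usage: $0 reads_1.fastq reads_2.fastq   (file names illustrative)
# Removes duplicate read pairs following the recipe from the blog post cited
# below; output names are derived from the input names.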
READ1=$1
READ2=$2
MNAME=${READ1}_${READ2}
#http://compbionovice.blogspot.nl/2012/10/i-recently-ran-into-issue-with-assembly.html
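# Approach, as implemented below: flatten each FASTQ to "ID SEQ" lines,
# concatenate the mate sequences, keep only the first occurrence of each
# concatenation, then pull the surviving records back out of the FASTQs by ID.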
#####Prepare fastq files by removing qual scores and placing the sequence as a 2nd column to the IDs.
if [[ ! -f "$READ1.noqual.txt" ]]; then
echo "converting $READ1 to fasta"
grep /1$ -A 1 $READ1 | sed '/--/d' | awk '/[0-9]$/{printf $0" ";next;}1' | sed 's#/1##g' > $READ1.noqual.txt
fi
if [[ ! -f "$READ2.noqual.txt" ]]; then
echo "converting $READ2 to fasta"
grep /2$ -A 1 $READ2 | sed '/--/d' | awk '/[0-9]$/{printf $0" ";next;}1' | sed 's#/2##g' > $READ2.noqual.txt
fi
#####Merge sequences and remove duplicates.
if [[ ! -f "${MNAME}_rmdup.txt" ]]; then
echo "merging sequences"
awk '{p=$2; getline<f} !A[$2=p $2]++' f=$READ2.noqual.txt $READ1.noqual.txt | tr " " "\t" > ${MNAME}_rmdup.txt
fi
#####Prepare read IDs to be used for searching.
if [[ ! -f "${MNAME}_rmdup_IDs.txt" ]]; then
echo "getting ids"
cut -f1 ${MNAME}_rmdup.txt > ${MNAME}_rmdup_IDs.txt
fi
if [[ ! -f "${READ1}_rmdup_IDs.txt" ]]; then
echo "creating forbidden ids for $READ1"
sed 's#$#/1#g' ${MNAME}_rmdup_IDs.txt > ${READ1}_rmdup_IDs.txt
fi
if [[ ! -f "${READ2}_rmdup_IDs.txt" ]]; then
echo "creating forbidden ids for $READ2"
sed 's#$#/2#g' ${MNAME}_rmdup_IDs.txt > ${READ2}_rmdup_IDs.txt
fi
#####Get non-duplicate fastq reads.
if [[ ! -f "${READ1}_rmdup.fastq" ]]; then
echo "filtering $READ1"
awk 'NR==FNR{A[$1]=$1;next} $1 in A{print;getline;print;getline;print;getline;print}' FS="\t" ${READ1}_rmdup_IDs.txt FS="\t" OFS="\t" ${READ1} > ${READ1}_rmdup.fastq
fi
if [[ ! -f "${READ2}_rmdup.fastq" ]]; then
echo "filtering $READ2"
awk 'NR==FNR{A[$1]=$1;next} $1 in A{print;getline;print;getline;print;getline;print}' FS="\t" ${READ2}_rmdup_IDs.txt FS="\t" OFS="\t" ${READ2} > ${READ2}_rmdup.fastq
fi
echo "DONE"
#!/usr/bin/python
import os, sys
import string
import time
import re
import argparse
import hashlib
import gzip
import multiprocessing
import Queue
from itertools import islice
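# NOTE: Python 2 code throughout (print statements, string.maketrans, the
# Queue module); run with a Python 2 interpreter.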
printevery = 100000
bufferSize = 64 * 1024*1024
trans = string.maketrans(r'ACGTN', r'TGCAN')
rep1 = re.compile(r'/1$')
rep2 = re.compile(r'/2$')
rehomo1 = re.compile(r'(.)\1+')  # greedy, so a whole homopolymer run matches
rehomo2 = r'\1'                  # ...and is collapsed to one base: AAAT -> AT
seqs = set()
dups = []
uniqs = []
regcount = 0
start = time.time()
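# Dedup strategy, as implemented in analizeData(): each read's canonical key is
# the lexicographically smaller of its sequence and its reverse complement; a
# read (or pair) whose key has been seen goes to the "bad" file, else to "good".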
def openfile(infile, method, buff=bufferSize, compresslevel=1):
    # transparently open plain or gzipped files
    fhd = None
    if infile.endswith('.gz'):
        fhd = gzip.open(infile, method+'b', compresslevel)
    else:
        fhd = open(infile, method, buff)
    return fhd
def main():
    """
    Filters sequences for clonality.
    """
    cdir   = os.curdir
    parser = argparse.ArgumentParser(description='filter fastq files')
    parser.add_argument('-n', '--nsize'   , dest='nsize'   , default=-1  , metavar='N', type=int , nargs='?', help='number of reads to use [INT: default: -1 (all)]')
    parser.add_argument('-d', '--dir'     , dest='dir'     , default=cdir, metavar='D', type=str , nargs='?', help='output dir [STR: default: .]')
    parser.add_argument('-m', '--merge'   , dest='merge'   , default=None, metavar='M', type=str , nargs='?', help='merge output using prefix')
    parser.add_argument('-s', '--single'  , dest='single'  , action='store_true', help='treat as single end')
    parser.add_argument('-c', '--compress', dest='compress', action='store_true', help='compress in memory using md5')
    parser.add_argument('-i', '--only-ids', dest='onlyid'  , action='store_true', help='only export id, no sequence')
    parser.add_argument('-o', '--homo'    , '--homopolymer', dest='homo', action='store_true', help='compress homopolymers')
    parser.add_argument('-f', '--dry-run' , dest='dryrun'  , action='store_true', help='dry run')
    parser.add_argument('-z', '--gzip'    , dest='gzip'    , action='store_true', help='compress output to gzip')
    parser.add_argument('infiles', metavar='i', nargs='+', help='input files')

    args     = parser.parse_args()
    nsize    = args.nsize
    dstdir   = args.dir
    single   = args.single
    merge    = args.merge
    compress = args.compress
    onlyid   = args.onlyid
    homopol  = args.homo
    zipfile  = args.gzip
    dry_run  = args.dryrun
    infiles  = args.infiles
    dstdir   = os.path.abspath( dstdir )

    if nsize != -1:
        print "SHORTENING TO %dbp" % nsize
    if compress:
        print "COMPRESSING IN MEMORY"
    if onlyid:
        print "EXPORTING ONLY ID"
    if homopol:
        print "COMPRESSING HOMOPOLYMERS"
    if zipfile:
        print "COMPRESSING OUTPUT FILE"

    numfiles = len(infiles)
    if (numfiles != 1) and (numfiles % 2 != 0):
        if not single:
            print "needs to be either 1 file, a multiple of 2 files, or single-end (-s) reads"
            print "got", numfiles
            sys.exit( 1 )

    paired = False
    if ( numfiles % 2 == 0 ) and not single:
        paired = True

    for filename in infiles:
        if not os.path.exists( filename ):
            print "input file %s does not exist" % filename
            sys.exit( 1 )

    if not os.path.exists( dstdir ):
        print "destination dir %s does not exist. creating" % dstdir
        os.makedirs( dstdir )

    #TODO: MULTITHREADING
    #      CREATE A THREAD FOR THE WRITER IF MERGED
    #      CREATE A THREAD FOR EACH ANALYZER
    #      CREATE A THREAD FOR THE DATABASE WITH A QUEUE TO EACH ANALYZER
    #writterAll = writter()

    global start
    start = time.time()

    if paired:
        pairnum  = 0
        numpairs = numfiles / 2
        for filepos in range(0, numfiles, 2):
            pairnum += 1
            analize(infiles[filepos], infiles[filepos+1], paired=paired, pairnum=pairnum, numpairs=numpairs, dstdir=dstdir, nsize=nsize, compress=compress, onlyid=onlyid, homopol=homopol, zipfile=zipfile, merge=merge, dry_run=dry_run)
    else:
        filenum = 0
        for filepos in range(numfiles):
            filenum += 1
            analize(infiles[filepos], "", paired=paired, filenum=filenum, numfiles=numfiles, dstdir=dstdir, nsize=nsize, compress=compress, onlyid=onlyid, homopol=homopol, zipfile=zipfile, merge=merge, dry_run=dry_run)
class writter(object):
    def __init__(self):
        self.files = {}
    def get(self):
        pass
    def add(self, fn, mode, buff=bufferSize, compresslevel=1):
        fhd = openfile(fn, mode, buff, compresslevel)
        self.files[fn] = {}
        self.files[fn]['filehandle'] = fhd
        self.files[fn]['queues'    ] = {
            'write'     : Queue.Queue(),
            'writelines': Queue.Queue(),
            'readline'  : Queue.Queue(),
            'close'     : Queue.Queue()
        }
        return self.files[fn]['queues']
    def readline(self, fn, numlines=None):
        if numlines is not None:
            return list(islice( self.files[fn]['filehandle'], numlines ))
        else:
            return self.files[fn]['filehandle'].readline()
    def writelines(self, fn, lines):
        self.files[fn]['filehandle'].writelines( lines )
    def write(self, fn, line):
        self.files[fn]['filehandle'].write( line )
    def close(self, fn):
        self.files[fn]['filehandle'].close()
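# Note: the writter class above is unused scaffolding for the multithreading
# TODO in main(); only the commented-out "writterAll = writter()" refers to it.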
def analize(fn1, fn2, paired=False, pairnum=None, numpairs=None, filenum=None, numfiles=None, dstdir='.', nsize=-1, compress=False, onlyid=False, homopol=False, zipfile=False, merge=False, dry_run=False):
    fn1o  = ""
    fn1u  = ""
    fhd1i = None
    fhd1o = None
    fhd1u = None
    fn2o  = ""
    fn2u  = ""
    fhd2i = None
    fhd2o = None
    fhd2u = None
    good_files = []
    bad_files  = []

    fhd1i = openfile(fn1, 'r')
    if paired:
        fhd2i = openfile(fn2, 'r')

    countstr = ""
    if paired and (None not in [pairnum, numpairs]):
        countstr = "%3d/%3d pairs " % (pairnum, numpairs)
    elif not paired and (None not in [filenum, numfiles]):
        countstr = "%3d/%3d files " % (filenum, numfiles)

    mode = 'w'
    if not onlyid:
        fn1bn  = os.path.basename( fn1 )
        fn1dst = os.path.join( dstdir, fn1bn )
        if merge is not None:
            fn1dst = os.path.join( dstdir, merge + '.fwd' )
        fn1o = fn1dst + '.rmdup.good.fastq'
        fn1u = fn1dst + '.rmdup.bad.fastq'
        if zipfile:
            fn1o += '.gz'
            fn1u += '.gz'
        if merge is not None and os.path.exists(fn1o):
            mode = 'a' # append so merged runs accumulate in one output file
        if not dry_run:
            fhd1o = openfile(fn1o, mode)
            fhd1u = openfile(fn1u, mode)
        good_files = [fn1o]
        bad_files  = [fn1u]
        if paired:
            fn2bn  = os.path.basename( fn2 )
            fn2dst = os.path.join( dstdir, fn2bn )
            if merge is not None:
                fn2dst = os.path.join( dstdir, merge + '.rev' )
            fn2o = fn2dst + '.rmdup.good.fastq'
            fn2u = fn2dst + '.rmdup.bad.fastq'
            if zipfile:
                fn2o += '.gz'
                fn2u += '.gz'
            if not dry_run:
                fhd2o = openfile(fn2o, mode)
                fhd2u = openfile(fn2u, mode)
            good_files.append( fn2o )
            bad_files.append(  fn2u )

    print "Reading %s:\n " % countstr + "\n ".join( [f for f in [fn1, fn2] if f] )
    if not onlyid:
        print "Saving good %s:\n " % countstr + "\n ".join( good_files )
        print "Saving bad  %s:\n " % countstr + "\n ".join( bad_files  )

    bn = fn1
    if fn2 != "":
        bn = os.path.commonprefix([os.path.basename(x) for x in [fn1, fn2]])
        bn = bn.strip('.').strip('_').strip('-')
    if merge is not None:
        bn = merge
    bn       = os.path.join( dstdir, bn )
    report   = bn + '.rmdup.report'
    excluded = bn + '.rmdup.excluded'
    uniques  = bn + '.rmdup.uniques'
    if zipfile:
        excluded += '.gz'
        uniques  += '.gz'

    print "Report   %s:\n " % countstr + report
    print "Excluded %s:\n " % countstr + excluded
    print "Uniques  %s:\n " % countstr + uniques

    if dry_run:
        return
    analizeData(fn1, fn1o, fn1u, fhd1i, fhd1o, fhd1u, fn2, fn2o, fn2u, fhd2i, fhd2o, fhd2u, report, excluded, uniques, countstr, paired=paired, filenum=filenum, numfiles=numfiles, dstdir=dstdir, nsize=nsize, compress=compress, onlyid=onlyid, homopol=homopol, zipfile=zipfile, merge=merge, dry_run=dry_run)
def analizeData( fn1, fn1o, fn1u, fhd1i, fhd1o, fhd1u, fn2, fn2o, fn2u, fhd2i, fhd2o, fhd2u, report, excluded, uniques, countstr, paired=False, pairnum=None, numpairs=None, filenum=None, numfiles=None, dstdir='.', nsize=-1, compress=False, onlyid=False, homopol=False, zipfile=False, merge=False, dry_run=False ):
    global seqs
    global dups
    global uniqs
    global regcount
    if not merge:
        # when not merging, every file (pair) starts with a clean slate
        print "cleaning sequences %s" % countstr
        seqs.clear()
        dups     = []
        uniqs    = []
        regcount = 0

    lastcount = 0
    lastReg   = 0
    lasttime  = time.time()
    while True:
        reg1 = list(islice( fhd1i, 4 ))
        reg2 = None
        if paired:
            reg2 = list(islice( fhd2i, 4 ))
        if len( reg1 ) == 0:
            break
        if paired and len( reg2 ) == 0:
            break
        regcount += 1
        # strip the /1 or /2 at the end of the header
        header_name1 = reg1[0].strip()[1:]
        header_name1 = rep1.sub('', header_name1)

        # check forward strand: strip newline
        seq1s = reg1[1].strip()
        # cut if requested
        if nsize != -1:
            seq1s = seq1s[:nsize]
        # collapse homopolymers
        if homopol:
            seq1s = rehomo1.sub(rehomo2, seq1s)
        # create reverse complement
        seq1src = seq1s.translate( trans )[::-1]
        # find the lexically smaller sequence
        seq1min = min([seq1s, seq1src])
        # compress if requested
        if compress:
            seq1min = hashlib.md5(seq1min).digest()
            #seq1min = zlib.compress(seq1min)
        # check whether the sequence was already present
        if seq1min in seqs:
            dups.append( header_name1 )
            if not onlyid:
                fhd1u.writelines( reg1 )
                if paired:
                    fhd2u.writelines( reg2 )
            continue
        if paired:
            # check reverse strand
            header_name2 = reg2[0].strip()[1:]
            header_name2 = rep2.sub('', header_name2)
            # check whether the two files are still in sync
            if header_name1 != header_name2:
                print "out of sync"
                print header_name1
                print header_name2
                sys.exit( 1 )
            seq2s = reg2[1].strip()
            if nsize != -1:
                seq2s = seq2s[:nsize]
            if homopol:
                seq2s = rehomo1.sub(rehomo2, seq2s)
            seq2src = seq2s.translate( trans )[::-1]
            seq2min = min([seq2s, seq2src])
            if compress:
                seq2min = hashlib.md5(seq2min).digest()
                #seq2min = zlib.compress(seq2min)
            if seq2min in seqs:
                dups.append( header_name1 )
                if not onlyid:
                    fhd1u.writelines( reg1 )
                    fhd2u.writelines( reg2 )
                continue
            # check for palindromes or repeats
            lenseqs = len(set([seq1s, seq1src, seq2s, seq2src]))
            if lenseqs != 4:
                seqs.update( set([seq1min, seq2min]) )
                dups.append( header_name1 )
                if not onlyid:
                    fhd1u.writelines( reg1 )
                    fhd2u.writelines( reg2 )
                continue
            # if everything is fine, add the sequences and print to the good files
            seqs.update( set([seq1min, seq2min]) )
            uniqs.append( header_name1 )
            if not onlyid:
                fhd1o.writelines( reg1 )
                fhd2o.writelines( reg2 )
        else:
            # check for palindromes or repeats
            lenseqs = len(set([seq1s, seq1src]))
            if lenseqs != 2:
                seqs.add( seq1min )
                dups.append( header_name1 )
                if not onlyid:
                    fhd1u.writelines( reg1 )
                continue
            # if everything is fine, add the sequence and print to the good file
            seqs.add( seq1min )
            uniqs.append( header_name1 )
            if not onlyid:
                fhd1o.writelines( reg1 )
        # print status every `printevery` records
        if int(regcount / printevery) != lastReg:
            regdiff   = regcount - lastcount
            lastcount = regcount
            lastReg   = int(regcount / printevery)
            now       = time.time()
            elalast   = now - lasttime
            ela       = now - start
            lasttime  = now
            speedlast = int(regdiff  / elalast)
            speed     = int(regcount / ela    )
            lendups   = len(dups)
            lenseqs   = len(seqs)
            if paired:
                lenseqs = len(seqs)/2
            sys.stdout.write( '%s%14s registers; %14s reg/s avg; %14s reg/s last; %14s valid; %14s invalid; %6.2f%% invalid\n' % (countstr, "{:,}".format(regcount), "{:,}".format(speed), "{:,}".format(speedlast), "{:,}".format(lenseqs), "{:,}".format(lendups), (lendups / float(regcount) * 100) ) )
    fhd1i.close()
    if paired:
        fhd2i.close()
    if not onlyid:
        fhd1o.close()
        fhd1u.close()
        if paired:
            fhd2o.close()
            fhd2u.close()

    repmode = 'w'
    if merge and os.path.exists( excluded ):
        repmode = 'a'

    print "Saving report %s:\n " % countstr, report
    with open( report, repmode ) as fhd:
        if countstr != "":
            fhd.write( "%s\n" % countstr )
        fhd.write( "INPUT FILES : %s\n" % " ".join([f for f in [fn1, fn2] if f]) )
        fhd.write( "TOTAL       : %d\n" % regcount )
        fhd.write( "UNIQUE      : %d\n" % len(uniqs) )
        fhd.write( "DUPLICATES  : %d\n" % len(dups) )
        fhd.write( "DUPLICATES %%: %.2f\n" % (len(dups) / float(regcount) * 100) )
        fhd.write( "TIME        : %d\n" % (int(time.time() - start + 1)) )
        fhd.write( "SPEED       : %d\n" % (regcount / int(time.time() - start + 1)) )
        fhd.write( "\n=\n\n" )

    print "Saving excluded list %s:\n " % countstr, excluded
    with openfile( excluded, repmode ) as fhd:
        # write in chunks of 10,000 ids to bound the size of each writelines call
        for duppos in range( 0, len( dups ), 10000 ):
            dup = dups[ duppos : duppos + 10000 ]
            fhd.writelines( [ x + "\n" for x in dup ] )

    print "Saving unique list %s:\n " % countstr, uniques
    with openfile( uniques, repmode ) as fhd:
        for unipos in range( 0, len( uniqs ), 10000 ):
            uni = uniqs[ unipos : unipos + 10000 ]
            fhd.writelines( [ x + "\n" for x in uni ] )
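# Example invocation (hypothetical script and file names; flags go before the
# positional FASTQ arguments, which are consumed as consecutive pairs unless
# -s is given):
#   python removeDuplicateReads.py -d out -z reads_1.fastq reads_2.fastq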
if __name__ == '__main__': main()