Commit 1e09f3dd authored by Aflitos, Saulo Alves's avatar Aflitos, Saulo Alves
Browse files

sff support

parent 5b9af50e
......@@ -7,9 +7,12 @@ import argparse
import hashlib
import gzip
import multiprocessing
import subprocess
import Queue
from itertools import islice
#./filter_dups.py --homo -n 50 -i 8000.dedup.sff
printevery = 100000
bufferSize = 64 * 1024*1024
......@@ -38,13 +41,40 @@ def openfile(infile, method, buff=bufferSize, compresslevel=1):
return fhd
def sff2fq(filename, outbasename):
cmd = "sff_extract -c -Q --out_basename=%s %s" % (outbasename, filename)
print "extracting sff:",cmd
res = subprocess.call(cmd, shell=True)
if res:
print "error running sff extract"
sys.exit(1)
return outbasename + '.fastq'
def sffFilter(fq, sff):
#.rmdup.excludedIds
fqids = fq + '.rmdup.excludedIds'
sffout = fq + '.sff'
cmd = "sfffile -o %s -e %s %s" % (sffout, fqids, sff)
print 'filtering sff:', cmd
res = subprocess.call(cmd, shell=True)
if res:
print "error running sff extract"
sys.exit(1)
pass
def main():
"""
Filters sequences for clonality.
"""
cdir = os.curdir
parser = argparse.ArgumentParser(description='filter fastq files')
parser.add_argument('-n', '--nsize' , dest='nsize' , default=-1 , metavar='N', type=int , nargs='?', help='number of reads to use [INT: default: -1 (all)]')
parser.add_argument('-n', '--nsize' , dest='nsize' , default=-1 , metavar='N', type=int , nargs='?', help='number of bp to use [INT: default: -1 (all)]')
parser.add_argument('-d', '--dir' , dest='dir' , default=cdir, metavar='D', type=str , nargs='?', help='output dir [STR: default: .]')
parser.add_argument('-m', '--merge' , dest='merge' , default=None, metavar='M', type=str , nargs='?', help='merge output using prefix')
parser.add_argument('-s', '--single' , dest='single' , action='store_true', help='treat as single end')
......@@ -69,16 +99,11 @@ def main():
dstdir = os.path.abspath( dstdir )
if nsize != -1:
print "SHORTENING TO %dbp" % nsize
if compress:
print "COMPRESSING IN MEMORY"
if onlyid:
print "EXPORTING ONLY ID"
if homopol:
print "COMPRESS HOMOPOLYMERS"
if zipfile:
print "COMPRESSING OUTPUT FILE"
if nsize != -1: print "SHORTENING TO %dbp" % nsize
if compress : print "COMPRESSING IN MEMORY"
if onlyid : print "EXPORTING ONLY ID"
if homopol : print "COMPRESS HOMOPOLYMERS"
if zipfile : print "COMPRESSING OUTPUT FILE"
numfiles = len(infiles)
......@@ -92,15 +117,38 @@ def main():
if ( numfiles % 2 == 0 ) and not single:
paired = True
for filename in infiles:
if not os.path.exists( filename ):
print "input file %s does not exists" % filename
if not os.path.exists( dstdir ):
print "Destination dir %s does not exists. creating" % dstdir
os.makedirs( dstdir )
sffBack = []
for filepos in range(len(infiles)):
filename = infiles[ filepos ]
if not os.path.exists( filename ):
print "input file %s does not exists" % filename
sys.exit( 1 )
if filename.endswith('.sff'):
outf = filename.replace('.sff', '')
outf = os.path.basename( outf )
outf = os.path.join( os.path.abspath(dstdir), outf )
fileback = sff2fq( filename, outf )
infiles[ filepos ] = fileback
sffBack.append( [fileback, filename] )
filename = infiles[ filepos ]
if not os.path.exists( filename ):
print "input file %s does not exists" % filename
sys.exit( 1 )
#TODO: MULTITHREADING
# CREATE A THREAD TO THE WRITTER IF MERGED
# CREATE A THREAD TO EACH ANALIZER
......@@ -124,6 +172,11 @@ def main():
analize(infiles[filepos], "" , paired=paired,filenum=filenum,numfiles=numfiles,dstdir=dstdir,nsize=nsize,compress=compress,onlyid=onlyid,homopol=homopol,zipfile=zipfile,merge=merge,dry_run=dry_run)
for fq, sff in sffBack:
sffFilter(fq, sff)
class writter(object):
def __init__(self):
self.files = {}
......@@ -194,11 +247,13 @@ def analize(fn1, fn2, paired=False,pairnum=None,numpairs=None,filenum=None,numfi
fn1bn = os.path.basename( fn1 )
fn1dst = os.path.join( dstdir, fn1bn )
fn1dst += '.rmdup'
if merge is not None:
fn1dst = os.path.join( dstdir, merge + '.fwd' )
fn1dst += '.fwd'
fn1o = fn1dst + '.rmdup.good.fastq'
fn1u = fn1dst + '.rmdup.bad.fastq'
fn1o = fn1dst + '.good.fastq'
fn1u = fn1dst + '.bad.fastq'
if zipfile:
fn1o += '.gz'
......@@ -219,11 +274,12 @@ def analize(fn1, fn2, paired=False,pairnum=None,numpairs=None,filenum=None,numfi
fn2bn = os.path.basename( fn2 )
fn2dst = os.path.join( dstdir, fn2bn )
fn2dst += '.rmdup'
if merge is not None:
fn2dst = os.path.join( dstdir, merge + '.rev')
fn2dst += '.rev'
fn2o = fn2dst + '.rmdup.good.fastq'
fn2u = fn2dst + '.rmdup.bad.fastq'
fn2o = fn2dst + '.good.fastq'
fn2u = fn2dst + '.bad.fastq'
if zipfile:
fn2o += '.gz'
......@@ -253,8 +309,9 @@ def analize(fn1, fn2, paired=False,pairnum=None,numpairs=None,filenum=None,numfi
report = bn + '.rmdup.report'
excluded = bn + '.rmdup.excluded'
uniques = bn + '.rmdup.uniques'
excluded = bn + '.rmdup.excludedIds'
uniques = bn + '.rmdup.uniqueIds'
if zipfile:
excluded += '.gz'
uniques += '.gz'
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment