Skip to content
Snippets Groups Projects
Commit 6739f3c5 authored by Aflitos, Saulo Alves's avatar Aflitos, Saulo Alves
Browse files

new filter dups

parent 7bedde08
Branches
No related tags found
No related merge requests found
......@@ -7,9 +7,12 @@ import argparse
import hashlib
import gzip
import multiprocessing
import subprocess
import Queue
from itertools import islice
#./filter_dups.py --homo -n 50 -i 8000.dedup.sff
printevery = 100000
bufferSize = 64 * 1024*1024
......@@ -38,6 +41,33 @@ def openfile(infile, method, buff=bufferSize, compresslevel=1):
return fhd
def sff2fq(filename, outbasename):
    """Extract a .sff file to FASTQ using the external `sff_extract` tool.

    Runs ``sff_extract -c -Q --out_basename=<outbasename> <filename>`` and
    terminates the whole program on failure.

    :param filename:    path to the input .sff file
    :param outbasename: basename (path prefix) for the extracted output
    :returns:           path of the generated FASTQ file, ``<outbasename>.fastq``
    """
    # Pass the command as an argument list and skip the shell so that paths
    # containing spaces or shell metacharacters cannot break (or inject into)
    # the command line, as the previous string + shell=True version could.
    cmd = ["sff_extract", "-c", "-Q", "--out_basename=%s" % outbasename, filename]
    print("extracting sff: %s" % " ".join(cmd))
    res = subprocess.call(cmd)
    if res:  # any non-zero exit status means the extraction failed
        print("error running sff extract")
        sys.exit(1)
    return outbasename + '.fastq'
def sffFilter(fq, sff):
    """Filter a .sff file, removing the reads flagged as duplicates.

    Uses the external ``sfffile`` tool together with the
    ``<fq>.rmdup.excludedIds`` list produced by the duplicate analysis to
    write ``<fq>.sff`` without the excluded reads.  Terminates the whole
    program on failure.

    :param fq:  analysed FASTQ path/basename (used to locate the id list
                and to name the filtered output)
    :param sff: path of the original .sff file to filter
    """
    fqids = fq + '.rmdup.excludedIds'  # ids of the reads to exclude
    sffout = fq + '.sff'               # filtered output file
    # Argument list + no shell: safe with paths containing spaces or
    # shell metacharacters (the old string + shell=True form was not).
    cmd = ["sfffile", "-o", sffout, "-e", fqids, sff]
    print("filtering sff: %s" % " ".join(cmd))
    res = subprocess.call(cmd)
    if res:
        # Fixed copy-paste error: the old message said "sff extract",
        # but this step runs sfffile.
        print("error running sfffile")
        sys.exit(1)
def main():
"""
Filters sequences for clonality.
......@@ -69,16 +99,11 @@ def main():
dstdir = os.path.abspath( dstdir )
if nsize != -1:
print "SHORTENING TO %dbp" % nsize
if compress:
print "COMPRESSING IN MEMORY"
if onlyid:
print "EXPORTING ONLY ID"
if homopol:
print "COMPRESS HOMOPOLYMERS"
if zipfile:
print "COMPRESSING OUTPUT FILE"
if nsize != -1: print "SHORTENING TO %dbp" % nsize
if compress : print "COMPRESSING IN MEMORY"
if onlyid : print "EXPORTING ONLY ID"
if homopol : print "COMPRESS HOMOPOLYMERS"
if zipfile : print "COMPRESSING OUTPUT FILE"
numfiles = len(infiles)
......@@ -92,15 +117,38 @@ def main():
if ( numfiles % 2 == 0 ) and not single:
paired = True
for filename in infiles:
if not os.path.exists( filename ):
print "input file %s does not exists" % filename
if not os.path.exists( dstdir ):
print "Destination dir %s does not exists. creating" % dstdir
os.makedirs( dstdir )
sffBack = []
for filepos in range(len(infiles)):
filename = infiles[ filepos ]
if not os.path.exists( filename ):
print "input file %s does not exists" % filename
sys.exit( 1 )
if filename.endswith('.sff'):
outf = filename.replace('.sff', '')
outf = os.path.basename( outf )
outf = os.path.join( os.path.abspath(dstdir), outf )
fileback = sff2fq( filename, outf )
infiles[ filepos ] = fileback
sffBack.append( [fileback, filename] )
filename = infiles[ filepos ]
if not os.path.exists( filename ):
print "input file %s does not exists" % filename
sys.exit( 1 )
#TODO: MULTITHREADING
# CREATE A THREAD TO THE WRITTER IF MERGED
# CREATE A THREAD TO EACH ANALIZER
......@@ -124,6 +172,11 @@ def main():
analize(infiles[filepos], "" , paired=paired,filenum=filenum,numfiles=numfiles,dstdir=dstdir,nsize=nsize,compress=compress,onlyid=onlyid,homopol=homopol,zipfile=zipfile,merge=merge,dry_run=dry_run)
for fq, sff in sffBack:
sffFilter(fq, sff)
class writter(object):
def __init__(self):
self.files = {}
......@@ -194,10 +247,9 @@ def analize(fn1, fn2, paired=False,pairnum=None,numpairs=None,filenum=None,numfi
fn1bn = os.path.basename( fn1 )
fn1dst = os.path.join( dstdir, fn1bn )
fn1dst += '.rmdup'
if merge is not None:
fn1dst = os.path.join( dstdir, merge + '.rmdup.fwd' )
else:
fn1dst = os.path.join( dstdir, merge + '.rmdup' )
fn1dst += '.fwd'
fn1o = fn1dst + '.good.fastq'
......@@ -222,10 +274,9 @@ def analize(fn1, fn2, paired=False,pairnum=None,numpairs=None,filenum=None,numfi
fn2bn = os.path.basename( fn2 )
fn2dst = os.path.join( dstdir, fn2bn )
fn2dst += '.rmdup'
if merge is not None:
fn2dst = os.path.join( dstdir, merge + '.rmdup.rev')
else:
fn2dst = os.path.join( dstdir, merge + '.rmdup')
fn2dst += '.rev'
fn2o = fn2dst + '.good.fastq'
fn2u = fn2dst + '.bad.fastq'
......@@ -260,7 +311,7 @@ def analize(fn1, fn2, paired=False,pairnum=None,numpairs=None,filenum=None,numfi
excluded = bn + '.rmdup.excludedIds'
uniques = bn + '.rmdup.uniqueIds'
if zipfile:
excluded += '.gz'
uniques += '.gz'
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment