Skip to content
Snippets Groups Projects
Commit 6739f3c5 authored by Aflitos, Saulo Alves's avatar Aflitos, Saulo Alves
Browse files

new filter dups

parent 7bedde08
Branches
No related tags found
No related merge requests found
......@@ -7,9 +7,12 @@ import argparse
import hashlib
import gzip
import multiprocessing
import subprocess
import Queue
from itertools import islice
#./filter_dups.py --homo -n 50 -i 8000.dedup.sff
printevery = 100000
bufferSize = 64 * 1024*1024
......@@ -38,6 +41,33 @@ def openfile(infile, method, buff=bufferSize, compresslevel=1):
return fhd
def sff2fq(filename, outbasename):
    """Extract a .sff file to FASTQ using the external `sff_extract` tool.

    Runs ``sff_extract -c -Q --out_basename=<outbasename> <filename>`` and
    terminates the whole program on failure.

    :param filename:    path to the input .sff file
    :param outbasename: basename (path prefix) for the extracted output
    :returns:           path of the generated FASTQ file, ``<outbasename>.fastq``
    """
    # Pass the command as an argument list and skip the shell so that paths
    # containing spaces or shell metacharacters cannot break (or inject into)
    # the command line, as the previous string + shell=True version could.
    cmd = ["sff_extract", "-c", "-Q", "--out_basename=%s" % outbasename, filename]
    print("extracting sff: %s" % " ".join(cmd))
    res = subprocess.call(cmd)
    if res:  # any non-zero exit status means the extraction failed
        print("error running sff extract")
        sys.exit(1)
    return outbasename + '.fastq'
def sffFilter(fq, sff):
    """Filter a .sff file, removing the reads flagged as duplicates.

    Uses the external ``sfffile`` tool together with the
    ``<fq>.rmdup.excludedIds`` list produced by the duplicate analysis to
    write ``<fq>.sff`` without the excluded reads.  Terminates the whole
    program on failure.

    :param fq:  analysed FASTQ path/basename (used to locate the id list
                and to name the filtered output)
    :param sff: path of the original .sff file to filter
    """
    fqids = fq + '.rmdup.excludedIds'  # ids of the reads to exclude
    sffout = fq + '.sff'               # filtered output file
    # Argument list + no shell: safe with paths containing spaces or
    # shell metacharacters (the old string + shell=True form was not).
    cmd = ["sfffile", "-o", sffout, "-e", fqids, sff]
    print("filtering sff: %s" % " ".join(cmd))
    res = subprocess.call(cmd)
    if res:
        # Fixed copy-paste error: the old message said "sff extract",
        # but this step runs sfffile.
        print("error running sfffile")
        sys.exit(1)
def main():
"""
Filters sequences for clonality.
......@@ -69,16 +99,11 @@ def main():
dstdir = os.path.abspath( dstdir )
if nsize != -1:
print "SHORTENING TO %dbp" % nsize
if compress:
print "COMPRESSING IN MEMORY"
if onlyid:
print "EXPORTING ONLY ID"
if homopol:
print "COMPRESS HOMOPOLYMERS"
if zipfile:
print "COMPRESSING OUTPUT FILE"
if nsize != -1: print "SHORTENING TO %dbp" % nsize
if compress : print "COMPRESSING IN MEMORY"
if onlyid : print "EXPORTING ONLY ID"
if homopol : print "COMPRESS HOMOPOLYMERS"
if zipfile : print "COMPRESSING OUTPUT FILE"
numfiles = len(infiles)
......@@ -92,15 +117,38 @@ def main():
if ( numfiles % 2 == 0 ) and not single:
paired = True
for filename in infiles:
if not os.path.exists( filename ):
print "input file %s does not exists" % filename
if not os.path.exists( dstdir ):
print "Destination dir %s does not exists. creating" % dstdir
os.makedirs( dstdir )
sffBack = []
for filepos in range(len(infiles)):
filename = infiles[ filepos ]
if not os.path.exists( filename ):
print "input file %s does not exists" % filename
sys.exit( 1 )
if filename.endswith('.sff'):
outf = filename.replace('.sff', '')
outf = os.path.basename( outf )
outf = os.path.join( os.path.abspath(dstdir), outf )
fileback = sff2fq( filename, outf )
infiles[ filepos ] = fileback
sffBack.append( [fileback, filename] )
filename = infiles[ filepos ]
if not os.path.exists( filename ):
print "input file %s does not exists" % filename
sys.exit( 1 )
#TODO: MULTITHREADING
# CREATE A THREAD TO THE WRITTER IF MERGED
# CREATE A THREAD TO EACH ANALIZER
......@@ -124,6 +172,11 @@ def main():
analize(infiles[filepos], "" , paired=paired,filenum=filenum,numfiles=numfiles,dstdir=dstdir,nsize=nsize,compress=compress,onlyid=onlyid,homopol=homopol,zipfile=zipfile,merge=merge,dry_run=dry_run)
for fq, sff in sffBack:
sffFilter(fq, sff)
class writter(object):
def __init__(self):
self.files = {}
......@@ -194,10 +247,9 @@ def analize(fn1, fn2, paired=False,pairnum=None,numpairs=None,filenum=None,numfi
fn1bn = os.path.basename( fn1 )
fn1dst = os.path.join( dstdir, fn1bn )
fn1dst += '.rmdup'
if merge is not None:
fn1dst = os.path.join( dstdir, merge + '.rmdup.fwd' )
else:
fn1dst = os.path.join( dstdir, merge + '.rmdup' )
fn1dst += '.fwd'
fn1o = fn1dst + '.good.fastq'
......@@ -222,10 +274,9 @@ def analize(fn1, fn2, paired=False,pairnum=None,numpairs=None,filenum=None,numfi
fn2bn = os.path.basename( fn2 )
fn2dst = os.path.join( dstdir, fn2bn )
fn2dst += '.rmdup'
if merge is not None:
fn2dst = os.path.join( dstdir, merge + '.rmdup.rev')
else:
fn2dst = os.path.join( dstdir, merge + '.rmdup')
fn2dst += '.rev'
fn2o = fn2dst + '.good.fastq'
fn2u = fn2dst + '.bad.fastq'
......@@ -260,7 +311,7 @@ def analize(fn1, fn2, paired=False,pairnum=None,numpairs=None,filenum=None,numfi
excluded = bn + '.rmdup.excludedIds'
uniques = bn + '.rmdup.uniqueIds'
if zipfile:
excluded += '.gz'
uniques += '.gz'
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment