From 6df8e61f5066cf3e6fef8c4d83fc538bae6525f9 Mon Sep 17 00:00:00 2001
From: Saulo Aflitos <sauloalves.aflitos@wur.nl>
Date: Fri, 29 Apr 2016 00:17:52 +0200
Subject: [PATCH] mods

---
 newest/vcfconcat.py                    | 318 ++++++++++++++-----------
 newest/vcffiltergff.py                 |  57 +++--
 newest/vcfmerger.py                    | 299 ++++++++++-------------
 scripts/median_polish/median_polish.py | 168 ++++++++-----
 4 files changed, 456 insertions(+), 386 deletions(-)

diff --git a/newest/vcfconcat.py b/newest/vcfconcat.py
index 011387a..82710b6 100755
--- a/newest/vcfconcat.py
+++ b/newest/vcfconcat.py
@@ -3,44 +3,54 @@
 Concatenates all SNPs from a VCF file in either FASTA or aln (clustal) format
 """
 import os, sys
-from pprint import pprint as pp
 import copy
 import argparse
 import multiprocessing
+from pprint      import pprint as pp
 from collections import defaultdict, Counter
 
 sys.path.insert(0, '.')
 import vcfmerger
 import editdist
-
-
+from treemanager import fixsppname
 
 #GZ=SL2.40ch06g50000_000100001_000150000.vcf.gz.raw.vcf.gz; FA=$GZ.SL2.40ch06.fasta; ../vcfconcat.py -f -RIL -Rg -Rd -i $GZ; ../FastTreeMP -fastest -gamma -nt -bionj -boot 100 -log $FA.log -out $FA.tree $FA; ../FastTreeMP -nt -makematrix $FA > $FA.matrix; ./newick_to_png.py $FA.tree
 #FA=SL2.40ch06g50000_000100001_000150000.vcf.gz.SL2.40ch06.fasta; ../FastTreeMP -fastest -gamma -nt -bionj -boot 100 -log $FA.log -out $FA.tree $FA; ../FastTreeMP -nt -makematrix $FA > $FA.matrix; ./newick_to_png.py $FA.tree
 
 
 
-def main():
-    parser = argparse.ArgumentParser(description='Concatenate SNPs as a single sequence for each species.')
-    parser.add_argument('-c', '--chrom', '--chromosome', dest='chromosome', default=None , action='store'      , nargs='?',                       type=str  , help='Chromosome to filter [all]')
-    parser.add_argument('-I', '--ignore', '--skip'     , dest='ignore'    , default=[]   , action='append'     , nargs='*',                       type=str  , help='Chromosomes to skip')
-    parser.add_argument('-s', '--start'                , dest='start'     , default=None , action='store'      , nargs='?',                       type=int  , help='Chromosome start position to filter [0]')
-    parser.add_argument('-e', '--end'                  , dest='end'       , default=None , action='store'      , nargs='?',                       type=int  , help='Chromosome end position to filter [-1]')
-    parser.add_argument('-t', '--threads'              , dest='threads'   , default=0    , action='store'      , nargs='?',                       type=int  , help='Number of threads [num chromosomes]')
-    parser.add_argument('-f', '--fasta'                , dest='fasta'     ,                action='store_true' ,                                              help='Output in fasta format [default: clustal alignment .aln format]')
-    parser.add_argument('-r', '--noref'                , dest='noref'     ,                action='store_false',                                              help='Do not print reference [default: true]')
-    parser.add_argument('-R', '--RIL'                  , dest='RIL'       ,                action='store_true' ,                                              help='RIL mode: false]')
-    parser.add_argument('-Rm','--RIL-mads'             , dest='RILmads'   , default=0.25 , action='store'      , nargs='?',                       type=float, help='RIL percentage of Median Absolute Deviation to use (smaller = more restrictive): 0.25]')
-    parser.add_argument('-Rs','--RIL-minsim'           , dest='RILminsim' , default=0.75 , action='store'      , nargs='?',                       type=float, help='RIL percentage of nucleotides identical to reference to classify as reference: 0.75]')
-    parser.add_argument('-Rg','--RIL-greedy'           , dest='RILgreedy'                , action='store_true'                                              , help='RIL greedy convert nucleotides to either the reference sequence or the alternative sequence: false]')
-    parser.add_argument('-Rd','--RIL-delete'           , dest='RILdelete'                , action='store_true'                                              , help='RIL delete invalid sequences: false]')
+def main(args):
+    methods = {
+        'median' : grouper_median,
+        'linkage': grouper_linkage
+    }
 
-    parser.add_argument('-i', '--input'                , dest='iinput'    , default=None ,                                                        type=str  , help='Input file')
-    parser.add_argument('input'                        ,                    default=None , action='store'      , nargs='?', metavar='input file', type=str  , help='Input file')
 
-    options = parser.parse_args()
+    dflmethod = methods.keys()[0]
 
 
+    parser = argparse.ArgumentParser(description='Concatenate SNPs as a single sequence for each species.')
+    parser.add_argument('-c', '--chrom', '--chromosome', dest='chromosome' , default=None     , action='store'      , nargs='?',                         type=str  , help='Chromosome to filter [all]')
+    parser.add_argument('-I', '--ignore', '--skip'     , dest='ignore'     , default=[]       , action='append'     , nargs='*',                         type=str  , help='Chromosomes to skip')
+    parser.add_argument('-s', '--start'                , dest='start'      , default=None     , action='store'      , nargs='?',                         type=int  , help='Chromosome start position to filter [0]')
+    parser.add_argument('-e', '--end'                  , dest='end'        , default=None     , action='store'      , nargs='?',                         type=int  , help='Chromosome end position to filter [-1]')
+    parser.add_argument('-t', '--threads'              , dest='threads'    , default=0        , action='store'      , nargs='?',                         type=int  , help='Number of threads [num chromosomes]')
+    parser.add_argument('-f', '--fasta'                , dest='fasta'      ,                    action='store_true' ,                                                help='Output in fasta format [default: clustal alignment .aln format]')
+    parser.add_argument('-r', '--noref'                , dest='noref'      ,                    action='store_false',                                                help='Do not print reference [default: true]')
+    parser.add_argument('-R', '--RIL'                  , dest='RIL'        ,                    action='store_true' ,                                                help='RIL mode: false]')
+    parser.add_argument('-Rm','--RIL-mads'             , dest='RILmads'    , default=0.25     , action='store'      , nargs='?',                         type=float, help='RIL percentage of Median Absolute Deviation to use (smaller = more restrictive): 0.25]')
+    parser.add_argument('-Rs','--RIL-minsim'           , dest='RILminsim'  , default=0.75     , action='store'      , nargs='?',                         type=float, help='RIL percentage of nucleotides identical to reference to classify as reference: 0.75]')
+    parser.add_argument('-Rg','--RIL-greedy'           , dest='RILgreedy'  ,                    action='store_true' ,                                                help='RIL greedy convert nucleotides to either the reference sequence or the alternative sequence: false]')
+    parser.add_argument('-Rd','--RIL-delete'           , dest='RILdelete'  ,                    action='store_true' ,                                                help='RIL delete invalid sequences: false]')
+    parser.add_argument('-M' ,'--RIL-method'           , dest='groupMethod', default=dflmethod, action='store'      , nargs='?', choices=methods.keys(), type=str  , help='Clustering method for RIL selection of good and bad sequences [' + ','.join(methods.keys()) + ']')
+
+    parser.add_argument('-i', '--input'                , dest='input'      , default=None     ,                       nargs='?',                         type=str  , help='Input file')
+    #parser.add_argument('input'                        ,                    default=None     , action='store'      , nargs='?', metavar='input file', type=str  , help='Input file')
+
+    options = parser.parse_args(args)
+
+    print options
+
     indexFile    = None
     parallel     = False
 
@@ -67,20 +77,37 @@ def main():
 
 
 
-    config['infile'   ] = getOptionInFile(parser, options)
-    config['infile'   ] = vcfmerger.checkfile(config['infile'])
+    config['infile'     ] = options.input
+    config['ignore'     ] = options.ignore
+    config['inchr'      ] = options.chromosome
+    config['inend'      ] = options.end
+    config['instart'    ] = options.start
+    config['noref'      ] = options.noref
+    config['threads'    ] = options.threads
+    config['RIL'        ] = options.RIL
+    config['RILmads'    ] = options.RILmads
+    config['RILminsim'  ] = options.RILminsim
+    config['RILgreedy'  ] = options.RILgreedy
+    config['RILdelete'  ] = options.RILdelete
+    config['groupMethod'] = options.groupMethod
+
+
+
+    if config['groupMethod'] not in methods:
+        print "%s is not a valid method" % config['groupMethod']
+        sys.exit(1)
+
+    config[ 'grouper' ] = methods[ config['groupMethod'] ]
 
-    config['ignore'   ] = options.ignore
-    config['inchr'    ] = options.chromosome
-    config['inend'    ] = options.end
-    config['instart'  ] = options.start
-    config['noref'    ] = options.noref
-    config['threads'  ] = options.threads
-    config['RIL'      ] = options.RIL
-    config['RILmads'  ] = options.RILmads
-    config['RILminsim'] = options.RILminsim
-    config['RILgreedy'] = options.RILgreedy
-    config['RILdelete'] = options.RILdelete
+
+
+    if options.input is None:
+        print "no input file defined"
+        sys.exit(1)
+
+    if not os.path.exists(options.input):
+    print "input file %s does not exist" % options.input
+        sys.exit(1)
 
 
     if options.fasta:
@@ -123,6 +150,8 @@ def main():
     else:
         readData(config)
 
+    return config
+
 
 def getOptionInFile(pars, opts):
     """
@@ -196,51 +225,50 @@ def readSources(config):
     """
     print "reading sources"
     print "opening",
-    infhd  = vcfmerger.openvcffile(config['infile'], 'r')
-    print "ok"
-
-    print "reading"
-
-    header    = ""
-
-    sources   = {}
-    names     = None
-    for line in infhd:
-        line = line.strip()
-        if len(line) == 0: continue
-        if line[0]   == '#':
-            header += line + "\n"
-            if line.startswith('##sources='):
-                names = line[10:].split(';')
-                for name in names:
-                    sources[name] = []
-                #pp(sources)
-
-            elif line.startswith('##numsources='):
-                numsources = int(line[13:])
-                if numsources != len(sources):
-                    print "error parsing sources"
-                    sys.exit(1)
-                else:
-                    print "num sources:",numsources
+    with vcfmerger.openvcffile(config['infile'], 'r') as infhd:
+        print "ok"
+
+        print "reading"
+
+        header    = ""
+
+        sources   = {}
+        names     = None
+        for line in infhd:
+            line = line.strip()
+            if len(line) == 0: continue
+            if line[0]   == '#':
+                header += line + "\n"
+                if line.startswith('##sources='):
+                    names = line[10:].split(';')
+                    for name in names:
+                        sources[name] = []
+                    #pp(sources)
+
+                elif line.startswith('##numsources='):
+                    numsources = int(line[13:])
+                    if numsources != len(sources):
+                        print "error parsing sources", numsources, len(sources),sources, len(names), names
+                        sys.exit(1)
+                    else:
+                        print "num sources:",numsources
 
-            continue
-        break
+                continue
+            break
 
 
-    sppmaxlength = 0
-    for spp in sorted(sources):
-        sppname = fixsppname( spp )
-        if len(sppname) > sppmaxlength: sppmaxlength = len(sppname)
+        sppmaxlength = 0
+        for spp in sorted(sources):
+            sppname = fixsppname( spp )
+            if len(sppname) > sppmaxlength: sppmaxlength = len(sppname)
 
-    sppmaxlength += 2
-    sppmaxlength  = "%-"+str(sppmaxlength)+"s"
+        sppmaxlength += 2
+        sppmaxlength  = "%-"+str(sppmaxlength)+"s"
 
-    infhd.close()
-    config['sppmaxlength'] = sppmaxlength
-    config['sources'     ] = sources
-    config['names'       ] = names
-    config['header'      ] = header
+        config['sppmaxlength'] = sppmaxlength
+        config['sources'     ] = sources
+        config['names'       ] = names
+        config['header'      ] = header
 
 
 def readData(config):
@@ -461,13 +489,23 @@ def parse(config, refs, chro):
 
     printfilecoords(config, chro)
 
-    if config['oufhd'] is not None:
+    if  config['oufhd'] is not None:
         config['oufhd'].write('\n\n')
         config['oufhd'].close()
         config['oufhd'] = None
 
 
 def RIL( config, refsStrs, sourcesStrs, emptyStr ):
+    high, low, valids, invalids     = config[ 'grouper' ]( config, refsStrs, sourcesStrs, emptyStr )
+
+    #valids, invalids                = filterSimilarity( valids, invalids, config, refsStrs, sourcesStrs, emptyStr )
+
+    refsStrs, sourcesStrs, emptyStr = filter( high, low, valids, invalids, config, refsStrs, sourcesStrs, emptyStr )
+
+    return ( refsStrs, sourcesStrs, emptyStr )
+
+
+def grouper_median( config, refsStrs, sourcesStrs, emptyStr ):
     distMatrix  = {}
 
     mindist     = sys.maxint
@@ -514,18 +552,18 @@ def RIL( config, refsStrs, sourcesStrs, emptyStr ):
 
     good_low      = [ k for k in distMatrix if distMatrix[k] <= threshold_low ]
     good_hig      = [ k for k in distMatrix if distMatrix[k] >= threshold_hig ]
-    bad           = [ k for k in distMatrix if distMatrix[k] > threshold_low and distMatrix[k] < threshold_hig ]
-    good          = good_low + good_hig
+    invalids      = [ k for k in distMatrix if distMatrix[k] > threshold_low and distMatrix[k] < threshold_hig ]
+    valids        = good_low + good_hig
 
     good_low.sort()
     good_hig.sort()
-    bad.sort()
-    good.sort()
+    invalids.sort()
+    valids.sort()
 
     print 'GOOD LOW      ', len(good_low), good_low
     print 'GOOD HIGH     ', len(good_hig), good_hig
-    print 'BAD           ', len(bad     ), bad
-    print 'GOOD          ', len(good    ), good
+    print 'INVALIDS      ', len(invalids), invalids
+    print 'VALIDS        ', len(valids  ), valids
 
     #print 'DISTS         ', len(dists), " ".join(["%3d" % x for x in dists])
     #
@@ -563,11 +601,11 @@ def RIL( config, refsStrs, sourcesStrs, emptyStr ):
     #print 'GOOD HIGH     ', len(good_hig), good_hig
     #print 'BAD           ', len(bad     ), bad
 
-    return refsStrs, sourcesStrs, emptyStr
+    return ( good_hig, good_low, valids, invalids )
 
 
 
-def RILo( config, refsStrs, sourcesStrs, emptyStr ):
+def grouper_linkage( config, refsStrs, sourcesStrs, emptyStr ):
     #entropy_cutoff = config[ 'RILentro' ]
     #
     ##print "len names", len(names)
@@ -596,8 +634,6 @@ def RILo( config, refsStrs, sourcesStrs, emptyStr ):
 
 
     tolerance   = config[ 'RILmads'   ]
-    greedy      = config[ 'RILgreedy' ]
-    minsim      = config[ 'RILminsim' ]
 
     distMatrix  = {}
 
@@ -695,7 +731,7 @@ def RILo( config, refsStrs, sourcesStrs, emptyStr ):
 
     print 'MAD min before', tolvalsminMAD
     print 'MAD max before', tolvalsmaxMAD
-    print 'tolerance', tolerance
+    print 'tolerance     ', tolerance
     if tolerance > 0:
         tolvalsminMAD    = tolvalsminMAD * tolerance
         tolvalsmaxMAD    = tolvalsmaxMAD * tolerance
@@ -757,8 +793,8 @@ def RILo( config, refsStrs, sourcesStrs, emptyStr ):
     simonly   = similar.difference(      dissimilar )
     disonly   = dissimilar.difference(   similar    )
 
-    valids   = set().union(similar, dissimilar)
-    invalids = set(sourceskeys).difference(valids)
+    valids    = set().union(similar, dissimilar)
+    invalids  = set(sourceskeys).difference(valids)
 
     print "Similar  ", len(sorted(list( similar    ))), sorted(list( similar    ))
     print "Disimilar", len(sorted(list( dissimilar ))), sorted(list( dissimilar ))
@@ -780,9 +816,59 @@ def RILo( config, refsStrs, sourcesStrs, emptyStr ):
     #Valids    45 ['609', '610', '612', '614', '615', '618', '623', '625', '626', '634', '644', '649', '651', '653', '654', '659', '660', '665', '666', '667', '668', '670', '674', '676', '679', '684', '685', '688', '691', '692', '693', '694', '696', '697', '702', '705', '706', '707', '710', '711', 'Slycopersicum MoneyMaker', 'Spimpinellifolium LA1578', 'Spimpinellifolium LA1584', 'Spimpinellifolium LYC2740', 'Spimpinellifolium LYC2798']
     #Invalids  20 ['601', '603', '608', '611', '619', '622', '624', '630', '631', '639', '643', '646', '648', '656', '658', '669', '675', '678', '682', '701']
 
+    return ( sorted(list(dissimilar)), sorted(list(similar)), sorted(list( valids )), sorted(list( invalids)) )
 
 
 
+def filterSimilarity( high, low, valids, invalids, config, refsStrs, sourcesStrs, emptyStr ):
+    minsim      = config[ 'RILminsim' ]
+
+    # CALCULATE PERCENTAGE OF SIMILARITY TO REFERENCE
+    refsims = {}
+
+    for spp in valids:
+        sppseq = sourcesStrs[ spp ]
+        refsim = 0
+
+        for nucPos, nucSeq in enumerate( sppseq ):
+            refSeq = refsStrs[ nucPos ]
+
+            if refSeq == nucSeq:
+                refsim += 1
+
+        refsims[ spp ] = ( float( refsim ) / float( len( sppseq ) ) )
+
+
+    #print 'REF', refsStrs
+    #print 'ALT', altsStrs
+
+
+    # REPLACE SEQUENCES WITH EITHER REFERENCE OR ALTERNATIVE
+    modsCount = defaultdict( int )
+    for spp in sourcesStrs:
+        if spp in refsims:
+            if refsims[ spp ] > minsim:
+                modsCount[ 'REF' ] += 1
+
+            else:
+                modsCount[ 'ALT' ] += 1
+                invalids[ spp ] = valids[ spp ]
+                del valids[ spp ]
+
+
+    #for sppPos, sppNucs in enumerate( nucs ):
+        #print sppNucs, list(sppNucs), dict(sppNucs), sppNucs.values()
+
+    for key in modsCount:
+        print "count", key, modsCount[ key ]
+    print "count NNN", len( invalids )
+
+    return ( valids, invalids, config, refsStrs, sourcesStrs, emptyStr )
+
+
+def filter( high, low, valids, invalids, config, refsStrs, sourcesStrs, emptyStr ):
+    greedy      = config[ 'RILgreedy' ]
+
     # IF IN GREEDY MODE, FIX NUCLEOTIDES FROM VALIDs
     if greedy:
         # COUNT NUCLEOTIDES IN EACH POSITION
@@ -792,9 +878,7 @@ def RILo( config, refsStrs, sourcesStrs, emptyStr ):
             for nucPos, nuc in enumerate( sppseq ):
                 if len( nucs ) <= nucPos:
                     nucs.append( Counter() )
-                nucs[ nucPos ].update( [ nuc ])
-
-
+                nucs[ nucPos ].update( [ nuc ] )
 
         # GENERATE ALTERNATIVE STRING
         altsStrs  = ""
@@ -812,7 +896,7 @@ def RILo( config, refsStrs, sourcesStrs, emptyStr ):
             else:
                 #print 'pos %3d data %-10s ref %s' % ( nucPos, str(nucData), nucRef ),
                 nucData.remove( nucRef )
-                nucAlt  = nucData[0]
+                nucAlt     = nucData[0]
                 #print 'alt', nucAlt
                 altsStrs  += nucAlt
                 refsStrsN += nucRef
@@ -821,47 +905,19 @@ def RILo( config, refsStrs, sourcesStrs, emptyStr ):
         refsStrs = refsStrsN
 
 
-        # CALCULATE PERCENTAGE OF SIMILARITY TO REFERENCE
-        refsims = {}
-
-        for spp in valids:
-            sppseq = sourcesStrs[ spp ]
-            refsim = 0
-
-            for nucPos, nucSeq in enumerate( sppseq ):
-                refSeq = refsStrs[ nucPos ]
-
-                if refSeq == nucSeq:
-                    refsim += 1
 
-            refsims[ spp ] = ( float( refsim ) / float( len( sppseq ) ) )
 
-
-        #print 'REF', refsStrs
-        #print 'ALT', altsStrs
-
-
-
-        # REPLACE SEQUENCES WITH EITHER REFERENCE OR ALTERNATIVE
-        modsCount = defaultdict( int )
         for spp in sourcesStrs:
-            if spp in refsims:
-                if refsims[ spp ] > minsim:
-                    #print " fixing spp %-30s to REF" % ( spp )
-                    sourcesStrs[ spp ]  = refsStrs
-                    modsCount[ 'REF' ] += 1
+            if spp in low:
+                sourcesStrs[ spp ]  = refsStrs
+            #print " fixing spp %-30s to REF" % ( spp )
+            #print " fixing spp %-30s to ALT" % ( spp )
+            #sourcesStrs[ spp ]  = refsStrs
+            #sourcesStrs[ spp ]  = altsStrs
+            elif spp in high:
+                sourcesStrs[ spp ]  = altsStrs
 
-                else:
-                    #print " fixing spp %-30s to ALT" % ( spp )
-                    sourcesStrs[ spp ]  = altsStrs
-                    modsCount[ 'ALT' ] += 1
 
-        #for sppPos, sppNucs in enumerate( nucs ):
-            #print sppNucs, list(sppNucs), dict(sppNucs), sppNucs.values()
-
-        for key in modsCount:
-            print "count", key, modsCount[ key ]
-        print "count NNN", len( invalids )
 
         if config['RILdelete']:
             for spp in sourcesStrs.keys():
@@ -890,12 +946,6 @@ def RILo( config, refsStrs, sourcesStrs, emptyStr ):
 
 
 
-def fixsppname(spp):
-    """
-    Fix species name to be tree printable
-    """
-    return spp.replace(' ', '_').replace('(', '').replace(')', '').replace("'", '')
-
 
 def printfilename(config, chro, coords=False):
     inchr   = config['inchr'  ]
@@ -991,4 +1041,4 @@ def printfilecoords(config, chro):
 
 
 if __name__ == '__main__':
-    main()
+    main(sys.argv[1:])
diff --git a/newest/vcffiltergff.py b/newest/vcffiltergff.py
index 99d307a..6ab51b1 100755
--- a/newest/vcffiltergff.py
+++ b/newest/vcffiltergff.py
@@ -11,12 +11,12 @@ import argparse
 #import multiprocessing
 
 sys.path.insert(0, '.')
-import vcfmerger
+import filemanager
 import vcfconcat
 
 
 
-def main():
+def main(args):
     parser = argparse.ArgumentParser(description='Filters VCF file according to parameters')
     parser.add_argument('-c', '--chrom', '--chromosome', dest='chromosome', default=None, action='store'     , nargs='?',                       type=str, help='Chromosome to filter [all]')
     parser.add_argument('-g', '--gff'                  , dest='gff'       , default=None, action='store'     , nargs='?',                       type=str, help='Gff Coordinate file')
@@ -31,11 +31,11 @@ def main():
     parser.add_argument('-f', '--folder'               , dest='outfolder' , default=None, action='store'     , nargs='?',                       type=str, help='Overrride default output folder name')
     parser.add_argument('-o', '--out', '--output'      , dest='outfile'   , default=None, action='store'     , nargs='?',                       type=str, help='Overrride default output name')
 
-    parser.add_argument('-i', '--input'                , dest='iinput'    , default=None,                                                       type=str, help='Input file')
-    parser.add_argument('input'                        ,                    default=None, action='store'     , nargs='?', metavar='input file', type=str, help='Input file')
+    parser.add_argument('-i', '--input'                , dest='input'     , default=None ,                     nargs='?',                       type=str, help='Input file')
+    #parser.add_argument('input'                        ,                    default=None, action='store'     , nargs='?', metavar='input file', type=str, help='Input file')
 
 
-    options = parser.parse_args()
+    options = parser.parse_args(args)
 
     config = {
         'infile'       : None,
@@ -56,10 +56,16 @@ def main():
 
     print options
 
-    config['infile'   ] = vcfconcat.getOptionInFile(parser, options)
-    config['infile'   ] = vcfmerger.checkfile(config['infile'])
+    config['infile'   ] = options.input
     config['outfile'  ] = options.outfile
 
+    if options.input is None:
+        print "no input file defined"
+        sys.exit(1)
+
+    if not os.path.exists(options.input):
+    print "input file %s does not exist" % options.input
+        sys.exit(1)
 
 
     config['outfolder'] = options.outfolder
@@ -104,9 +110,9 @@ def main():
         """
         Creates index file for VCF file
         """
-        vcfmerger.makeIndexFile( indexFile, config['infile'] )
+        filemanager.makeIndexFile( indexFile, config['infile'] )
 
-    config['idx'] = vcfmerger.readIndex(indexFile)
+    config['idx'] = filemanager.readIndex(indexFile)
 
     if config['ingff'] is not None:
         """
@@ -130,6 +136,11 @@ def main():
     """
     readData(config, verbose=verbose)
 
+    with open(config['outfolder']+'.ok', 'w') as fhd:
+        fhd.write(str(config))
+
+    return config
+
 
 def readData(config, verbose=False):
     """
@@ -141,7 +152,7 @@ def readData(config, verbose=False):
     """
     Creates a VCFMERGER instance to read the VCF file
     """
-    config['infhd']  = vcfmerger.openvcffile(config['infile'], 'r')
+    config['infhd']  = filemanager.openvcffile(config['infile'], 'r')
 
 
     if config['outfile'] is None:
@@ -169,7 +180,7 @@ def readData(config, verbose=False):
     """
     Creates a VCFMERGER to save the output
     """
-    config['oufhd']  = vcfmerger.openvcffile(config['outfn'], 'w')
+    config['oufhd']  = filemanager.openvcffile(config['outfn'], 'w')
     config['oufhd'].write( config['header'] )
 
     runName = "all"
@@ -223,6 +234,7 @@ def readData(config, verbose=False):
     finalChro  = False
     numSnps    = {}
     valids     = {}
+    validst    = 0
 
     for line in config['infhd']:
         """
@@ -251,6 +263,7 @@ def readData(config, verbose=False):
         #src  =     cols[3]
         #dst  =     cols[4]
 
+        #print cols
 
         if lastChro != chro:
             print chro
@@ -345,9 +358,10 @@ def readData(config, verbose=False):
             if chro not in valids:
                 valids[chro] = 0
             valids[chro] += 1
+            validst      += 1
 
             if valids[chro] % 1000 == 0:
-                print "reading data :: %s :: %s %12d SNPs valid" % (runName, chro, valids[chro])
+                print "reading data :: %s :: %s %12d SNPs valid %12d SNPs valid in total" % (runName, chro, valids[chro], validst)
 
             lastPosVal  = posi
 
@@ -356,14 +370,16 @@ def readData(config, verbose=False):
             If separated in different files
             """
 
+            #print "getting last register"
             last_register     = gffreader.get_last_register()
-            last_register_key = ( last_register['lastChrom'], last_register['lastStart'], last_register['lastEnd'], last_register['lastName']  )
+            #print last_register
+            last_register_key = ( last_register['lastChrom'], last_register['lastStart'], last_register['lastEnd'], last_register['lastName'] )
 
-            if last_register['prevStart'] < 0:
+            if last_register['lastStart'] < 0:
                 """
                 If first register (no previous), skip
                 """
-                #print "continuing"
+                print "If first register (no previous), skip. continuing"
                 continue
 
             if config['knife_last_register_key'] != last_register_key:
@@ -408,10 +424,11 @@ def readData(config, verbose=False):
                 knife_out_fn += ".vcf.gz"
 
                 print " saving to knifed %s" % knife_out_fn
+                #sys.exit(1)
 
                 config['knife_last_register_key'] = last_register_key
                 config['knife_out_fn']            = knife_out_fn
-                config['knife_out_fhd']           = vcfmerger.openvcffile(knife_out_fn, 'w')
+                config['knife_out_fhd']           = filemanager.openvcffile(knife_out_fn, 'w')
 
                 knife_extra                       = [
                     "##KNIFE-chromosome=%s" %     last_register_key[0] ,
@@ -478,9 +495,9 @@ class gffReader(object):
 
         gffFileIdx = gffFile + ".idx"
         if not os.path.exists(gffFileIdx):
-            vcfmerger.makeIndexFile(gffFileIdx, gffFile)
+            filemanager.makeIndexFile(gffFileIdx, gffFile)
 
-        self.index      = vcfmerger.readIndex(gffFileIdx)
+        self.index      = filemanager.readIndex(gffFileIdx)
         self.lastChrom  = None
         self.lastName   = "-"
         self.lastStart  = None
@@ -541,7 +558,7 @@ class gffReader(object):
         """
         Open GFF file
         """
-        self.fhd    = vcfmerger.openfile(self.infile, 'r')
+        self.fhd    = filemanager.openfile(self.infile, 'r')
 
     def readRegister(self):
         """
@@ -982,4 +999,4 @@ class fasta(object):
         return self.index[chro]['len']
 
 if __name__ == '__main__':
-    main()
+    main(sys.argv[1:])
diff --git a/newest/vcfmerger.py b/newest/vcfmerger.py
index 79ee15f..df18ab0 100755
--- a/newest/vcfmerger.py
+++ b/newest/vcfmerger.py
@@ -11,6 +11,9 @@ import math
 import copy
 from pprint import pprint as pp
 
+sys.path.insert(0, '.')
+from filemanager import getFh,openvcffile,openfile,checkfile,makeIndexFile,readIndex
+
 #try:
 #    sys.path.insert(0, 'aux/pypy-2.0/pypy-2.0-beta2/pypy-pypy-4b60269153b5/')
 #    from rpython.rlib.jit import JitDriver, purefunction
@@ -132,131 +135,6 @@ def getBits(val):
     return res
 
 
-def checkfile(infile):
-    """
-    Checks whether file exists
-    """
-    if infile is None:
-        print "no input file given"
-        sys.exit(1)
-
-    if isinstance(infile, list):
-        if len(infile) > 1:
-            print "more than one file given"
-            print infile
-            sys.exit(1)
-
-        infile = infile[0]
-
-    if not os.path.exists(infile):
-        print "input file %s does not exists" % infile
-        sys.exit(1)
-
-    return infile
-
-
-def openfile(infile, method, compresslevel=1):
-    """
-    Open file handling if compressed or not
-    """
-    fhd = None
-
-    if infile.endswith('.gz'):
-        fhd = gzip.open(infile, method+'b', compresslevel)
-
-    else:
-        fhd = open(infile, method)
-
-    return fhd
-
-
-def openvcffile(infile, method, **kwargs):
-    """
-    Open vcf file checking for extension and handling compression
-    """
-    fhd = openfile(infile, method, **kwargs)
-    if infile.endswith('.vcf.gz'):
-        pass
-
-    elif infile.endswith('.vcf'):
-        pass
-
-    else:
-        print "unknown file type"
-        sys.exit(1)
-    return fhd
-
-
-def getFh(infile):
-    """
-    Returns file handler. Handles compression given the .gz extension
-    """
-    ifh = None
-    if infile.endswith('gz'):
-        ifh = gzip.open(infile, 'rb')
-    else:
-        ifh = open(infile, 'r')
-    return ifh
-
-
-def makeIndexFile( indexFile, infile ):
-    """
-    Indexes the start position of each chromosome and creates a index file
-    """
-    print "creating index"
-    chroms  = {}
-
-    ifh     = getFh(infile)
-
-    for line in iter(ifh.readline, ''):
-        if line[0] == "#":
-            continue
-
-        line = line.strip()
-        if len(line) == 0: continue
-
-        chro = line.split("\t")[0]
-
-        if chro not in chroms:
-            beginPos     = ifh.tell() - len(line)
-            chroms[chro] = beginPos
-            print "creating index :: chrom %s curr pos %15d begin pos %15d" % (chro, ifh.tell(), beginPos)
-
-    for chro in sorted(chroms):
-        pos = chroms[chro]
-        ifh.seek(pos)
-        print ifh.readline(),
-
-    ifh.close()
-
-    with open(indexFile, 'w') as ofh:
-        for chrom in sorted(chroms):
-            ofh.write( "%s\t%d\n" % ( chrom, chroms[chrom] ) )
-
-
-def readIndex(indexFile):
-    """
-    Reads VCF index file
-    """
-    print "reading index",indexFile
-    chroms = {}
-    with open(indexFile, 'r') as ifh:
-        for line in ifh:
-            line = line.strip()
-            if len(line) == 0: continue
-            print "INDEX LINE", line,
-            chrom, pos = line.strip().split("\t")
-            pos = int(pos)
-            if pos > 0:
-                pos -= 1
-            chroms[chrom] = pos
-            print "reading index :: chrom %s pos %15d" % ( chrom, pos )
-    return chroms
-
-
-
-
-
 
 class vcfResult(object):
     """
@@ -269,6 +147,11 @@ class vcfResult(object):
     excludeSingleton = None
     noncarefiles     = None
     simpliStats      = None
+    prints           =    0
+    printsReal       =    0
+    printsRealLast   =    0
+    print_every      = 1000
+    linelen          =  100
 
     def __init__(self, simplify=SIMP_NO_SIMPLIFICATION, noncarefiles=[]):
         self.result       = None
@@ -278,6 +161,7 @@ class vcfResult(object):
         self.posCount     = None
         self.simplify     = simplify
 
+
         if vcfResult.simpliStats is None:
             vcfResult.simpliStats = {
                 'Heterozygous Dest' : 0,
@@ -303,13 +187,24 @@ class vcfResult(object):
             print "simplifying :: Heterozygous [%3d, %3d] %s" % ( SIMP_EXCL_HETEROZYGOUS, math.log(SIMP_EXCL_HETEROZYGOUS , 2 ), str(vcfResult.excludHET       ) )
             print "simplifying :: Indel        [%3d, %3d] %s" % ( SIMP_EXCL_INDEL       , math.log(SIMP_EXCL_INDEL        , 2 ), str(vcfResult.excludeINDEL    ) )
             print "simplifying :: Singleton    [%3d, %3d] %s" % ( SIMP_EXCL_SINGLETON   , math.log(SIMP_EXCL_SINGLETON    , 2 ), str(vcfResult.excludeSingleton) )
+
+            print 'Progress Legend:'
+            print ' print every       :', vcfResult.print_every
+            print ' Heterozygous Dest : h'
+            print ' Heterozygous Indel: I'
+            print ' Homozygous Indel  : i'
+            print ' Source Indel      : s'
+            print ' Singleton 1       : 1'
+            print ' Singleton 2       : 2'
+            print ' Singleton 3       : 3'
+            print ' Ok                : .'
+
             #sys.exit(0)
 
         #TODO: IMPLEMENT
         if vcfResult.noncarefiles is None:
             vcfResult.noncarefiles = noncarefiles
 
-
     def simplifier(self, srcs):
         """
         Reads each register and simplifies it
@@ -352,6 +247,29 @@ class vcfResult(object):
         return simpl
         #return srcs
 
+    def printprogress(self, msg, key=None, skip=0):
+        vcfResult.prints += 1
+
+        if skip != 0:
+            if key is None:
+                if vcfResult.prints % skip == 0:
+                    sys.stderr.write(msg)
+                    vcfResult.printsReal += 1
+
+            else:
+                if vcfResult.simpliStats[key] % skip == 0:
+                    sys.stderr.write(msg)
+                    vcfResult.printsReal += 1
+        else:
+            sys.stderr.write(msg)
+            vcfResult.printsReal += 1
+
+        if vcfResult.printsReal % vcfResult.linelen == 0 and vcfResult.printsReal != vcfResult.printsRealLast:
+            vcfResult.printsRealLast = vcfResult.printsReal
+            sys.stderr.write(' {:14,d}\n'.format( vcfResult.prints ) )
+
+        sys.stderr.flush()
+
     def __str__(self):
         #SL2.40ch01      2118853 .       T       G       222     .       DP=40;AF1=1;CI95=1,1;DP4=0,0,16,23;MQ=60;FQ=-144        GT:PL:DP:GQ     1/1:255,117,0:39:99
 
@@ -374,13 +292,19 @@ class vcfResult(object):
         #outFileHandle << "\n";
 
 
-        srcs = {}
+        srcs   = {}
+        rcount = 0
         #print "RESULTS", self.result
         for register in self.result:
+            rcount += 1
             if register is None:
                 print "register #%d is empty" % register
                 sys.exit( 1 )
 
+            #if rcount % 100 == 0:
+            #    sys.stderr.write("\n")
+            #    sys.stderr.flush()
+
             chrom    = register['chrom'   ]
             posit    = register['pos'     ]
             sourc    = register['src'     ]
@@ -401,7 +325,7 @@ class vcfResult(object):
             if ( vcfResult.excludHET    ) and ( desti.find(',') != -1 ):
                 vcfResult.simpliStats['Heterozygous Dest'] += 1
                 #print "heretozygous: %s" % desti
-                sys.stderr.write('h')
+                self.printprogress('h', key='Heterozygous Dest', skip=vcfResult.print_every)
                 #return ""
                 continue
 
@@ -412,7 +336,7 @@ class vcfResult(object):
                         if len(alt) > 1:
                             vcfResult.simpliStats['Heterozygous Indel'] += 1
                             #print "has het indel: %s > %s" % ( desti, alt )
-                            sys.stderr.write('I')
+                            self.printprogress('I', key='Heterozygous Indel', skip=vcfResult.print_every)
                             #return ""
                             continue
 
@@ -420,7 +344,7 @@ class vcfResult(object):
                     if ( len( desti ) > 1 ):
                         vcfResult.simpliStats['Homozygous Indel'] += 1
                         #print "has hom indel: %s" % desti
-                        sys.stderr.write('i')
+                        self.printprogress('i', key='Homozygous Indel', skip=vcfResult.print_every)
                         #return ""
                         continue
 
@@ -428,7 +352,7 @@ class vcfResult(object):
                     vcfResult.simpliStats['Source Indel'] += 1
                     # todo: if len(src) != len(tgt)
                     #print "not single nucleotide source: %s" % str(register)
-                    sys.stderr.write('s')
+                    self.printprogress('s', key='Source Indel', skip=vcfResult.print_every)
                     #return ""
                     continue
 
@@ -467,12 +391,13 @@ class vcfResult(object):
             for dst in srcs[src]:
                 nv += 1
                 allspps.extend( srcs[src][dst] )
+
         ns = len( set(allspps) )
 
         if (vcfResult.excludeSingleton) and (ns == 1):
             vcfResult.simpliStats['Singleton 1'] += 1
             #print "singleton ns: %s" % str(srcs)
-            sys.stderr.write('1')
+            self.printprogress('1', key='Singleton 1', skip=vcfResult.print_every)
             return ""
 
         for src in sorted( srcs ):
@@ -493,7 +418,7 @@ class vcfResult(object):
                 if (vcfResult.excludeSingleton) and (nu == 1):
                     vcfResult.simpliStats['Singleton 2'] += 1
                     #print "singleton 1: %s" % str(srcs)
-                    sys.stderr.write('2')
+                    self.printprogress('2', key='Singleton 2', skip=vcfResult.print_every)
                     continue
 
                 files_str = ",".join( files )
@@ -505,11 +430,11 @@ class vcfResult(object):
         if len(restr) == 0:
             vcfResult.simpliStats['Singleton 3'] += 1
             #print "singleton l: %s" % str(srcs)
-            sys.stderr.write('3')
-            
+            self.printprogress('3', key='Singleton 3', skip=vcfResult.print_every)
+
         else:
             vcfResult.simpliStats['Ok'] += 1
-            sys.stderr.write('.')
+            self.printprogress('.', key='Ok', skip=vcfResult.print_every)
 
         return restr
 
@@ -550,15 +475,6 @@ class vcfRegister(dict):
         )
         return res
 
-    #chrom    = None
-    #pos      = None
-    #src      = None
-    #dst      = None
-    #desc     = None
-    #state    = None
-    #filename = None
-    #filedesc = None
-    #filecare = None
 
 
 class vcfFile(object):
@@ -610,19 +526,21 @@ class vcfFile(object):
             if len(cols) == 0:
                 self.next()
                 return
+
             elif currLine[0] == "#":
                 if currLine.startswith('##sources='):
                     self.names = currLine[10:].split(';')
                     pp(self.names)
+
                 elif currLine.startswith('##numsources='):
                     numsources = int(currLine[13:])
                     if numsources != len(self.names):
                         print "error parsing sources"
+                        print "num sources", numsources,"!=",len(self.names),sorted(self.names)
                         sys.exit(1)
                     else:
                         print "num sources:",numsources
 
-
                 self.next()
                 return
 
@@ -632,15 +550,55 @@ class vcfFile(object):
                 self.register['src'  ] =     cols[3]
                 self.register['dst'  ] =     cols[4]
 
-		if len( cols ) > 9:
-	                self.register['desc' ] =     cols[9]
-		else:
-			self.register['desc' ] = ""
+                if len( cols ) > 9:
+                    self.register['desc' ] =     cols[9]
+
+                    try:
+                        #print "has desc"
+                        info  = cols[8].split(':')
+
+                    except:
+                        info  = []
+
+                    #print " info", info
+                    if len(info) > 1:
+                        try:
+                            gtpos = info.index('GT')
+
+                        except:
+                            gtpos = None
+
+                        if gtpos is not None:
+                            #print "  GT pos", gtpos
+                            desc  = self.register['desc' ].split(":")
+                            #print "  desc", desc
+                            if len(info) == len(desc):
+                                #print "   len info == len desc", info, desc
+                                gtinfo = desc[gtpos]
+                                #print "   gtinfo", gtinfo
+                                #print "    gt1", gtinfo[ 0]
+                                #print "    gt2", gtinfo[-1]
+                                if any([gt == '0' for gt in (gtinfo[ 0], gtinfo[-1])]):
+                                    #print "has desc"
+                                    #print " info", info
+                                    #print "  GT pos", gtpos
+                                    #print "  desc", desc
+                                    #print "   len info == len desc", info, desc
+                                    #print "   gtinfo", gtinfo
+                                    #print "    gt1", gtinfo[ 0]
+                                    #print "    gt2", gtinfo[-1]
+                                    #print "   adding src to dst", self.register['src'  ], self.register['dst'  ]
+
+                                    self.register['dst'  ] = ",".join(sorted(list(set([self.register['src'  ]] + self.register['dst'  ].split(",")))))
+                                    #print "   added  src to dst", self.register['dst'  ]
+
+                else:
+                    self.register['desc' ] = ""
 
                 self.register['state'] = self.state
 
             except (RuntimeError, TypeError, NameError):
-		print RuntimeError, TypeError, NameError
+                print RuntimeError, TypeError, NameError
                 print "error getting colums", cols, currLine, "\n"
                 #self.next()
                 #return
@@ -707,7 +665,7 @@ class vcfHeap(object):
         """
         Adds file to be taken track of
         """
-	print "adding file to heap"
+        print "adding file to heap"
         filecare = filecare == '1'
         self.filePos[ (fileName, filedesc, filecare) ] = len( self.fileNames )
 
@@ -716,18 +674,18 @@ class vcfHeap(object):
 
         self.fileNames.append( (fileName, filedesc, filecare) )
 
-	print "creating vcf handler"
+        print "creating vcf handler"
         self.filehandle.append( vcfFile( fileName, filedesc, filecare) );
-	print "created vcf handler"
+        print "created vcf handler"
         self.fileStates.append( True );
         self.numOpenFiles += 1
 
-	print "getting first register"
+        print "getting first register"
         firstRegister = self.getRegister( self.filePos[ ( fileName, filedesc, filecare ) ] );
-	print "got first register"
+        print "got first register"
 
         self.heapHead.append( firstRegister );
-	print "added to heap"
+        print "added to heap"
 
     def getFileNames(self):
         """
@@ -917,6 +875,7 @@ class vcfHeap(object):
         """
         Returns a VCF header
         """
+
         header = """\
 ##fileformat=VCFv4.1
 ##fileDate=%s
@@ -930,18 +889,13 @@ class vcfHeap(object):
 ##INFO=<ID=NU,Number=1,Type=Integer,Description=\"Number of Species Having Source and Target Nucleotides\">
 ##FORMAT=<ID=FI,Number=1,Type=String,Description=\"Source Files\">
 #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tFILENAMES
-""" % (self.ctime, ";".join( self.getFileDesc() ), self.getNumOpenFiles() )
+""" % (self.ctime, ";".join( self.getFileDesc() ), self.getNumFiles() )
         return header
 
 
 
-def main():
-    if len(sys.argv) != 2:
-        print "wrong number of arguments. %s <input list>" % sys.argv[0]
-        sys.exit(1)
-
+def main(incsv):
     data       = vcfHeap()
-    incsv      = sys.argv[ 1 ]
     outfile    = incsv + '.vcf.gz'
 
     if not os.path.exists( incsv ):
@@ -974,13 +928,14 @@ def main():
             print "error parsing"
             sys.exit(1)
 
-        data.addFile(*cols)
+        print cols, cols[:3]
+        data.addFile(*cols[:3])
 
 
 
 
 
-    mfh = openvcffile(outfile, 'w', compresslevel=1)
+    mfh = openvcffile(outfile + '.tmp.vcf.gz', 'w', compresslevel=1)
 
     mfh.write( data.getVcfHeader() )
 
@@ -1002,7 +957,17 @@ def main():
 
     mfh.close()
 
+    os.rename(outfile + '.tmp.vcf.gz', outfile)
+
     print "Finished"
 
+    return outfile
+
 if __name__ == '__main__':
-    main()
+    if len(sys.argv) != 2:
+        print "wrong number of arguments. %s <input list>" % sys.argv[0]
+        sys.exit(1)
+
+    incsv      = sys.argv[ 1 ]
+
+    main(incsv)
diff --git a/scripts/median_polish/median_polish.py b/scripts/median_polish/median_polish.py
index 79c84ad..945adf7 100755
--- a/scripts/median_polish/median_polish.py
+++ b/scripts/median_polish/median_polish.py
@@ -5,6 +5,7 @@ import pylab
 from scipy.stats import norm
 import sys
 import os
+import copy
 import argparse
 
 
@@ -62,7 +63,8 @@ class MedianPolish:
         self.x_axis_name = x_axis_name
         self.y_axis_name = y_axis_name
         self.reorder     = reorder
-
+        self.rounds      = []
+        
         self.get_fn_props()
         if title is None:
             self.title   = os.path.basename( self.fname ).replace( os.path.basename( sys.argv[0] ), "" ).replace( '.', ' ' ).replace( '_', ' ' )[:-4]
@@ -125,8 +127,9 @@ class MedianPolish:
         self.col_effects    = np.zeros(shape=self.tbl.shape[1])
         median_row_effects  = 0
         median_col_effects  = 0
-        iter                = 0
-        sm                  = 0
+        iternum             = 0
+        self.sm             = 0
+        self.sm_delta       = sys.maxint
         prev_sm             = -sys.maxint - 1
         func                = np.median
 
@@ -135,7 +138,7 @@ class MedianPolish:
 
         print "method", method
 
-        while ((iter < max_iterations) & (abs(sm - prev_sm) >= eps)):
+        while ((iternum < max_iterations) & (self.sm_delta >= eps)):
             row_medians         = func( self.tbl  , 1 )
             self.row_effects   += row_medians
             median_row_effects  = func( self.row_effects )
@@ -151,24 +154,27 @@ class MedianPolish:
             self.tbl           -= col_medians
             self.grand_effect  += median_col_effects
 
-            prev_sm             = sm
-            sm                  = np.sum(np.absolute(self.tbl))
-            print "method %7s #%3d prev sm %16.4f sm %16.4f eps %5.4f diff %16.4f valid %r" % ( method, iter, prev_sm, sm, eps, abs(sm - prev_sm), (abs(sm - prev_sm) < eps))
-            iter               += 1
+            self.sm             = np.sum(np.absolute(self.tbl))
+            self.sm_delta       = abs(self.sm - prev_sm)
+            print "method %7s #%3d prev sm %16.4f sm %16.4f eps %5.4f diff %16.4f valid %r" % ( method, iternum, prev_sm, self.sm, eps, self.sm_delta, (self.sm_delta < eps))
+            iternum            += 1
+            prev_sm             = self.sm
 
-            if str(intermediates) != str(False):
+            if intermediates:
                 self.getValid()
-                np.savetxt(intermediates+".median.valids_vals.round.%03d.tsv" % iter, self.res_valid_tbl, delimiter="\t")
-                np.savetxt(intermediates+".median.valids_bool.round.%03d.tsv" % iter, self.res_valid    , delimiter="\t")
+                tables, graphs = self.get_vars()
+                self.rounds.append([copy.deepcopy(tables), copy.deepcopy(graphs)])
+                #np.savetxt(str(intermediates)+".median.valids_vals.round.%03d.tsv" % iternum, self.res_valid_tbl, delimiter="\t")
+                #np.savetxt(str(intermediates)+".median.valids_bool.round.%03d.tsv" % iternum, self.res_valid    , delimiter="\t")
 
         self.getValid()
 
     def getValid(self):
-        threshold          = abs(norm.ppf(self.pval))
+        self.threshold          = abs(norm.ppf(self.pval))
         #print "threshold", threshold
 
-        res_mean           = np.average( self.tbl )
-        res_stddev         = np.std(     self.tbl, dtype=np.float32 )
+        self.res_mean           = np.average( self.tbl )
+        self.res_stddev         = np.std(     self.tbl, dtype=np.float32 )
 
         #print "mean", res_mean, "stddev", res_stddev
 
@@ -180,32 +186,29 @@ class MedianPolish:
         #print "table"
         #print self.tbl
 
-        res_valid_tbl      = self.tbl      - res_mean
+        self.res_valid_tbl      = self.tbl      - self.res_mean
         #print "table - mean"
         #print res_valid_tbl
 
-        res_valid_tbl      = res_valid_tbl / res_stddev
+        self.res_valid_tbl      = self.res_valid_tbl / self.res_stddev
         #print "(table - mean) / stddev"
         #print res_valid_tbl
 
-        res_valid          = np.asarray( res_valid_tbl.copy() )
+        self.res_valid          = np.asarray( self.res_valid_tbl.copy() )
 
         #print "mean tbl\n"  , res_mean_tbl
         #print "stddev tbl\n", res_stddev_tbl
         #print "valid tbl\n" , res_valid_tbl
 
-        for i in range(res_valid_tbl.shape[0]):
-            for j in range(res_valid_tbl.shape[1]):
-                if abs(res_valid[i,j]) > threshold:
-                    if res_valid[i,j] > 0:
-                        res_valid[i,j] = 1
+        for i in range(self.res_valid_tbl.shape[0]):
+            for j in range(self.res_valid_tbl.shape[1]):
+                if abs(self.res_valid[i,j]) > self.threshold:
+                    if self.res_valid[i,j] > 0:
+                        self.res_valid[i,j] = 1
                     else:
-                        res_valid[i,j] = 1
+                        self.res_valid[i,j] = 1
                 else:
-                    res_valid[i,j] = 0
-
-        self.res_valid_tbl = res_valid_tbl
-        self.res_valid     = res_valid
+                    self.res_valid[i,j] = 0
 
     def do_reorder(self, tbl):
         if self.reorder is not None:
@@ -400,52 +403,78 @@ class MedianPolish:
 
     def get_vars(self):
         tables =    [
-                        [ "grand_effect", "grand_effect", 'float'],
-                        [ "col_effects" , "col_effects" , 'list' ],
-                        [ "row_effects" , "row_effects" , 'list' ]
+                        [ "grand_effect", "grand_effect", 'float', copy.deepcopy(self.grand_effect) ],
+                        [ "col_effects" , "col_effects" , 'list' , copy.deepcopy(self.col_effects ) ],
+                        [ "row_effects" , "row_effects" , 'list' , copy.deepcopy(self.row_effects ) ],
+                        [ "sm"          , "sm"          , 'float', copy.deepcopy(self.sm          ) ],
+                        [ "sm_delta"    , "sm_delta"    , 'float', copy.deepcopy(self.sm_delta    ) ],
+                        [ "pval"        , "pval"        , 'float', copy.deepcopy(self.pval        ) ],
+                        [ "threshold"   , "threshold"   , 'float', copy.deepcopy(self.threshold   ) ],
+                        [ "res_mean"    , "res_mean"    , 'float', copy.deepcopy(self.res_mean    ) ],
+                        [ "res_stddev"  , "res_stddev"  , 'float', copy.deepcopy(self.res_stddev  ) ],
                     ]
 
-	#		class property     file name      title                                                    scale name       position  position clean
+        #		         class property     file name      title                                                    scale name       position  position clean
         graphs =    [
-                        [ "tbl_org"      , "original"   , "Raw Values"                                           , "Number of SNPs", 141,     121  ],
-                        [ "tbl"          , "residuals"  , "Residuals"                                            , ""              , 142,     None ],
-                        [ "res_valid_tbl", "valids_vals", "Residuals Norm"                                       , "Sigmas"        , 143,     None ],
-                        [ "res_valid"    , "valids_bool", "Residuals Valids ( Alpha = " + str( self.pval ) + " )", None            , 144,     122  ]
+                        [ "tbl_org"      , "original"   , "Raw Values"                                           , "Number of SNPs", 141,     121  , copy.deepcopy(self.tbl_org      ) ],
+                        [ "tbl"          , "residuals"  , "Residuals"                                            , ""              , 142,     None , copy.deepcopy(self.tbl          ) ],
+                        [ "res_valid_tbl", "valids_vals", "Residuals Norm"                                       , "Sigmas"        , 143,     None , copy.deepcopy(self.res_valid_tbl) ],
+                        [ "res_valid"    , "valids_bool", "Residuals Valids ( Alpha = " + str( self.pval ) + " )", None            , 144,     122  , copy.deepcopy(self.res_valid    ) ]
                     ]
 
         return tables, graphs
 
     def export_tables(self, tables, graphs, basename):
-        for var in tables:
-            valN     = var[0]
-            name     = var[1]
-            eType    = var[2]
-            val      = self.__dict__[ valN ]
-
-            bn = "%s.%s.%s%s"    % ( basename, self.method, name, self.ext )
-
-            print "exporting %s" % bn
-
-            if   eType == 'list':
-                np.savetxt( bn, val, delimiter=self.sep )
-
-            elif eType == 'float':
-                with open(bn, 'w') as fhd:
-                    fhd.write(str(val))
-
-        for var in graphs:
-            valN     = var[0]
-            name     = var[1]
-            name_ptr = var[2]
-            do_scale = var[3]
-            val      = self.__dict__[ valN ]
-
-
-            bn = "%s.%s.%s%s"    % ( basename, self.method, name, self.ext )
-
-            print "exporting %s" % bn
-
-            np.savetxt( bn, val, delimiter=self.sep )
+        gbn = "%s.%s.%s%s"    % ( basename, self.method, 'global', self.ext )
+        print 'Saving global %s' % gbn
+        
+        with open(gbn, 'w') as gbhd:
+            gbhd.write('\n%s\n%s\n'  % ('method'   ,self.method))
+        
+            for var in tables:
+                valN     = var[0]
+                name     = var[1]
+                eType    = var[2]
+                val      = var[3]
+                #val      = self.__dict__[ valN ]
+    
+                bn = "%s.%s.%s%s"    % ( basename, self.method, name, self.ext )
+    
+                print "exporting table %s" % bn
+    
+                gbhd.write('\n%s\n' % name)
+                if   eType == 'list':
+                    np.savetxt( bn  , val, delimiter=self.sep )
+                    np.savetxt( gbhd, val, delimiter=self.sep )
+    
+                elif eType == 'float':
+                    with open(bn, 'w') as fhd:
+                        fhd.write( str(val))
+                        gbhd.write(str(val))
+                        
+                gbhd.write('\n')
+    
+            for var in graphs:
+                valN      = var[0] #class property
+                name      = var[1] #file name
+                #name_ptr  = var[2] #title
+                #do_scale  = var[3] #scale name
+                #position  = var[4] # position
+                #positionC = var[5] # position clean
+                val       = var[6] # val
+                #print "VAL", valN, val
+                #val      = self.__dict__[ valN ]
+    
+    
+                bn = "%s.%s.%s%s"    % ( basename, self.method, name, self.ext )
+    
+                print "exporting graph %s" % bn
+                sys.stdout.flush()
+                
+                gbhd.write( '\n%s\n' % name               )
+                np.savetxt( bn  , val, delimiter=self.sep )
+                np.savetxt( gbhd, val, delimiter=self.sep )
+                gbhd.write( '\n'                          )
 
     def export_graph(self, graphs, basename, epitope=None, font_size_ttl=18, font_size_ax=16,font_size_lbl=18, sz=14, szw=14, szh=14, DPI=1200, x_rotate=0, row_names=[], col_names=[], do_pdf=False ):
         #szw = sz*3*3
@@ -499,10 +528,19 @@ class MedianPolish:
             pylab.clf()
 
     def export( self, basename, font_size_ttl=18, font_size_ax=16, font_size_lbl=18, sz=14, DPI=1200, x_rotate=0, row_names=[], col_names=[], do_pdf=False, do_image=True, do_tables=True ):
+        if len(self.rounds) > 0:
+            for r, v in enumerate(self.rounds):
+                tables, graphs = v
+                self._export( basename + ('_round_%d'%r), tables, graphs, font_size_ttl=font_size_ttl, font_size_ax=font_size_ax, font_size_lbl=font_size_lbl, sz=sz, DPI=DPI, x_rotate=x_rotate, row_names=row_names, col_names=col_names, do_pdf=do_pdf, do_image=do_image, do_tables=do_tables)
+
         tables, graphs = self.get_vars()
+        
+        self._export( basename, tables, graphs, font_size_ttl=font_size_ttl, font_size_ax=font_size_ax, font_size_lbl=font_size_lbl, sz=sz, DPI=DPI, x_rotate=x_rotate, row_names=row_names, col_names=col_names, do_pdf=do_pdf, do_image=do_image, do_tables=do_tables)
 
+    def _export(self, basename, tables, graphs, font_size_ttl=18, font_size_ax=16, font_size_lbl=18, sz=14, DPI=1200, x_rotate=0, row_names=[], col_names=[], do_pdf=False, do_image=True, do_tables=True):   
         if do_tables:
             self.export_tables(tables, graphs, basename)
+            
         else:
             print "SKIPPING EXPORTING TABLES"
 
-- 
GitLab