Commit 629e8dd6 authored by Aflitos, Saulo Alves's avatar Aflitos, Saulo Alves
Browse files

Saulo implemented a SCONS version of the analysis, making it easier to update...

Saulo implemented a SCONS version of the analysis, making it easier to update certain parts, or go on after a crash.
parent 83c59cbc
import os,sys
import atexit
from glob import glob
from pprint import pprint
import shutil
ONLY_PROCESS_IF_DONT_EXISTS = False
##########################
## CHECKING
##########################
env = Environment()
env.SourceCode(".", None)
#env.Decider('timestamp-newer')
env.Decider('timestamp-match')
#
Decider('timestamp-match')
def my_decider(dependency, target, prev_ni):
return not os.path.exists(str(target))
env.Decider(my_decider)
#############
## Scanner ##
#############
#def bamfile_scan(node, env, path):
# return
#bamscan = Scanner(function = bamfile_scan, skeys = ['.bam'])
#env.Append(SCANNERS = bamscan)
############
ASSEMBLY = "/home/assembly/dev_150/assemblies/allpaths_lg_sample_heinz_raw/sl/data/run/ASSEMBLIES/test/final.assembly.fasta"
PREFIX = "allpaths_lg"
#ASSEMBLY = "/home/assembly/dev_150/assemblies/clc-default/clc_contigs.fa"
#PREFIX = "clc"
#ASSEMBLY = "/home/assembly/dev_150/assemblies/allpaths_lg_sample_heinz_raw_with454/sl/data/run/ASSEMBLIES/test/final.assembly.fasta"
#PREFIX = "allpaths_lg_454"
#ASSEMBLY = "/home/assembly/dev_150/assemblies/S_lycopersicum_scaffolds.2.40.fa"
#PREFIX = "heinz"
#ASSEMBLY = "/home/assembly/progs/fermi/heinz/fmdef.p4.fa"
#PREFIX = "fermi"
indexFile = "SConstruct.cfg"
MIN_PE_INS = 320
MAX_PE_INS = 430
MIN_MP_INS = 2435
MAX_MP_INS = 3555
ESTIMATED_GENOME_SIZE = 760000000
OUTPUT_HEADER = PREFIX + '.FRC.'
def addCommand(cmd, data):
src = data['SOURCE']
tgt = data['TARGET']
allOk = True
for key in data:
val = data[key]
if isinstance(val, list) or str(type(val)) == "<class 'SCons.Node.NodeList'>":
valStr = " ".join([str(x) for x in val])
data[key] = valStr
#print " CONVERTING %s FROM %s TO %s" % (key, val, valStr)
if key == "TARGET":
for fileName in val:
if not os.path.exists(str(val)):
print " TARGET %s DOES NOT EXISTS" % str(fileName)
allOk = False
else:
#print " NOT LIST %s VAL %s ('%s')" % (key, val, type(val))
if key == "TARGET" and not os.path.exists(str(val)):
print " TARGET %s DOES NOT EXISTS" % str(val)
allOk = False
#if allOk and ONLY_PROCESS_IF_DONT_EXISTS:
# if isinstance(tgt, list) or str(type(tgt)) == "<class 'SCons.Node.NodeList'>":
# return tgt
# else:
# return [ tgt ]
cmdStr = cmd % data
print " CMD STRING", cmdStr
print " ", "\n ".join(["%s: %s"%(x, data[x]) for x in data]), "\n\n"
res = env.Command(tgt, src, cmdStr)
Decider('timestamp-match')
return res
#DATABASE
bwtFile = "%s.bwt" % PREFIX
bwtRes = addCommand("bwa index -a is -p %(TARGET)s %(SOURCE)s", {"TARGET": bwtFile, "SOURCE": ASSEMBLY})
#SETUP
data = {}
pairData = {}
with open(indexFile, 'r') as i:
for line in i:
if line[0] == "#": continue
if len(line.strip()) == 0 : continue
fileType, pairName, fileName = line.strip().split("\t")
if fileType not in data:
data[fileType] = {}
data[fileType][fileName] = [[],[]]
if fileType not in pairData:
pairData[fileType] = {}
if pairName not in pairData[fileType]:
pairData[fileType][pairName] = []
pairData[fileType][pairName].append(fileName)
#print "DATA", data
#print "PAIR", pairData
oks = []
resMerge = {}
for fileType in data:
print "FILE TYPE", fileType
for fileName in data[fileType]:
print " FILE NAME", fileName
bn = os.path.basename(fileName).replace('.fastq', '')
mapName = "%s.%s.sai" % ( PREFIX, bn )
res = [ fileName ]
if fileType == "MP":
outName = "rc_%s" % bn
#Convert the mate pair data into paired-end format.
res = addCommand("revseq -sequence %(SOURCE)s -outseq %(OUTFILE)s -notag", {"TARGET": outName+'.fastq', "SOURCE": fileName, 'OUTFILE': outName})
mapName = "%s.rc_%s.sai" % ( PREFIX, bn )
print " ", fileType , "REVSEQ", fileName, ">", [str(x) for x in res]
print " ", fileType , "BWA IN ",[str(x) for x in res ]
mpMap = addCommand("bwa aln -t 80 %(PREFIX)s %(SOURCE)s > %(TARGET)s", {"TARGET": mapName, "SOURCE": res, "PREFIX": PREFIX})
print " ", fileType , "BWA OUT",[str(x) for x in mpMap]
data[fileType][fileName][0] = res
data[fileType][fileName][1] = mpMap
pairBams = []
for pairName in pairData[fileType]:
fqs = []
maps = []
for fileName in pairData[fileType][pairName]:
fqs.extend( data[fileType][fileName][0])
maps.extend(data[fileType][fileName][1])
if pairName != "":
pairName = pairName + '.'
if fileType == "MP":
pairName = 'rc_' + pairName
if fileType == "PE":
pairName = 'PE.' + pairName
print " ", fileType , "FQS ", [str(x) for x in fqs ]
print " ", fileType , "MAPS", [str(x) for x in maps]
# Create SAM files of the paired mappings
outSam = "%s.%ssam" % ( PREFIX, pairName )
inSam = maps+fqs
print " ", fileType , "SAMPE IN ",[str(x) for x in inSam]
resSam = addCommand("bwa sampe -P %(PREFIX)s %(SOURCE)s > %(TARGET)s", {'PREFIX': PREFIX, 'SOURCE': inSam, 'TARGET': outSam})
print " ", fileType , "SAMPE OUT", [str(x) for x in resSam]
Depends( resSam, bwtRes )
# Convert result to BAM
outBam = "%s.%sbam" % ( PREFIX, pairName )
resBam = addCommand("samtools view -S %(SOURCE)s -b > %(TARGET)s", {'TARGET': outBam, 'SOURCE': resSam})
print " ", fileType , "BAM OUT", [str(x) for x in resBam]
# Sort the BAM
outBamSort = outBam.replace('.bam', '') + '.sorted.bam'
outBamSortShort = outBam.replace('.bam', '') + '.sorted'
resBamSort = addCommand("samtools sort %(SOURCE)s %(TARGETSHORT)s", { 'TARGET': outBamSort, 'SOURCE': resBam, 'TARGETSHORT': outBamSortShort })
print " ", fileType , "BAM SORT OUT", [str(x) for x in resBamSort]
# Index the BAM
outBamIndex = outBamSort + '.bai'
resBamSortIndex = addCommand("samtools index %(SOURCE)s", {'TARGET': outBamIndex, 'SOURCE': resBamSort})
print " ", fileType , "BAM SORT INDEX OUT", [str(x) for x in resBamSortIndex]
# create ok file to create dependency
#resAll = resSam+resBam+resBamSort+resBamSortIndex
#print " ", fileType , "RES ALL IN",[str(x) for x in resAll]
#outOk = "%s.%sok" % ( PREFIX, pairName )
#resOk = addCommand("echo %(SOURCE)s > %(TARGET)s", {'TARGET': outOk, 'SOURCE': resAll})
#oks.extend(resOk)
pairBams.extend(resBamSort)
#print " ", fileType , "PAIR BAMS",[str(x) for x in pairBams]
# Create SAM files of the paired mappings
if len(pairBams) > 1:
outMergePair = "%s.%s.bam" % ( PREFIX, fileType )
print " ", fileType , "MERGE PAIR BAMS IN ",[str(x) for x in pairBams ]
outMergePairRes = addCommand("samtools merge -r -1 %(TARGET)s %(SOURCE)s", {'TARGET': outMergePair, 'SOURCE': pairBams})
print " ", fileType , "MERGE PAIR BAMS OUT",[str(x) for x in outMergePairRes]
resMerge[fileType] = outMergePairRes
else:
#print " ", fileType , "MERGE PAIR BAMS OUT",[str(x) for x in outMergePair]
resMerge[fileType] = pairBams
# create ok file to create dependency
#print " RES OKS",[str(x) for x in oks]
#outOkAll = "%s.%s.ok" % ( PREFIX, 'all' )
#resOkAll = addCommand("echo %(SOURCE)s > %(TARGET)s", {'TARGET': outOkAll, 'SOURCE': oks})
# run FRC as per README
#FRC --pe-sam ${PREFIX}.PE.sorted.bam \
# --pe-min-insert $MIN_PE_INS \
# --pe-max-insert $MAX_PE_INS \
# --mp-sam ${PREFIX}.MP.bam \
# --mp-min-insert $MIN_MP_INS \
# --mp-max-insert $MAX_MP_INS \
# --genome-size $ESTIMATED_GENOME_SIZE \
# --output $OUTPUT_HEADER
outFRC = OUTPUT_HEADER + '_FRC.txt'
FCRcmd = "FCR \
--pe-sam %(pesam)s \
--pe-min-insert %(peminins)d \
--pe-max-insert %(pemaxins)d \
--mp-sam %(mpsam)s \
--mp-min-insert %(mpminins)d \
--mp-max-insert %(mpmaxins)d \
--genome-size %(gensize)d \
--output %(output)s" % {
'pesam' : resMerge['PE'][0],
'peminins': MIN_PE_INS,
'pemaxins': MAX_PE_INS,
'mpsam' : resMerge['MP'][0],
'mpminins': MIN_MP_INS,
'mpmaxins': MAX_MP_INS,
'gensize' : ESTIMATED_GENOME_SIZE,
'output' : OUTPUT_HEADER
}
print FCRcmd
allMerged = []
for x in resMerge:
allMerged.extend(resMerge[x])
outMerged = addCommand(FCRcmd, {'TARGET': outFRC, 'SOURCE': allMerged })
# create ok file to create dependency
#outOkEnd = "%s.%s.ok" % ( PREFIX, 'end' )
#resOkEnd = addCommand("echo %(SOURCE)s > %(TARGET)s", {'TARGET': outOkEnd, 'SOURCE': resOkAll + outMerged})
Default(env.Alias('all'))
#env.Alias( 'all', resOkEnd )
env.Alias( 'all', outMerged )
def bf_to_str(bf):
"""Convert an element of GetBuildFailures() to a string
in a useful way."""
import SCons.Errors
if bf is None: # unknown targets product None in list
return '(unknown tgt)'
elif isinstance(bf, SCons.Errors.StopError):
return str(bf)
elif bf.node:
return str(bf.node) + ': ' + bf.errstr + "\nEXECUTOR: " + str(bf.executor) + "\nACTION: " + str(bf.action)
elif bf.filename:
return bf.filename + ': ' + bf.errstr + "\nCOMMAND: " + bf.command + "\nEXECUTOR: " + str(bf.executor) + "\nACTION: " + str(bf.action)
return 'unknown failure: ' + bf.errstr
def build_status():
"""Convert the build status to a 2-tuple, (status, msg)."""
from SCons.Script import GetBuildFailures
bf = GetBuildFailures()
if bf:
# bf is normally a list of build failures; if an element is None,
# it's because of a target that scons doesn't know anything about.
status = 'failed'
failures_message = "\n".join(["Failed building %s" % bf_to_str(x)
for x in bf if x is not None])
else:
# if bf is None, the build completed successfully.
status = 'ok'
failures_message = ''
return (status, failures_message)
def display_build_status():
"""Display the build status. Called by atexit.
Here you could do all kinds of complicated things."""
status, failures_message = build_status()
if status == 'failed':
print "FAILED!!!!" # could display alert, ring bell, etc.
elif status == 'ok':
print "Build succeeded."
print failures_message
atexit.register(display_build_status)
#
#sys.exit(0)
#file type pair name file name
#MP
MP 3018DAAXX_2 /home/assembly/dev_150/sample_heinz/raw/illumina/MP_3000/3018DAAXX_2_f.fastq
MP 3018DAAXX_2 /home/assembly/dev_150/sample_heinz/raw/illumina/MP_3000/3018DAAXX_2_r.fastq
MP 30THBAAXX_2 /home/assembly/dev_150/sample_heinz/raw/illumina/MP_3000/30THBAAXX_2_f.fastq
MP 30THBAAXX_2 /home/assembly/dev_150/sample_heinz/raw/illumina/MP_3000/30THBAAXX_2_r.fastq
#PE
PE /home/assembly/dev_150/sample_heinz/raw/illumina/PE_500/sample_R1_001.64.fastq
PE /home/assembly/dev_150/sample_heinz/raw/illumina/PE_500/sample_R2_001.64.fastq
#!/usr/bin/python
import sys,os
import datetime
ignore_list = ['/bin/echo']
replace = [['/mnt/nexenta/aflit001/nobackup/Data/', ''], ['Data/', ''], ['out/', ''], ['data/', '']]
def parseStatus(status):
#01234567891
# E = exists
# R = exists in repository only
# b = implicit builder
# B = explicit builder
# S = side effect
#01234567891
# P = precious
# A = always build
# C = current
# N = no clean
# H = no cache
#01234567891
statusF = 0
if status[0] == "E": statusF += 1
if status[1] == "R": statusF += 2
if status[2] == "b": statusF += 4
if status[3] == "B": statusF += 4
if status[3] == "S": statusF += 8
if status[4] == "P": statusF += 16
if status[5] == "A": statusF += 32
if status[6] == "C": statusF += 64
if status[7] == "N": statusF += 128
if status[8] == "H": statusF += 256
return statusF
def clean_for_dot(expr):
expr = expr.strip()
for rep in replace:
#print "REPLACING",rep[0],'to',rep[1],'in',expr,
expr = expr.replace(rep[0],rep[1])
#print " BECOMING",expr
expr = expr.replace("/","\/").replace("-","\-").replace("_","\_")
if expr[0] == "[": expr = expr[1: ]
if expr[-1] == "]": expr = expr[:-1]
return expr
class node(object):
def __init__(self,path, status, depth):
self.path = path
self.status = status
self.depth = depth
self.children = []
self.statusF = parseStatus(self.status)
def add_child(self,child):
if not child in self.children: self.children.append(child)
def print_rec(self,depth):
print " "*depth,self.name
for c in self.children:
c.print_rec(depth+1)
def get_name(self):
return self.path
def get_label_short(self):
return self.path
def get_status(self):
return self.statusF
def get_depth(self):
return self.depth
def get_status_color(self):
color = 'white'
if (self.statusF % 2 ) != 0:
color = 'green'
return color
def is_fixed_tool(self):
return self.path in ignore_list
class scons():
def __init__(self):
self.stack = []
self.map = {}
def printOut(self):
o = open('SConstruct.tree.dot', 'w')
def wl(s): o.write("%s\n"%s)
wl("digraph G {")
##wl("""size="100x100" """)
##wl("""rankdir=TB """)
wl("raksep=equally")
#wl("nodesep=0.8")
wl("pagedir=BL")
wl("splines=true")
wl("concentrate=true")
wl("fontname=Helvetica")
wl("remincross=true")
wl("searchsize=100")
#wl("""rankdir=RL """)
#wl("ratio = 1")
#wl("ratio = compress")
wl("ratio = auto")
#Nodes:
for obj in self.map.values():
if not obj.is_fixed_tool():
#wl("""\"%s\" [label=\"%s\", fontsize=16, style=filled, fillcolor=%s]"""%(scons_obj.get_name(),scons_obj.get_label_short(),scons_obj.get_status_color()) )
#wl("""\"%s\" [label=\"%s\", nodesep=0.75, ranksep=0.75, shape=box, weight=1.2, fontsize=16, style=filled, fillcolor=%s]"""%(scons_obj.get_name(),scons_obj.get_label_short(),scons_obj.get_status_color()) )
wl("""\"%s\" [label=\"%s\", group=%s, rank=%s, shape=box, fontsize=24, style=filled, fillcolor=%s]"""%(obj.get_name(),obj.get_label_short(), obj.get_depth(), obj.get_depth(), obj.get_status_color()) )
#Connections:
for obj in self.map.values():
if not obj.is_fixed_tool() :
children = obj.children
if len(children) == 0:
continue
names = []
exts = []
for c in children:
if not c.is_fixed_tool():
name = c.get_name()
ext = os.path.splitext(name)
if len(ext) > 0:
ext = ext[1][1:]
names.append(name)
exts.append(ext)
nameext = zip(exts,names,children)
#sys.stderr.writelines("NAMEEXT 1"+str(nameext))
nameext.sort()
#sys.stderr.writelines("NAMEEXT 2"+str(nameext))
(sortedNames, sortedExts, sortedChildren) = zip( *nameext )
for c in sortedChildren:
if not c.is_fixed_tool():
wl("\"%s\" -> \"%s\""%(c.get_name(), obj.get_name() ) )
#wl("\"%s\" -> \"%s\""%(scons_obj.get_name(), c.get_name() ) )
wl("}")
def initial_filter(datalines):
return [d for d in datalines if d.strip() and d.strip()[0] in ["["] ]
class htmlTree():
def __init__(self):
self.lines = []
def checkRoot(self, str):
if str[0] == "[" and str[-1] == "]":
return False
else:
return True
def add(self, statusStr, pre, pos, nameClean, depth):
status = parseStatus(statusStr)
position = len(self.lines)
self.lines.append([status, pre, nameClean, depth, self.checkRoot(pos)])
def export(self):
lines = []
lines.append("<html>")
#www.perbang.dk/rgbgradient/ AD0000 E5E5FF 12 steps
lines.append("""
<style type="text/css">
p {
margin: 0px;
font-family: monospace;
white-space: nowrap;
display: inline;
}
p.parent { font-weight: 900 }
p.child { font-weight: 100 }
p.run { color: #00AA00 }
p.notrun { color: #FF0000 }
p { display: block }
</style>
<script src="http://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js"></script>
<script type="text/javascript">
function hide(name) {
$('.'+name).hide();
//alert('hiding '+name)
}
function show(name) {
$('.'+name).show();
//alert('showing '+name)
}
</script>
""")
#p.depth01 { background-color: #000000 }
#p.depth02 { background-color: #333333 }
#p.depth03 { background-color: #555555 }
#p.depth04 { background-color: #666666 }
#p.depth05 { background-color: #7A7A7A }
#p.depth06 { background-color: #8B8B8B }
#p.depth07 { background-color: #999999 }
#p.depth08 { background-color: #AAAAAA }
#p.depth09 { background-color: #BBBBBB }
#p.depth10 { background-color: #CCCCCC }
#p.depth11 { background-color: #DDDDDD }
#p.depth12 { background-color: #EEEEEE }
now = str(datetime.datetime.now())
print now
lines.append(" <body>")
lines.append("""
<table>
<tr>
<td>
Hide
</td>
<td>
<input type="button" onclick="hide('parent')" value="parent"/>
</td>
<td>
<input type="button" onclick="hide('child')" value="child"/>
</td>
<td>
<input type="button" onclick="hide('run')" value="run"/>
</td>
<td>
<input type="button" onclick="hide('notrun')" value="not run"/>
</td>
</tr>
<tr>
<td>
Show
</td>
<td>
<input type="button" onclick="show('parent')" value="parent"/>
</td>
<td>
<input type="button" onclick="show('child')" value="child"/>
</td>
<td>
<input type="button" onclick="show('run')" value="run"/>
</td>
<td>
<input type="button" onclick="show('notrun')" value="not run"/>
</td>
</tr>
</table>
""")
lines.append("LABEL:<br/>")
lines.append(' <p class="parent run" >'+'Finished Running - Original File'.replace(' ', '&nbsp;')+'</p>')
lines.append(' <p class="child run" >'+'Finished Running - Copy File'.replace(' ', '&nbsp;')+'</p>')
lines.append(' <p class="parent notrun" >'+'Not Run - Original File'.replace(' ', '&nbsp;')+'</p>')
lines.append(' <p class="child notrun" >'+'Not Run - Copy File'.replace(' ', '&nbsp;')+'</p>')
lines.append(' <p>'+now+'</p>')
#lines.append(' <p >Level:</p>')
#for p in range(1, 13):
# lines.append(' <p class="parent run depth%02d">&nbsp;%02d&nbsp;</p>' % (p ,p))
lines.append(' <br/><br/>')
for line in self.lines:
status = line[0]
pre = line[1]
name = line[2]
depth = line[3]
parent = line[4]
refname = name.replace('\/', '').replace('\\', '').replace('.', '')
name = name.replace('\\', '')
pre = pre.replace(' ', '&nbsp;')
linkStr = ''
parentHood = ''
runStatus = ''