Commit 76c3b086 authored by sauloal

split scripts

parent dc899ca0
@@ -2,4 +2,5 @@
*.delta
*.tsv
*.pyc
.~*
out*
@@ -6,130 +6,23 @@ import sys
from om_shared import *
cols_to_index = ["QryContigID", "RefContigID", "Orientation"]
group_by = [
["RefContigID", "QryContigID"],
["RefContigID", "RefStartPos"],
["QryContigID", "RefContigID"],
["QryContigID", "Orientation"],
["QryContigID", "XmapEntryID"],
["XmapEntryID", "Confidence" ],
]
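# How these are used (illustrative; IDs are hypothetical): parse_file() below
# builds a two-level lookup per pair, groups["A_B"][a_value][b_value] -> set of
# row positions in `data`, plus flat lookups from cols_to_index, e.g.
# indexer["QryContigID"][qry_id] -> set of row positions. So
#   groups["RefContigID_QryContigID"][1][141]
# holds the position of every alignment of query 141 on reference contig 1.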
def parse_file(infile, valid_fields):
data = []
names = []
seman = {}
types = []
indexer = {}
headers = []
groups = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
ref_maps_from = ""
query_maps_from = ""
for cti in cols_to_index:
indexer[cti] = defaultdict(set)
with open(infile, 'r') as fhd:
for line in fhd:
line = line.strip()
if len(line) == 0:
continue
if line[0] == "#":
headers.append(line)
if line[1] == "h":
line = line[3:]
names = [x.strip() for x in line.split("\t")]
#print "NAMES", names
for p, n in enumerate(names):
seman[n] = p
elif line[1] == "f":
line = line[3:]
types = [x.strip() for x in line.split("\t")]
#for tp in xrange(len(types)):
# t = types[tp]
# if t == "int":
# types[tp] = int
# elif t == "float":
# types[tp] = float
# elif t == "string":
# types[tp] = col_parsers[ names[tp] ]
assert(len(types) == len(names))
#print "TYPES", types
elif "Reference Maps From:" in line:
# Reference Maps From: S_lycopersicum_chromosomes.2.50.BspQI-BbvCI_to_EXP_REFINEFINAL1_r.cmap
# Query Maps From: S_lycopersicum_chromosomes.2.50.BspQI-BbvCI_to_EXP_REFINEFINAL1_q.cmap
ref_maps_from = line[23:].strip()
elif "Query Maps From:" in line:
query_maps_from = line[23:].strip()
continue
cols = [x.strip() for x in line.split("\t") ]
vals = [valid_fields['parsers'][names[p]](cols[p]) for p in xrange(len(cols))]
for ind in indexer:
indexer[ ind ][ vals[seman[ind]] ].add(len(data))
for grp_from, grp_to in group_by:
val_from = vals[seman[grp_from]]
val_to = vals[seman[grp_to ]]
groups[grp_from+'_'+grp_to][val_from][val_to].add(len(data))
data.append(vals)
return data, headers, names, seman, types, indexer, groups, ref_maps_from, query_maps_from
def parse_args(args):
parser = argparse.ArgumentParser(description="Bionano Genomics MAP parser")
parser.add_argument('infile', help="MAP file" )
parser.add_argument('-f' , '--filter' , action='append' , help="Filters [Field:Function(lt, le, eq, ne, ge, gt):Value]")
parser.add_argument('-l' , '--list' , action='store_true' , help="List Values" )
parser.add_argument('-g' , '--count' , action='store_false', help="Do NOT do global count" )
parser.add_argument('-c' , '--conf' , action='store_false', help="Do NOT do confidence stats" )
parser.add_argument('-r' , '--report' , action='store_false', help="Do NOT do report" )
parser.add_argument('-d' , '--delta' , action='store_false', help="Do NOT do delta" )
parser.add_argument('-m' , '--minconf', default=10 , help="Min confidence (default: 10)" )
parser.add_argument( 'infile', help="MAP file" )
parser.add_argument( '-g' , '--count' , action='store_false', help="DO NOT perform global count" )
parser.add_argument( '-c' , '--conf' , action='store_false', help="DO NOT perform confidence stats" )
args = parser.parse_args(args=args)
return args
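# Illustrative invocation (script and file names hypothetical); each -f filter
# takes the form Field:Function:Value, with Function one of lt, le, eq, ne, ge, gt:
#   python om_augmenter.py alignment.xmap -f Confidence:ge:10 -f QryLen:gt:100000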
def main(args):
valid_fields = gen_valid_fields(valid_fields_g)
args = parse_args(args)
valid_fields = gen_valid_fields(valid_fields_g)
if args.list:
print "LIST OF FIELDS"
print "", "\n ".join( ["%-41s: %-6s : %s"% (valid_field_name, valid_fields['types' ][valid_field_name], valid_fields['helps' ][valid_field_name]) for valid_field_name in valid_fields['names' ]] )
sys.exit(0)
infile = args.infile
DO_GLOBAL_COUNT = args.count
DO_CONFIDENCE_STATS = args.conf
DO_REPORT = args.report
DO_DELTA = args.delta
MIN_CONFIDENCE = args.minconf
oufile = infile + ".augmented.tsv"
if not os.path.exists(infile):
print "input file %s does not exists" % infile
@@ -140,21 +33,14 @@ def main(args):
sys.exit(1)
filters = gen_filter(args, valid_fields)
oufile = infile
for field_name, field_operator_name, field_operator, field_value in filters:
oufile += '_' + field_name + '_' + field_operator_name + '_' + str(field_value)
print "saving to %s" % oufile
data, headers, names, seman, types, indexer, groups, ref_maps_from, query_maps_from = parse_file(infile, valid_fields)
print "NAMES" , names
#print "HEADERS", "\n".join( headers )
print "TYPES" , types
#print "HEADERS", "\n".join( headers )
#print "DATA" , data[1]
#print "INDEX", indexer.keys()[0], indexer[indexer.keys()[0]]
@@ -185,14 +71,10 @@ def main(args):
reporter = None
if DO_REPORT:
print "CREATING REPORT:", oufile + ".report.tsv"
reporter = open(oufile + ".report.tsv", "w")
print "CREATING REPORT:", oufile
reporter = open(oufile, "w")
linecount = 0
for RefContigID in sorted(groups["RefContigID_QryContigID"]):
QryContigIDs = groups["RefContigID_QryContigID"][RefContigID]
@@ -212,7 +94,6 @@
num_qry_matches = len( groups["QryContigID_RefContigID"][QryContigID] )
num_orientations = len( set([x["Orientation"] for x in dataVals]) )
num_good_quals = len( set([x["Confidence" ] for x in dataVals if x["Confidence"] >= MIN_CONFIDENCE]))
ref_no_gap_len = sum( [ max(x)-min(x) for x in ref_lens ] )
ref_min_coord = min( [ min(x) for x in ref_lens ] )
@@ -271,7 +152,6 @@
dataVal["_meta_max_confidence_for_qry" ] = max_confidence
dataVal["_meta_max_confidence_for_qry_chrom" ] = max_confidence_chrom
dataVal["_meta_num_good_confidence" ] = num_good_quals
dataVal["_meta_num_orientations" ] = num_orientations
dataVal["_meta_num_qry_matches" ] = num_qry_matches
@@ -280,113 +160,20 @@
dataVal["_meta_proportion_sizes_gapped" ] = (ref_gap_len * 1.0)/ qry_gap_len
dataVal["_meta_proportion_sizes_no_gap" ] = (ref_no_gap_len * 1.0)/ qry_no_gap_len
data[data_pos] = dataVal
filter_res = all([ field_operator( dataVal[field_name], field_value ) for field_name, field_operator_name, field_operator, field_value in filters])
if not filter_res:
continue
if DO_REPORT:
if linecount == 0:
reporter.write("\n".join(headers[:-2]) + "\n\n")
reporter.write("# META_MIN_CONFIDENCE: %d\n\n" % MIN_CONFIDENCE)
reporter.write( "\n".join( [ "# %-39s: %s" % ( x, valid_fields['helps_t'][x] ) for x in valid_fields['names' ] ] ) + "\n\n")
reporter.write("#h " + "\t".join( [ "%-39s" % ( x ) for x in valid_fields['names' ] ] ) + "\n")
reporter.write("#f " + "\t".join( [ "%-39s" % ( valid_fields['types' ][x] ) for x in valid_fields['names' ] ] ) + "\n")
#print " ", " ".join( ["%s %s" % (x, str(dataVal[x])) for x in sorted(dataVal)] )
reporter.write( "\t".join( [ str(dataVal[x]) for x in valid_fields['names' ] ] ) + "\n")
linecount += 1
print
if DO_DELTA:
print "CREATING DELTA: ", oufile + ".delta"
with open(oufile + ".delta", "w") as fhd:
linecount = 0
#fhd.write("/home/assembly/nobackup/mummer/MUMmer3.23/1502/solanum_lycopersicum_heinz/SL2.40ch12.fa /home/assembly/nobackup/mummer/MUMmer3.23/1502/solanum_pennellii_scaffold/final.assembly.fasta\n")
fhd.write("%s %s\n" % (ref_maps_from, query_maps_from))
fhd.write("NUCMER\n")
sum_ref_len = 0
sum_qry_len = 0
done_QryContigID = {}
for RefContigID in sorted(groups["RefContigID_RefStartPos"]):
RefStartPoses = groups["RefContigID_RefStartPos"][RefContigID]
RefLen = 0
for RefStartPosG in sorted(RefStartPoses):
pos_rows = list(RefStartPoses[RefStartPosG])
if linecount == 0:
reporter.write("\n".join(headers[:-2]) + "\n#\n")
reporter.write("# FIELDS:\n")
reporter.write( "\n".join( [ "# %-39s: %s" % ( x, valid_fields['helps_t'][x] ) for x in valid_fields['names' ] ] ) + "\n#\n")
for pos_row_pos in pos_rows:
pos_row = data[pos_row_pos]
filter_res = all([ field_operator( pos_row[field_name], field_value ) for field_name, field_operator_name, field_operator, field_value in filters])
if not filter_res:
continue
QryContigID = pos_row["QryContigID"]
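                        # Offset bookkeeping for concatenated coordinates: a query
                        # gets a global offset (the current sum_qry_len) the first
                        # time it is seen, and reuses that same offset for hits on
                        # other chromosomes, so its coordinates stay consistent
                        # across the whole delta file.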
if QryContigID not in done_QryContigID:
done_QryContigID[QryContigID] = {}
first_time = True
if RefContigID in done_QryContigID[QryContigID]:
continue
else:
if len(done_QryContigID[QryContigID]) == 0:
done_QryContigID[QryContigID][RefContigID] = sum_qry_len
else:
done_QryContigID[QryContigID][RefContigID] = done_QryContigID[QryContigID][done_QryContigID[QryContigID].keys()[0]]
first_time = False
sum_qry_len_local = done_QryContigID[QryContigID][RefContigID]
line_count = 0
qry_rows = list(groups["RefContigID_QryContigID"][RefContigID][QryContigID])
for qry_row_pos in qry_rows:
qry_row = data[qry_row_pos]
filter_res = all([ field_operator( qry_row[field_name], field_value ) for field_name, field_operator_name, field_operator, field_value in filters])
if not filter_res:
continue
RefStartPos = qry_row["RefStartPos"] + sum_ref_len
RefEndPos = qry_row["RefEndPos" ] + sum_ref_len
RefLen = qry_row["RefLen" ]
QryStartPos = qry_row["QryStartPos"] + sum_qry_len_local
QryEndPos = qry_row["QryEndPos" ] + sum_qry_len_local
QryLen = qry_row["QryLen" ]
num_errors = 0
                            sim_errors  = 0
stop_codons = 0
header = [int(RefContigID), int(QryContigID), int(RefLen ), int(QryLen ) ]
                            line   = [int(RefStartPos), int(RefEndPos ), int(QryStartPos), int(QryEndPos), num_errors, sim_errors, stop_codons]
if line_count == 0:
fhd.write(">" + " ".join([str(x) for x in header]) + "\n")
fhd.write( " ".join([str(x) for x in line ]) + "\n0\n")
reporter.write("#h " + "\t".join( [ "%-39s" % ( x ) for x in valid_fields['names' ] ] ) + "\n" )
reporter.write("#f " + "\t".join( [ "%-39s" % ( valid_fields['types' ][x] ) for x in valid_fields['names' ] ] ) + "\n" )
line_count += 1
if first_time:
sum_qry_len += QryLen
#print " ", " ".join( ["%s %s" % (x, str(dataVal[x])) for x in sorted(dataVal)] )
reporter.write( "\t".join( [ str(dataVal[x]) for x in valid_fields['names' ] ] ) + "\n" )
sum_ref_len += RefLen
#break
print
linecount += 1
@@ -394,8 +181,10 @@ if __name__ == '__main__':
if len(sys.argv) ==1:
print "no arguments given"
sys.exit(1)
main(sys.argv[1:])
args = parse_args(sys.argv[1:])
main(args)
"""
# $ cd D:\Plextor\data\Acquisitie\BioNanoGenomics\MyLycopersicumWorkspace_31022015\Imports; C:\Program Files\BioNano Genomics\RefAligner\WindowsRefAligner.exe -f -ref D:\Plextor\data\Acquisitie\BioNanoGenomics\MyLycopersicumWorkspace_31022015\Imports\S_lycopersicum_chromosomes.2.50.BspQI-BbvCI.cmap -i D:\Plextor\data\Acquisitie\BioNanoGenomics\MyLycopersicumWorkspace_31022015\Imports\EXP_REFINEFINAL1.cmap -o S_lycopersicum_chromosomes.2.50.BspQI-BbvCI_to_EXP_REFINEFINAL1 -endoutlier 1e-2 -outlier 1e-4 -extend 1 -FN 0.08 -FP 0.8 -sf 0.2 -sd 0 -sr 0.02 -res 2.9 -resSD 0.7 -mres 2.0 -A 5 -biaswt 0 -M 1 -Mfast 0 -maxmem 2 -T 1e-6 -stdout -stderr
......
#!/usr/bin/python
import os
import sys
from om_shared import *
def parse_args(args):
parser = argparse.ArgumentParser(description="Bionano Genomics MAP parser")
parser.add_argument( 'infile', help="AUGMENTED file" )
parser.add_argument( '-l' , '--list' , action='store_true' , help="List Values" )
parser.add_argument( '-f' , '--filter' , action='append' , help="Filters [Field:Function(lt, le, eq, ne, ge, gt):Value]" )
args = parser.parse_args(args=args)
return args
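# Illustrative invocation (script name hypothetical; assumes the _meta_* fields
# are registered in valid_fields_g), keeping only queries that hit one chromosome:
#   python om_filter.py alignment.xmap.augmented.tsv -f _meta_num_qry_matches:eq:1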
def main(args):
valid_fields = gen_valid_fields(valid_fields_g)
infile = args.infile
if args.list:
print "LIST OF FIELDS"
print "", "\n ".join( ["%-41s: %-6s : %s"% (valid_field_name, valid_fields['types' ][valid_field_name], valid_fields['helps' ][valid_field_name]) for valid_field_name in valid_fields['names' ]] )
sys.exit(0)
if not os.path.exists(infile):
print "input file %s does not exists" % infile
sys.exit(1)
if os.path.isdir(infile):
print "input file %s is a folder" % infile
sys.exit(1)
filters = gen_filter(args, valid_fields)
oufile = infile
for field_name, field_operator_name, field_operator, field_value in filters:
oufile += '_' + field_name + '_' + field_operator_name + '_' + str(field_value)
print "saving to %s" % oufile
data, headers, names, seman, types, indexer, groups, ref_maps_from, query_maps_from = parse_file(infile, valid_fields)
print "NAMES" , names
#print "HEADERS", "\n".join( headers )
print "TYPES" , types
#print "DATA" , data[1]
#print "INDEX", indexer.keys()[0], indexer[indexer.keys()[0]]
print "file has %5d maps and %3d chromosomes" % (len(indexer["QryContigID"]), len(indexer["RefContigID"]))
print "CREATING REPORT:", oufile + ".report.tsv"
reporter = open(oufile + ".report.tsv", "w")
linecount = 0
for RefContigID in sorted(groups["RefContigID_QryContigID"]):
QryContigIDs = groups["RefContigID_QryContigID"][RefContigID]
for QryContigID in sorted(QryContigIDs):
dataPoses = list(groups["RefContigID_QryContigID"][RefContigID][QryContigID])
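            # Rows come out of parse_file() as plain lists; on first touch each is
            # converted in place to a dict keyed by column name (via sqlalchemy's
            # KeyedTuple) so later passes can address fields as row["RefStartPos"].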
for x in dataPoses:
if isinstance(data[x], dict):
pass
else:
data[x] = KeyedTuple(data[x], labels=names)._asdict()
dataVals = [ data[x] for x in dataPoses ]
ref_lens = [ ( x["RefStartPos"], x["RefEndPos"] ) for x in dataVals ]
qry_lens = [ ( x["QryStartPos"], x["QryEndPos"] ) for x in dataVals ]
num_qry_matches = len( groups["QryContigID_RefContigID"][QryContigID] )
num_orientations = len( set([x["Orientation"] for x in dataVals]) )
ref_no_gap_len = sum( [ max(x)-min(x) for x in ref_lens ] )
ref_min_coord = min( [ min(x) for x in ref_lens ] )
ref_max_coord = max( [ max(x) for x in ref_lens ] )
ref_gap_len = ref_max_coord - ref_min_coord
qry_no_gap_len = sum( [ max(x)-min(x) for x in qry_lens ] )
qry_min_coord = min( [ min(x) for x in qry_lens ] )
qry_max_coord = max( [ max(x) for x in qry_lens ] )
qry_gap_len = qry_max_coord - qry_min_coord
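            # Find the query's single best (highest-Confidence) alignment anywhere
            # in the genome; its RefContigID is used below to flag whether the
            # current chromosome is that best hit. This assumes each XmapEntryID
            # carries exactly one Confidence value (hence keys()[0]).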
XmapEntryIDs = groups["QryContigID_XmapEntryID"][QryContigID].keys()
Confidences = [groups["XmapEntryID_Confidence"][x].keys()[0] for x in XmapEntryIDs]
max_confidence = max(Confidences)
confidence_index = Confidences.index(max_confidence)
confidence_qry = XmapEntryIDs[confidence_index]
confidence_pos_set = groups["XmapEntryID_Confidence"][confidence_qry][max_confidence]
confidence_pos = list(confidence_pos_set)[0]
if isinstance(data[confidence_pos], dict):
pass
else:
data[confidence_pos] = KeyedTuple(data[confidence_pos], labels=names)._asdict()
max_confidence_chrom = data[confidence_pos]["RefContigID"]
#print "RefContigID %4d QryContigID %6d" % ( RefContigID, QryContigID )
for data_pos in dataPoses:
dataVal = data[data_pos]
cigar = dataVal["HitEnum"]
cigar_matches, cigar_insertions, cigar_deletions = process_cigar(cigar)
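                # HitEnum is a CIGAR-like string over map labels, e.g. "4M2D2M"
                # (see the sample record in om_shared): 4 matches, 2 deletions,
                # 2 matches; process_cigar() totals each operation type.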
Alignment = dataVal["Alignment"]
                # unpack in the order process_alignment() returns: refs, queries, refs collapses, queries collapses
                alignment_count_refs, alignment_count_queries, alignment_count_refs_colapses, alignment_count_queries_colapses = process_alignment(Alignment)
                dataVal["_meta_alignment_count_queries"          ] = alignment_count_queries
                dataVal["_meta_alignment_count_queries_colapses" ] = alignment_count_queries_colapses
                dataVal["_meta_alignment_count_refs"             ] = alignment_count_refs
                dataVal["_meta_alignment_count_refs_colapses"    ] = alignment_count_refs_colapses
dataVal["_meta_cigar_deletions" ] = cigar_deletions
dataVal["_meta_cigar_insertions" ] = cigar_insertions
dataVal["_meta_cigar_matches" ] = cigar_matches
dataVal["_meta_is_max_confidence_for_qry_chrom" ] = max_confidence_chrom == RefContigID
dataVal["_meta_len_ref_match_gapped" ] = ref_gap_len
dataVal["_meta_len_ref_match_no_gap" ] = ref_no_gap_len
dataVal["_meta_len_qry_match_gapped" ] = qry_gap_len
dataVal["_meta_len_qry_match_no_gap" ] = qry_no_gap_len
dataVal["_meta_max_confidence_for_qry" ] = max_confidence
dataVal["_meta_max_confidence_for_qry_chrom" ] = max_confidence_chrom
dataVal["_meta_num_orientations" ] = num_orientations
dataVal["_meta_num_qry_matches" ] = num_qry_matches
dataVal["_meta_proportion_query_len_gapped" ] = (qry_gap_len * 1.0)/ dataVal["QryLen"]
dataVal["_meta_proportion_query_len_no_gap" ] = (qry_no_gap_len * 1.0)/ dataVal["QryLen"]
dataVal["_meta_proportion_sizes_gapped" ] = (ref_gap_len * 1.0)/ qry_gap_len
dataVal["_meta_proportion_sizes_no_gap" ] = (ref_no_gap_len * 1.0)/ qry_no_gap_len
data[data_pos] = dataVal
filter_res = all([ field_operator( dataVal[field_name], field_value ) for field_name, field_operator_name, field_operator, field_value in filters])
if not filter_res:
continue
if linecount == 0:
reporter.write("\n".join(headers[:-2]) + "\n#\n")
reporter.write("# FILTERS:\n")
                    for field_name, field_operator_name, field_operator, field_value in filters:
                        reporter.write( "# %-39s: %3s : %s\n" % (field_name, field_operator_name, str(field_value) ) )
reporter.write( "\n\n" )
reporter.write("#h " + "\t".join( [ "%-39s" % ( x ) for x in valid_fields['names' ] ] ) + "\n")
reporter.write("#f " + "\t".join( [ "%-39s" % ( valid_fields['types' ][x] ) for x in valid_fields['names' ] ] ) + "\n")
#print " ", " ".join( ["%s %s" % (x, str(dataVal[x])) for x in sorted(dataVal)] )
reporter.write( "\t".join( [ str(dataVal[x]) for x in valid_fields['names' ] ] ) + "\n")
linecount += 1
print
if __name__ == '__main__':
if len(sys.argv) ==1:
print "no arguments given"
sys.exit(1)
args = parse_args(sys.argv[1:])
main(args)
\ No newline at end of file
@@ -11,6 +11,15 @@ from sqlalchemy.util import KeyedTuple
1 141 1 528400.6 571697.5 10672 54237.5 + 6.65 4M2D2M 1439123.5 21805821 1 "(1,34)(2,34)(3,35)(4,36)(5,37)(6,38)(8,38)(9,39)"
"""
cols_to_index = ["QryContigID", "RefContigID", "Orientation"]
group_by = [
["RefContigID", "QryContigID"],
["RefContigID", "RefStartPos"],
["QryContigID", "RefContigID"],
["QryContigID", "Orientation"],
["QryContigID", "XmapEntryID"],
["XmapEntryID", "Confidence" ],
]
re_matches = re.compile("(\d+)M")
re_insertions = re.compile("(\d+)I")
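# These patterns pull run lengths out of HitEnum CIGAR strings, e.g. (values
# illustrative):
#   re_matches.findall("4M2D2M")    -> ["4", "2"]   (4 + 2 = 6 matched labels)
#   re_insertions.findall("2M1I2M") -> ["1"]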
@@ -47,6 +56,8 @@ def col_parse_bool(val):
def process_cigar(cigar):
"""
2M3D1M1D1M1D4M1I2M1D2M1D1M2I2D9M3I3M1D6M1D2M2D1M1D6M1D1M1D1M2D2M2D1M1I1D1M1D5M2D4M2D1M2D2M1D2M1D3M1D1M1D2M3I3D1M1D1M3D2M3D1M2I1D1M2D1M1D1M1I2D3M2I1M1D2M1D1M1D1M2I3D3M3D1M2D1M1D1M1D5M2D12M
@@ -100,6 +111,8 @@ def process_alignment(alignment):
return len(count_refs), len(count_queries), count_refs_colapses, count_queries_colapses
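# Note the return order above: (refs, queries, refs_colapses, queries_colapses);
# callers must unpack in that same order when attaching the _meta_alignment_*
# fields.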
def gen_valid_fields(valid_fields):
valid_fields['names' ] = [None] * len(valid_fields['data'])
valid_fields['parsers'] = {}
@@ -145,6 +158,86 @@ def gen_filter(args, valid_fields):
def parse_file(infile, valid_fields):
data = []
names = []
seman = {}
types = []
indexer = {}
headers = []
groups = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
ref_maps_from = ""
query_maps_from = ""
for cti in cols_to_index:
indexer[cti] = defaultdict(set)
with open(infile, 'r') as fhd:
for line in fhd:
line = line.strip()
if len(line) == 0:
continue
if line[0] == "#":
headers.append(line)
if len(line) == 1:
continue
if line[1] == "h":
line = line[3:]
names = [x.strip() for x in line.split("\t")]
#print "NAMES", names
for p, n in enumerate(names):
seman[n] = p
elif line[1] == "f":
line = line[3:]
types = [x.strip() for x in line.split("\t")]
#for tp in xrange(len(types)):
# t = types[tp]
# if t == "int":
# types[tp] = int
# elif t == "float":
# types[tp] = float
# elif t == "string":
# types[tp] = col_parsers[ names[tp] ]
assert(len(types) == len(names))
#print "TYPES", types
elif "Reference Maps From:" in line:
# Reference Maps From: S_lycopersicum_chromosomes.2.50.BspQI-BbvCI_to_EXP_REFINEFINAL1_r.cmap
# Query Maps From: S_lycopersicum_chromosomes.2.50.BspQI-BbvCI_to_EXP_REFINEFINAL1_q.cmap
ref_maps_from = line[23:].strip()
elif "Query Maps From:" in line:
query_maps_from = line[23:].strip()
continue