Commit d13faf95 authored by sauloal's avatar sauloal
Browse files

first

parents
*.txt
*.delta
*.tsv
Parser of Bionano Genomics Optical Mapping XMap
~assembly/tomato150/programs/mummer/MUMmer3.23/mummerplot S_lycopersicum_chromosomes.2.50.BspQI_to_EXP_REFINEFINAL1_xmap.txt.delta
#!/usr/bin/python
import os
import sys
from collections import defaultdict
from sqlalchemy.util import KeyedTuple
"""
#h XmapEntryID QryContigID RefContigID QryStartPos QryEndPos RefStartPos RefEndPos Orientation Confidence HitEnum QryLen RefLen LabelChannel Alignment
#f int int int float float float float string float string float float int string
1 141 1 528400.6 571697.5 10672 54237.5 + 6.65 4M2D2M 1439123.5 21805821 1 "(1,34)(2,34)(3,35)(4,36)(5,37)(6,38)(8,38)(9,39)"
"""
PLUS = 0
MINUS = 1
def col_parse_orientation(val):
assert(val in ("+", "-"))
return val
#if val == "+":
# return PLUS
#
#elif val == "-":
# return MINUS
def col_parse_hit_enum(val):
#4M2D2M
return val
def col_parse_alignment(val):
#"(1,34)(2,34)(3,35)(4,36)(5,37)(6,38)(8,38)(9,39)"
val = val.strip('"')
return val
col_parsers = {
"Orientation": col_parse_orientation,
"HitEnum" : col_parse_hit_enum,
"Alignment" : col_parse_alignment
}
min_confidence = 10
cols_to_index = ["QryContigID", "RefContigID", "Orientation"]
group_by = [
["RefContigID", "QryContigID"],
["RefContigID", "RefStartPos"],
["QryContigID", "RefContigID"],
["QryContigID", "Orientation"],
["QryContigID", "XmapEntryID"],
["XmapEntryID", "Confidence" ],
]
def parse_file(infile):
data = []
names = []
seman = {}
types = []
indexer = {}
groups = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
for cti in cols_to_index:
indexer[cti] = defaultdict(set)
with open(infile, 'r') as fhd:
for line in fhd:
line = line.strip()
if len(line) == 0:
continue
if line[0] == "#":
if line[1] == "h":
line = line[3:]
names = [x.strip() for x in line.split("\t")]
#print "NAMES", names
for p, n in enumerate(names):
seman[n] = p
elif line[1] == "f":
line = line[3:]
types = [x.strip() for x in line.split("\t")]
for tp in xrange(len(types)):
t = types[tp]
if t == "int":
types[tp] = int
elif t == "float":
types[tp] = float
elif t == "string":
types[tp] = col_parsers[ names[tp] ]
assert(len(types) == len(names))
#print "TYPES", types
continue
cols = [x.strip() for x in line.split("\t") ]
vals = [types[p](cols[p]) for p in xrange(len(cols))]
for ind in indexer:
indexer[ ind ][ vals[seman[ind]] ].add(len(data))
for grp_from, grp_to in group_by:
val_from = vals[seman[grp_from]]
val_to = vals[seman[grp_to ]]
groups[grp_from+'_'+grp_to][val_from][val_to].add(len(data))
data.append(vals)
return data, names, seman, types, indexer, groups
def main(args):
infile = args[0]
MIN_CONFIDENCE = 0
data, names, seman, types, indexer, groups = parse_file(infile)
print "NAMES", names
print "TYPES", types
#print "INDEX", indexer.keys()[0], indexer[indexer.keys()[0]]
print "DATA" , data[1]
print "file has %5d maps and %3d chromosomes" % (len(indexer["QryContigID"]), len(indexer["RefContigID"]))
if True:
for RefContigID in sorted(groups["RefContigID_QryContigID"]):
print "chromosome %2d has %4d maps" % ( RefContigID, len(groups["RefContigID_QryContigID"][RefContigID]))
if False:
for QryContigID in sorted(groups["QryContigID_RefContigID"]):
print "query %5d maps to %2d chromosomes" % (QryContigID, len(groups["QryContigID_RefContigID"][QryContigID]))
XmapEntryIDs = groups["QryContigID_XmapEntryID"][QryContigID].keys()
Confidences = [groups["XmapEntryID_Confidence"][x].keys()[0] for x in XmapEntryIDs]
print " confidences ", Confidences
max_confidence = max(Confidences)
print " max confidence ", max_confidence
print " max confidence chrom", data[list(groups["XmapEntryID_Confidence"][XmapEntryIDs[Confidences.index(max_confidence)]][max_confidence])[0]][seman["RefContigID"]]
if False:
with open(infile + ".report.tsv", "w") as fhd:
linecount = 0
for RefContigID in sorted(groups["RefContigID_QryContigID"]):
QryContigIDs = groups["RefContigID_QryContigID"][RefContigID]
for QryContigID in sorted(QryContigIDs):
dataPoses = list(groups["RefContigID_QryContigID"][RefContigID][QryContigID])
dataVals = [ KeyedTuple(data[x], labels=names)._asdict() for x in dataPoses ]
num_orientations = len(set([x["Orientation"] for x in dataVals]))
num_qry_matches = len(groups["QryContigID_RefContigID"][QryContigID])
num_good_quals = len(set([x["Confidence" ] for x in dataVals if x["Confidence" ] >= MIN_CONFIDENCE]))
#if num_orientations != 1:
#if num_qry_matches <= 3:
#if num_good_quals >= 1:
print "RefContigID %4d QryContigID %6d" % ( RefContigID, QryContigID )
for dataVal in dataVals:
ref_lens = [(x["RefStartPos"], x["RefEndPos"]) for x in dataVals]
ref_no_gap_len = sum([max(x)-min(x) for x in ref_lens])
ref_min_coord = min([min(x) for x in ref_lens])
ref_max_coord = max([max(x) for x in ref_lens])
ref_gap_len = ref_max_coord - ref_min_coord
qry_lens = [(x["QryStartPos"], x["QryEndPos"]) for x in dataVals]
qry_no_gap_len = sum([max(x)-min(x) for x in qry_lens])
qry_min_coord = min([min(x) for x in qry_lens])
qry_max_coord = max([max(x) for x in qry_lens])
qry_gap_len = qry_max_coord - qry_min_coord
XmapEntryIDs = groups["QryContigID_XmapEntryID"][QryContigID].keys()
Confidences = [groups["XmapEntryID_Confidence"][x].keys()[0] for x in XmapEntryIDs]
max_confidence = max(Confidences)
max_confidence_chrom = data[list(groups["XmapEntryID_Confidence"][XmapEntryIDs[Confidences.index(max_confidence)]][max_confidence])[0]][seman["RefContigID"]]
dataVal["_meta_num_orientations" ] = num_orientations
dataVal["_meta_num_qry_matches" ] = num_qry_matches
dataVal["_meta_num_good_confidence_%d"%MIN_CONFIDENCE] = num_good_quals
dataVal["_meta_len_ref_match_no_gap" ] = ref_no_gap_len
dataVal["_meta_len_ref_match_gapped" ] = ref_gap_len
dataVal["_meta_len_qry_match_no_gap" ] = qry_no_gap_len
dataVal["_meta_len_qry_match_gapped" ] = qry_gap_len
dataVal["_meta_proportion_sizes_no_gap" ] = (ref_no_gap_len * 1.0)/ qry_no_gap_len
dataVal["_meta_proportion_sizes_gapped" ] = (ref_gap_len * 1.0)/ qry_gap_len
dataVal["_meta_proportion_query_len_no_gap" ] = (qry_no_gap_len * 1.0)/ dataVal["QryLen"]
dataVal["_meta_proportion_query_len_gapped" ] = (qry_gap_len * 1.0)/ dataVal["QryLen"]
dataVal["_meta_max_confidence_for_qry" ] = max_confidence
dataVal["_meta_max_confidence_for_qry_chrom" ] = max_confidence_chrom
dataVal["_meta_is_max_confidence_for_qry_chrom" ] = max_confidence_chrom == RefContigID
if linecount == 0:
fhd.write("\t".join(sorted(dataVal)) + "\n")
print " ", " ".join( ["%s %s" % (x, str(dataVal[x])) for x in sorted(dataVal)] )
fhd.write("\t".join( [ str(dataVal[x]) for x in sorted(dataVal)]) + "\n")
linecount += 1
print
if True:
with open(infile + ".delta", "w") as fhd:
linecount = 0
fhd.write("/home/assembly/nobackup/mummer/MUMmer3.23/1502/solanum_lycopersicum_heinz/SL2.40ch12.fa /home/assembly/nobackup/mummer/MUMmer3.23/1502/solanum_pennellii_scaffold/final.assembly.fasta\n")
fhd.write("NUCMER\n")
sum_ref_len = 0
sum_qry_len = 0
done_QryContigID = {}
for RefContigID in sorted(groups["RefContigID_RefStartPos"]):
RefStartPoses = groups["RefContigID_RefStartPos"][RefContigID]
for RefStartPosG in sorted(RefStartPoses):
pos_rows = list(RefStartPoses[RefStartPosG])
for pos_row_pos in pos_rows:
pos_row = data[pos_row_pos]
pos_row = KeyedTuple(pos_row, labels=names)._asdict()
Confidence_row = pos_row["Confidence"]
if ( Confidence_row < MIN_CONFIDENCE ):
continue
QryContigID = pos_row["QryContigID"]
if QryContigID not in done_QryContigID:
done_QryContigID[QryContigID] = {}
first_time = True
if RefContigID in done_QryContigID[QryContigID]:
continue
else:
if len(done_QryContigID[QryContigID]) == 0:
done_QryContigID[QryContigID][RefContigID] = sum_qry_len
else:
done_QryContigID[QryContigID][RefContigID] = done_QryContigID[QryContigID][done_QryContigID[QryContigID].keys()[0]]
first_time = False
sum_qry_len_local = done_QryContigID[QryContigID][RefContigID]
line_count = 0
qry_rows = list(groups["RefContigID_QryContigID"][RefContigID][QryContigID])
for qry_row_pos in qry_rows:
qry_row = data[qry_row_pos]
qry_row = KeyedTuple(qry_row, labels=names)._asdict()
Confidence_qry = qry_row["Confidence"]
if ( Confidence_qry < MIN_CONFIDENCE ):
continue
RefStartPos = qry_row["RefStartPos"] + sum_ref_len
RefEndPos = qry_row["RefEndPos" ] + sum_ref_len
RefLen = qry_row["RefLen" ]
QryStartPos = qry_row["QryStartPos"] + sum_qry_len_local
QryEndPos = qry_row["QryEndPos" ] + sum_qry_len_local
QryLen = qry_row["QryLen" ]
num_errors = 0
sim_erros = 0
stop_codons = 0
header = [int(RefContigID), int(QryContigID), int(RefLen ), int(QryLen ) ]
line = [int(RefStartPos), int(RefEndPos ), int(QryStartPos), int(QryEndPos), num_errors, sim_erros, stop_codons]
if line_count == 0:
fhd.write(">" + " ".join([str(x) for x in header]) + "\n")
fhd.write( " ".join([str(x) for x in line ]) + "\n0\n")
line_count += 1
if first_time:
sum_qry_len += QryLen
sum_ref_len += RefLen
#break
if __name__ == '__main__':
if len(sys.argv) ==1:
print "no arguments given"
sys.exit(1)
main(sys.argv[1:])
"""
# $ cd D:\Plextor\data\Acquisitie\BioNanoGenomics\MyLycopersicumWorkspace_31022015\Imports; C:\Program Files\BioNano Genomics\RefAligner\WindowsRefAligner.exe -f -ref D:\Plextor\data\Acquisitie\BioNanoGenomics\MyLycopersicumWorkspace_31022015\Imports\S_lycopersicum_chromosomes.2.50.BspQI-BbvCI.cmap -i D:\Plextor\data\Acquisitie\BioNanoGenomics\MyLycopersicumWorkspace_31022015\Imports\EXP_REFINEFINAL1.cmap -o S_lycopersicum_chromosomes.2.50.BspQI-BbvCI_to_EXP_REFINEFINAL1 -endoutlier 1e-2 -outlier 1e-4 -extend 1 -FN 0.08 -FP 0.8 -sf 0.2 -sd 0 -sr 0.02 -res 2.9 -resSD 0.7 -mres 2.0 -A 5 -biaswt 0 -M 1 -Mfast 0 -maxmem 2 -T 1e-6 -stdout -stderr
# r3498 $Header: http://svn.bnm.local:81/svn/informatics/RefAligner/branches/3480/RefAligner.cpp 3470 2014-12-17 19:29:21Z tanantharaman $
# FLAGS: USE_SSE=0 USE_AVX=0 USE_MIC=0 USE_PFLOAT=1 USE_RFLOAT=1 DEBUG=1 VERB=1
# XMAP File Version: 0.2
# Label Channels: 1
# Reference Maps From: S_lycopersicum_chromosomes.2.50.BspQI-BbvCI_to_EXP_REFINEFINAL1_r.cmap
# Query Maps From: S_lycopersicum_chromosomes.2.50.BspQI-BbvCI_to_EXP_REFINEFINAL1_q.cmap
#h XmapEntryID QryContigID RefContigID QryStartPos QryEndPos RefStartPos RefEndPos Orientation Confidence HitEnum QryLen RefLen LabelChannel Alignment
#f int int int float float float float string float string float float int string
1 141 1 528400.6 571697.5 10672 54237.5 + 6.65 4M2D2M 1439123.5 21805821 1 "(1,34)(2,34)(3,35)(4,36)(5,37)(6,38)(8,38)(9,39)"
2 174 1 21236.5 1568390 10672 1553561 + 79.35 2M3D1M1D1M1D4M1I2M1D2M1D1M2I2D9M3I3M1D6M1D2M2D1M1D6M1D1M1D1M2D2M2D1M1I1D1M1D5M2D4M2D1M2D2M1D2M1D3M1D1M1D2M3I3D1M1D1M3D2M3D1M2I1D1M2D1M1D1M1I2D3M2I1M1D2M1D1M1D1M2I3D3M3D1M2D1M1D1M1D5M2D12M 1568410 21805821 1 "(1,2)(2,2)(3,3)(6,4)(7,4)(9,5)(11,6)(12,7)(13,8)(14,9)(15,11)(16,12)(18,13)(19,14)(20,15)(21,15)(24,18)(25,19)(26,20)(27,21)(28,22)(29,23)(30,24)(31,25)(32,26)(33,30)(34,31)(35,32)(37,33)(38,34)(39,35)(40,36)(41,37)(42,38)(44,39)(45,40)(47,41)(48,41)(50,42)(51,43)(52,44)(53,45)(54,46)(55,47)(57,48)(59,49)(60,50)(62,50)(63,51)(66,52)(68,54)(69,55)(70,55)(71,56)(72,57)(73,58)(74,59)(76,60)(77,60)(78,61)(79,62)(80,63)(82,64)(83,64)(86,65)(87,66)(89,67)(90,68)(92,69)(93,70)(94,71)(95,72)(96,72)(98,73)(99,74)(103,78)(105,79)(109,80)(110,81)(111,82)(114,82)(116,85)(119,86)(120,87)(121,87)(124,89)(125,90)(126,91)(127,94)(128,95)(129,95)(130,96)(132,97)(134,98)(138,101)(139,102)(140,103)(143,104)(144,104)(146,105)(147,105)(149,106)(151,107)(152,108)(153,109)(154,110)(155,111)(158,112)(159,113)(160,114)(161,115)(162,116)(163,117)(164,118)(165,119)(166,120)(167,121)(168,122)(169,123)"
"""
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment