Commit 76c3b086 in opticalmapping (Aflitos, Saulo Alves)
Authored Mar 16, 2015 by sauloal

    split scripts

Parent: dc899ca0
Changes: 6
.gitignore

@@ -2,4 +2,5 @@

```
*.delta
*.tsv
*.pyc
.~*
out*
```
om_parser.py → om_augmenter.py
@@ -6,130 +6,23 @@ import sys
```python
from om_shared import *

cols_to_index = ["QryContigID", "RefContigID", "Orientation"]

group_by = [
    ["RefContigID", "QryContigID"],
    ["RefContigID", "RefStartPos"],
    ["QryContigID", "RefContigID"],
    ["QryContigID", "Orientation"],
    ["QryContigID", "XmapEntryID"],
    ["XmapEntryID", "Confidence" ],
]

def parse_file(infile, valid_fields):
    data            = []
    names           = []
    seman           = {}
    types           = []
    indexer         = {}
    headers         = []
    groups          = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    ref_maps_from   = ""
    query_maps_from = ""

    for cti in cols_to_index:
        indexer[cti] = defaultdict(set)

    with open(infile, 'r') as fhd:
        for line in fhd:
            line = line.strip()

            if len(line) == 0:
                continue

            if line[0] == "#":
                headers.append(line)

                if line[1] == "h":
                    line  = line[3:]
                    names = [x.strip() for x in line.split("\t")]
                    #print "NAMES", names
                    for p, n in enumerate(names):
                        seman[n] = p

                elif line[1] == "f":
                    line  = line[3:]
                    types = [x.strip() for x in line.split("\t")]
                    #for tp in xrange(len(types)):
                    #    t = types[tp]
                    #    if   t == "int"   : types[tp] = int
                    #    elif t == "float" : types[tp] = float
                    #    elif t == "string": types[tp] = col_parsers[ names[tp] ]
                    assert(len(types) == len(names))
                    #print "TYPES", types

                elif "Reference Maps From:" in line:
                    # Reference Maps From: S_lycopersicum_chromosomes.2.50.BspQI-BbvCI_to_EXP_REFINEFINAL1_r.cmap
                    # Query Maps From: S_lycopersicum_chromosomes.2.50.BspQI-BbvCI_to_EXP_REFINEFINAL1_q.cmap
                    ref_maps_from = line[23:].strip()

                elif "Query Maps From:" in line:
                    query_maps_from = line[23:].strip()

                continue

            cols = [x.strip() for x in line.split("\t")]
            vals = [valid_fields['parsers'][names[p]](cols[p]) for p in xrange(len(cols))]

            for ind in indexer:
                indexer[ind][vals[seman[ind]]].add(len(data))

            for grp_from, grp_to in group_by:
                val_from = vals[seman[grp_from]]
                val_to   = vals[seman[grp_to]]
                groups[grp_from + '_' + grp_to][val_from][val_to].add(len(data))

            data.append(vals)

    return data, headers, names, seman, types, indexer, groups, ref_maps_from, query_maps_from
```
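For orientation: `groups` is a three-level mapping, addressed as `groups["From_To"][from_value][to_value]`, whose leaves are sets of row indices into `data`, and `indexer` maps a column value straight to the same row indices. A minimal sketch of the shape (hypothetical IDs, not from a real XMAP):

```python
from collections import defaultdict

groups  = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
indexer = {"QryContigID": defaultdict(set)}

# Pretend row 0 of the parsed file aligns query contig 141 to reference contig 1.
groups["RefContigID_QryContigID"][1][141].add(0)
indexer["QryContigID"][141].add(0)

print sorted(groups["RefContigID_QryContigID"][1][141])  # [0]
print sorted(indexer["QryContigID"][141])                # [0]
```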
```python
# NOTE: the diff interleaves the removed (old) and retained (new) option
# definitions below; the duplicated 'infile', '-g' and '-c' entries are the
# two sides of the change, not a bug in either version.
def parse_args(args):
    parser = argparse.ArgumentParser(description="Bionano Genomics MAP parser")
    parser.add_argument('infile',                                 help="MAP file")
    parser.add_argument('-f', '--filter' , action='append'      , help="Filters [Field:Function(lt, le, eq, ne, ge, gt):Value]")
    parser.add_argument('-l', '--list'   , action='store_true'  , help="List Values")
    parser.add_argument('-g', '--count'  , action='store_false' , help="Do NOT do global count")
    parser.add_argument('-c', '--conf'   , action='store_false' , help="Do NOT do confidence stats")
    parser.add_argument('-r', '--report' , action='store_false' , help="Do NOT do report")
    parser.add_argument('-d', '--delta'  , action='store_false' , help="Do NOT do delta")
    parser.add_argument('-m', '--minconf', default=10           , help="Min confidence (default: 10)")
    parser.add_argument('infile',                                 help="MAP file")
    parser.add_argument('-g', '--count'  , action='store_false' , help="DO NOT perform global count")
    parser.add_argument('-c', '--conf'   , action='store_false' , help="DO NOT perform confidence stats")

    args = parser.parse_args(args=args)
    return args
```
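The `-f/--filter` syntax documented above (`Field:Function:Value`) suggests a thin wrapper over the `operator` module. `gen_filter` lives in om_shared.py and its body falls outside this diff, so the following is only a plausible sketch of how such a spec could become the `(name, op_name, op, value)` tuples that `main` iterates over, not the project's actual implementation:

```python
import operator

def parse_filter_spec(spec, parsers):
    # Hypothetical: "Confidence:ge:10" -> ("Confidence", "ge", operator.ge, 10.0)
    field, op_name, raw_value = spec.split(":")
    op    = getattr(operator, op_name)   # one of lt, le, eq, ne, ge, gt
    value = parsers[field](raw_value)    # reuse the column parser for typing
    return field, op_name, op, value

print parse_filter_spec("Confidence:ge:10", {"Confidence": float})
```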
```python
def main(args):
    valid_fields = gen_valid_fields(valid_fields_g)

    args = parse_args(args)

    valid_fields = gen_valid_fields(valid_fields_g)

    if args.list:
        print "LIST OF FIELDS"
        print "", "\n".join(["%-41s: %-6s : %s" % (valid_field_name, valid_fields['types'][valid_field_name], valid_fields['helps'][valid_field_name]) for valid_field_name in valid_fields['names']])
        sys.exit(0)

    infile              = args.infile
    DO_GLOBAL_COUNT     = args.count
    DO_CONFIDENCE_STATS = args.conf
    DO_REPORT           = args.report
    DO_DELTA            = args.delta
    MIN_CONFIDENCE      = args.minconf

    oufile = infile + ".augmented.tsv"

    if not os.path.exists(infile):
        print "input file %s does not exist" % infile
```
@@ -140,21 +33,14 @@ def main(args):
```python
        sys.exit(1)

    filters = gen_filter(args, valid_fields)

    oufile = infile
    for field_name, field_operator_name, field_operator, field_value in filters:
        oufile += '_' + field_name + '_' + field_operator_name + '_' + str(field_value)

    print "saving to %s" % oufile

    data, headers, names, seman, types, indexer, groups, ref_maps_from, query_maps_from = parse_file(infile, valid_fields)

    print "NAMES", names
    #print "HEADERS", "\n".join( headers )
    print "TYPES", types
    #print "HEADERS", "\n".join( headers )
    #print "DATA" , data[1]
    #print "INDEX", indexer.keys()[0], indexer[indexer.keys()[0]]
```
@@ -185,14 +71,10 @@ def main(args):
```python
    reporter = None
    if DO_REPORT:
        print "CREATING REPORT:", oufile + ".report.tsv"
        reporter = open(oufile + ".report.tsv", "w")
    print "CREATING REPORT:", oufile
    reporter = open(oufile, "w")

    linecount = 0
    for RefContigID in sorted(groups["RefContigID_QryContigID"]):
        QryContigIDs = groups["RefContigID_QryContigID"][RefContigID]
```
@@ -212,7 +94,6 @@ def main(args):
```python
            num_qry_matches  = len(groups["QryContigID_RefContigID"][QryContigID])
            num_orientations = len(set([x["Orientation"] for x in dataVals]))
            num_good_quals   = len(set([x["Confidence"] for x in dataVals if x["Confidence"] >= MIN_CONFIDENCE]))

            ref_no_gap_len = sum([max(x) - min(x) for x in ref_lens])
            ref_min_coord  = min([min(x) for x in ref_lens])
```
@@ -271,7 +152,6 @@ def main(args):
```python
                dataVal["_meta_max_confidence_for_qry"]       = max_confidence
                dataVal["_meta_max_confidence_for_qry_chrom"] = max_confidence_chrom
                dataVal["_meta_num_good_confidence"]          = num_good_quals
                dataVal["_meta_num_orientations"]             = num_orientations
                dataVal["_meta_num_qry_matches"]              = num_qry_matches
```
@@ -280,113 +160,20 @@ def main(args):
```python
                dataVal["_meta_proportion_sizes_gapped"] = (ref_gap_len    * 1.0) / qry_gap_len
                dataVal["_meta_proportion_sizes_no_gap"] = (ref_no_gap_len * 1.0) / qry_no_gap_len

                data[data_pos] = dataVal

                filter_res = all([field_operator(dataVal[field_name], field_value) for field_name, field_operator_name, field_operator, field_value in filters])

                if not filter_res:
                    continue

                if DO_REPORT:
                    if linecount == 0:
                        reporter.write("\n".join(headers[:-2]) + "\n\n")
                        reporter.write("# META_MIN_CONFIDENCE: %d\n\n" % MIN_CONFIDENCE)
                        reporter.write("\n".join(["# %-39s: %s" % (x, valid_fields['helps_t'][x]) for x in valid_fields['names']]) + "\n\n")
                        reporter.write("#h " + "\t".join(["%-39s" % (x) for x in valid_fields['names']]) + "\n")
                        reporter.write("#f " + "\t".join(["%-39s" % (valid_fields['types'][x]) for x in valid_fields['names']]) + "\n")

                    #print " ", " ".join( ["%s %s" % (x, str(dataVal[x])) for x in sorted(dataVal)] )
                    reporter.write("\t".join([str(dataVal[x]) for x in valid_fields['names']]) + "\n")

                linecount += 1
    print

    if DO_DELTA:
        print "CREATING DELTA: ", oufile + ".delta"

        with open(oufile + ".delta", "w") as fhd:
            linecount = 0
            #fhd.write("/home/assembly/nobackup/mummer/MUMmer3.23/1502/solanum_lycopersicum_heinz/SL2.40ch12.fa /home/assembly/nobackup/mummer/MUMmer3.23/1502/solanum_pennellii_scaffold/final.assembly.fasta\n")
            fhd.write("%s %s\n" % (ref_maps_from, query_maps_from))
            fhd.write("NUCMER\n")

            sum_ref_len      = 0
            sum_qry_len      = 0
            done_QryContigID = {}

            for RefContigID in sorted(groups["RefContigID_RefStartPos"]):
                RefStartPoses = groups["RefContigID_RefStartPos"][RefContigID]
                RefLen        = 0

                for RefStartPosG in sorted(RefStartPoses):
                    pos_rows = list(RefStartPoses[RefStartPosG])

                    if linecount == 0:
                        reporter.write("\n".join(headers[:-2]) + "\n#\n")
                        reporter.write("# FIELDS:\n")
                        reporter.write("\n".join(["# %-39s: %s" % (x, valid_fields['helps_t'][x]) for x in valid_fields['names']]) + "\n#\n")

                    for pos_row_pos in pos_rows:
                        pos_row = data[pos_row_pos]

                        filter_res = all([field_operator(pos_row[field_name], field_value) for field_name, field_operator_name, field_operator, field_value in filters])

                        if not filter_res:
                            continue

                        QryContigID = pos_row["QryContigID"]

                        if QryContigID not in done_QryContigID:
                            done_QryContigID[QryContigID] = {}

                        first_time = True
                        if RefContigID in done_QryContigID[QryContigID]:
                            continue
                        else:
                            if len(done_QryContigID[QryContigID]) == 0:
                                done_QryContigID[QryContigID][RefContigID] = sum_qry_len
                            else:
                                done_QryContigID[QryContigID][RefContigID] = done_QryContigID[QryContigID][done_QryContigID[QryContigID].keys()[0]]
                                first_time = False

                        sum_qry_len_local = done_QryContigID[QryContigID][RefContigID]

                        line_count = 0
                        qry_rows   = list(groups["RefContigID_QryContigID"][RefContigID][QryContigID])

                        for qry_row_pos in qry_rows:
                            qry_row = data[qry_row_pos]

                            filter_res = all([field_operator(qry_row[field_name], field_value) for field_name, field_operator_name, field_operator, field_value in filters])

                            if not filter_res:
                                continue

                            RefStartPos = qry_row["RefStartPos"] + sum_ref_len
                            RefEndPos   = qry_row["RefEndPos"]   + sum_ref_len
                            RefLen      = qry_row["RefLen"]
                            QryStartPos = qry_row["QryStartPos"] + sum_qry_len_local
                            QryEndPos   = qry_row["QryEndPos"]   + sum_qry_len_local
                            QryLen      = qry_row["QryLen"]

                            num_errors  = 0
                            sim_errors  = 0
                            stop_codons = 0

                            header = [int(RefContigID), int(QryContigID), int(RefLen), int(QryLen)]
                            line   = [int(RefStartPos), int(RefEndPos), int(QryStartPos), int(QryEndPos), num_errors, sim_errors, stop_codons]

                            if line_count == 0:
                                fhd.write(">" + " ".join([str(x) for x in header]) + "\n")

                            fhd.write(" ".join([str(x) for x in line]) + "\n0\n")

                            reporter.write("#h " + "\t".join(["%-39s" % (x) for x in valid_fields['names']]) + "\n")
                            reporter.write("#f " + "\t".join(["%-39s" % (valid_fields['types'][x]) for x in valid_fields['names']]) + "\n")

                            line_count += 1

                        if first_time:
                            sum_qry_len += QryLen

                            #print " ", " ".join( ["%s %s" % (x, str(dataVal[x])) for x in sorted(dataVal)] )
                            reporter.write("\t".join([str(dataVal[x]) for x in valid_fields['names']]) + "\n")

                sum_ref_len += RefLen
                #break

            print
            linecount += 1
```
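The delta block above emits MUMmer's nucmer layout: the two map names, the word NUCMER, then per aligned pair a `>` header (reference ID, query ID, reference length, query length) followed by one coordinate line (ref start/end, query start/end, and three error counters, always zero here since optical maps carry no base-level errors) terminated by a lone 0. Illustrative output with made-up names and coordinates:

```
example_r.cmap example_q.cmap
NUCMER
>1 141 21805821 1439123
10672 54237 528400 571697 0 0 0
0
```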
@@ -394,8 +181,10 @@ if __name__ == '__main__':
```python
if __name__ == '__main__':
    if len(sys.argv) == 1:
        print "no arguments given"
        sys.exit(1)

    main(sys.argv[1:])
    args = parse_args(sys.argv[1:])
    main(args)

"""
# $ cd D:\Plextor\data\Acquisitie\BioNanoGenomics\MyLycopersicumWorkspace_31022015\Imports; C:\Program Files\BioNano Genomics\RefAligner\WindowsRefAligner.exe -f -ref D:\Plextor\data\Acquisitie\BioNanoGenomics\MyLycopersicumWorkspace_31022015\Imports\S_lycopersicum_chromosomes.2.50.BspQI-BbvCI.cmap -i D:\Plextor\data\Acquisitie\BioNanoGenomics\MyLycopersicumWorkspace_31022015\Imports\EXP_REFINEFINAL1.cmap -o S_lycopersicum_chromosomes.2.50.BspQI-BbvCI_to_EXP_REFINEFINAL1 -endoutlier 1e-2 -outlier 1e-4 -extend 1 -FN 0.08 -FP 0.8 -sf 0.2 -sd 0 -sr 0.02 -res 2.9 -resSD 0.7 -mres 2.0 -A 5 -biaswt 0 -M 1 -Mfast 0 -maxmem 2 -T 1e-6 -stdout -stderr
```
om_filter.py (new file, mode 100755)
```python
#!/usr/bin/python

import os
import sys

from om_shared import *

def parse_args(args):
    parser = argparse.ArgumentParser(description="Bionano Genomics MAP parser")
    parser.add_argument('infile',                              help="AUGMENTED file")
    parser.add_argument('-l', '--list'  , action='store_true', help="List Values")
    parser.add_argument('-f', '--filter', action='append'    , help="Filters [Field:Function(lt, le, eq, ne, ge, gt):Value]")

    args = parser.parse_args(args=args)
    return args

def main(args):
    valid_fields = gen_valid_fields(valid_fields_g)

    infile = args.infile

    if args.list:
        print "LIST OF FIELDS"
        print "", "\n".join(["%-41s: %-6s : %s" % (valid_field_name, valid_fields['types'][valid_field_name], valid_fields['helps'][valid_field_name]) for valid_field_name in valid_fields['names']])
        sys.exit(0)

    if not os.path.exists(infile):
        print "input file %s does not exist" % infile
        sys.exit(1)

    if os.path.isdir(infile):
        print "input file %s is a folder" % infile
        sys.exit(1)

    filters = gen_filter(args, valid_fields)

    oufile = infile
    for field_name, field_operator_name, field_operator, field_value in filters:
        oufile += '_' + field_name + '_' + field_operator_name + '_' + str(field_value)

    print "saving to %s" % oufile

    data, headers, names, seman, types, indexer, groups, ref_maps_from, query_maps_from = parse_file(infile, valid_fields)

    print "NAMES", names
    #print "HEADERS", "\n".join( headers )
    print "TYPES", types
    #print "DATA" , data[1]
    #print "INDEX", indexer.keys()[0], indexer[indexer.keys()[0]]

    print "file has %5d maps and %3d chromosomes" % (len(indexer["QryContigID"]), len(indexer["RefContigID"]))

    print "CREATING REPORT:", oufile + ".report.tsv"
    reporter = open(oufile + ".report.tsv", "w")

    linecount = 0
    for RefContigID in sorted(groups["RefContigID_QryContigID"]):
        QryContigIDs = groups["RefContigID_QryContigID"][RefContigID]

        for QryContigID in sorted(QryContigIDs):
            dataPoses = list(groups["RefContigID_QryContigID"][RefContigID][QryContigID])

            for x in dataPoses:
                if isinstance(data[x], dict):
                    pass
                else:
                    data[x] = KeyedTuple(data[x], labels=names)._asdict()

            dataVals = [data[x] for x in dataPoses]

            ref_lens = [(x["RefStartPos"], x["RefEndPos"]) for x in dataVals]
            qry_lens = [(x["QryStartPos"], x["QryEndPos"]) for x in dataVals]

            num_qry_matches  = len(groups["QryContigID_RefContigID"][QryContigID])
            num_orientations = len(set([x["Orientation"] for x in dataVals]))

            ref_no_gap_len = sum([max(x) - min(x) for x in ref_lens])
            ref_min_coord  = min([min(x) for x in ref_lens])
            ref_max_coord  = max([max(x) for x in ref_lens])
            ref_gap_len    = ref_max_coord - ref_min_coord

            qry_no_gap_len = sum([max(x) - min(x) for x in qry_lens])
            qry_min_coord  = min([min(x) for x in qry_lens])
            qry_max_coord  = max([max(x) for x in qry_lens])
            qry_gap_len    = qry_max_coord - qry_min_coord
```
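The gapped and no-gap lengths differ once a query aligns in several blocks: `*_no_gap_len` sums the individual spans, while `*_gap_len` is the envelope from the leftmost to the rightmost matched coordinate. A quick worked example:

```python
ref_lens = [(100, 400), (900, 1000)]   # two alignment blocks on the reference

ref_no_gap_len = sum([max(x) - min(x) for x in ref_lens])   # 300 + 100 = 400
ref_min_coord  = min([min(x) for x in ref_lens])            # 100
ref_max_coord  = max([max(x) for x in ref_lens])            # 1000
ref_gap_len    = ref_max_coord - ref_min_coord              # 900, spanning the gap
```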
```python
            XmapEntryIDs = groups["QryContigID_XmapEntryID"][QryContigID].keys()
            Confidences  = [groups["XmapEntryID_Confidence"][x].keys()[0] for x in XmapEntryIDs]

            max_confidence     = max(Confidences)
            confidence_index   = Confidences.index(max_confidence)
            confidence_qry     = XmapEntryIDs[confidence_index]
            confidence_pos_set = groups["XmapEntryID_Confidence"][confidence_qry][max_confidence]
            confidence_pos     = list(confidence_pos_set)[0]

            if isinstance(data[confidence_pos], dict):
                pass
            else:
                data[confidence_pos] = KeyedTuple(data[confidence_pos], labels=names)._asdict()

            max_confidence_chrom = data[confidence_pos]["RefContigID"]

            #print "RefContigID %4d QryContigID %6d" % ( RefContigID, QryContigID )

            for data_pos in dataPoses:
                dataVal = data[data_pos]

                cigar = dataVal["HitEnum"]
                cigar_matches, cigar_insertions, cigar_deletions = process_cigar(cigar)

                Alignment = dataVal["Alignment"]
                alignment_count_queries, alignment_count_refs, alignment_count_refs_colapses, alignment_count_queries_colapses = process_alignment(Alignment)

                dataVal["_meta_alignment_count_queries"]          = alignment_count_queries
                dataVal["_meta_alignment_count_queries_colapses"] = alignment_count_refs_colapses
                dataVal["_meta_alignment_count_refs"]             = alignment_count_refs
                dataVal["_meta_alignment_count_refs_colapses"]    = alignment_count_queries_colapses
                dataVal["_meta_cigar_deletions"]                  = cigar_deletions
                dataVal["_meta_cigar_insertions"]                 = cigar_insertions
                dataVal["_meta_cigar_matches"]                    = cigar_matches
                dataVal["_meta_is_max_confidence_for_qry_chrom"]  = max_confidence_chrom == RefContigID
                dataVal["_meta_len_ref_match_gapped"]             = ref_gap_len
                dataVal["_meta_len_ref_match_no_gap"]             = ref_no_gap_len
                dataVal["_meta_len_qry_match_gapped"]             = qry_gap_len
                dataVal["_meta_len_qry_match_no_gap"]             = qry_no_gap_len
                dataVal["_meta_max_confidence_for_qry"]           = max_confidence
                dataVal["_meta_max_confidence_for_qry_chrom"]     = max_confidence_chrom
                dataVal["_meta_num_orientations"]                 = num_orientations
                dataVal["_meta_num_qry_matches"]                  = num_qry_matches
                dataVal["_meta_proportion_query_len_gapped"]      = (qry_gap_len    * 1.0) / dataVal["QryLen"]
                dataVal["_meta_proportion_query_len_no_gap"]      = (qry_no_gap_len * 1.0) / dataVal["QryLen"]
                dataVal["_meta_proportion_sizes_gapped"]          = (ref_gap_len    * 1.0) / qry_gap_len
                dataVal["_meta_proportion_sizes_no_gap"]          = (ref_no_gap_len * 1.0) / qry_no_gap_len

                data[data_pos] = dataVal

                filter_res = all([field_operator(dataVal[field_name], field_value) for field_name, field_operator_name, field_operator, field_value in filters])

                if not filter_res:
                    continue

                if linecount == 0:
                    reporter.write("\n".join(headers[:-2]) + "\n#\n")
                    reporter.write("# FILTERS:\n")
                    for field_name, field_operator_name, field_operator, field_value in filters:
                        oufile += '_' + field_name + '_' + field_operator_name + '_' + str(field_value)
                        reporter.write("# %-39s: %3s : %s\n" % (field_name, field_operator_name, str(field_value)))
                    reporter.write("\n\n")
                    reporter.write("#h " + "\t".join(["%-39s" % (x) for x in valid_fields['names']]) + "\n")
                    reporter.write("#f " + "\t".join(["%-39s" % (valid_fields['types'][x]) for x in valid_fields['names']]) + "\n")

                #print " ", " ".join( ["%s %s" % (x, str(dataVal[x])) for x in sorted(dataVal)] )
                reporter.write("\t".join([str(dataVal[x]) for x in valid_fields['names']]) + "\n")

                linecount += 1
    print

if __name__ == '__main__':
    if len(sys.argv) == 1:
        print "no arguments given"
        sys.exit(1)

    args = parse_args(sys.argv[1:])
    main(args)
```
\ No newline at end of file
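A note on the `KeyedTuple` conversion used in `main` above: rows come out of `parse_file` as plain positional lists and are promoted to name-addressable dicts only when first touched, so rows that never pass a filter never pay the conversion cost. The same pattern in isolation (illustrative values):

```python
from sqlalchemy.util import KeyedTuple

names = ["QryContigID", "RefContigID", "Confidence"]
row   = [141, 1, 6.65]

# Promote the positional row to a dict keyed by column name, as main() does.
row = KeyedTuple(row, labels=names)._asdict()
print row["Confidence"]   # 6.65
```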
om_shared.py
@@ -11,6 +11,15 @@ from sqlalchemy.util import KeyedTuple
```python
1	141	1	528400.6	571697.5	10672	54237.5	+	6.65	4M2D2M	1439123.5	21805821	1	"(1,34)(2,34)(3,35)(4,36)(5,37)(6,38)(8,38)(9,39)"
"""

cols_to_index = ["QryContigID", "RefContigID", "Orientation"]

group_by = [
    ["RefContigID", "QryContigID"],
    ["RefContigID", "RefStartPos"],
    ["QryContigID", "RefContigID"],
    ["QryContigID", "Orientation"],
    ["QryContigID", "XmapEntryID"],
    ["XmapEntryID", "Confidence" ],
]

re_matches    = re.compile("(\d+)M")
re_insertions = re.compile("(\d+)I")
```
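Only the `M` and `I` regexes are visible in this hunk, but they imply how counting works on an XMAP `HitEnum` (CIGAR-like) string: pull out every run length and sum per operation. A sketch under that assumption (`re_deletions` is an assumed counterpart, not shown in the diff):

```python
import re

re_matches    = re.compile("(\d+)M")
re_insertions = re.compile("(\d+)I")
re_deletions  = re.compile("(\d+)D")   # assumed; not visible in this hunk

def count_cigar(cigar):
    # Sum run lengths per operation, e.g. "4M2D2M" -> 6 matches, 0 ins, 2 del.
    matches    = sum(int(x) for x in re_matches.findall(cigar))
    insertions = sum(int(x) for x in re_insertions.findall(cigar))
    deletions  = sum(int(x) for x in re_deletions.findall(cigar))
    return matches, insertions, deletions

print count_cigar("4M2D2M")   # (6, 0, 2)
```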
@@ -47,6 +56,8 @@ def col_parse_bool(val):
```python
def process_cigar(cigar):
    """
    2M3D1M1D1M1D4M1I2M1D2M1D1M2I2D9M3I3M1D6M1D2M2D1M1D6M1D1M1D1M2D2M2D1M1I1D1M1D5M2D4M2D1M2D2M1D2M1D3M1D1M1D2M3I3D1M1D1M3D2M3D1M2I1D1M2D1M1D1M1I2D3M2I1M1D2M1D1M1D1M2I3D3M3D1M2D1M1D1M1D5M2D12M
```
@@ -100,6 +111,8 @@ def process_alignment(alignment):
```python
    return len(count_refs), len(count_queries), count_refs_colapses, count_queries_colapses

def gen_valid_fields(valid_fields):
    valid_fields['names']   = [None] * len(valid_fields['data'])
    valid_fields['parsers'] = {}
```
@@ -145,6 +158,86 @@ def gen_filter(args, valid_fields):
```python
def parse_file(infile, valid_fields):
    data            = []
    names           = []
    seman           = {}
    types           = []
    indexer         = {}
    headers         = []
    groups          = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    ref_maps_from   = ""
    query_maps_from = ""

    for cti in cols_to_index:
        indexer[cti] = defaultdict(set)

    with open(infile, 'r') as fhd:
        for line in fhd:
            line = line.strip()

            if len(line) == 0:
                continue

            if line[0] == "#":
                headers.append(line)

                if len(line) == 1:
                    continue

                if line[1] == "h":
                    line  = line[3:]
                    names = [x.strip() for x in line.split("\t")]
                    #print "NAMES", names
                    for p, n in enumerate(names):
                        seman[n] = p

                elif line[1] == "f":
                    line  = line[3:]
                    types = [x.strip() for x in line.split("\t")]
                    #for tp in xrange(len(types)):
                    #    t = types[tp]
                    #    if   t == "int"   : types[tp] = int
                    #    elif t == "float" : types[tp] = float
                    #    elif t == "string": types[tp] = col_parsers[ names[tp] ]
                    assert(len(types) == len(names))
                    #print "TYPES", types

                elif "Reference Maps From:" in line:
                    # Reference Maps From: S_lycopersicum_chromosomes.2.50.BspQI-BbvCI_to_EXP_REFINEFINAL1_r.cmap
                    # Query Maps From: S_lycopersicum_chromosomes.2.50.BspQI-BbvCI_to_EXP_REFINEFINAL1_q.cmap
                    ref_maps_from = line[23:].strip()

                elif "Query Maps From:" in line:
                    query_maps_from = line[23:].strip()

                continue
```
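For reference, the header lines this parser branches on (`#h` for column names, `#f` for column types, plus the `Maps From` lines sliced with `line[23:]`) look roughly like the following in an XMAP file; the column list is abbreviated for illustration and follows the sample row quoted in the module docstring:

```
# XMAP File Version: 0.1
# Reference Maps From: example_r.cmap
# Query Maps From:     example_q.cmap
#h XmapEntryID	QryContigID	RefContigID	QryStartPos	QryEndPos	RefStartPos	RefEndPos	Orientation	Confidence	HitEnum
#f int	int	int	float	float	float	float	string	float	string
1	141	1	528400.6	571697.5	10672.0	54237.5	+	6.65	4M2D2M
```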