Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Aflitos, Saulo Alves
opticalmapping
Commits
d26ea393
Commit
d26ea393
authored
Mar 17, 2015
by
sauloal
Browse files
stats post filtering
parent
9a88e383
Changes
6
Hide whitespace changes
Inline
Side-by-side
agp_to_gff.py
View file @
d26ea393
...
...
@@ -4,6 +4,10 @@ import os
import
sys
from
sqlalchemy.util
import
KeyedTuple
#grep -P '#|\tgap' SL2.50ch_from_sc.agp.gff3 > SL2.50ch_from_sc.agp.gff3.gap.gff3
#grep -P '#|\tcontig' SL2.50ch_from_sc.agp.gff3 > SL2.50ch_from_sc.agp.gff3.contig.gff3
#grep -P '#|\tscaffold' SL2.50ch_from_sc.agp.gff3 > SL2.50ch_from_sc.agp.gff3.scaffold.gff3
flags_gap
=
(
'N'
,
'U'
)
col_names_shared
=
[
'object_id'
,
'object_beg'
,
'object_end'
,
'part_number'
,
'component_type'
]
...
...
fasta_gap_to_gff.py
View file @
d26ea393
...
...
@@ -11,7 +11,7 @@ source_type = "gap"
id_prefix
=
"fastagap_"
score
,
orientation
,
phase
=
[
'.'
,
'.'
,
'.'
]
def
parse_seq
(
ofh
,
seq_name
,
seq_seq
):
def
parse_seq
(
ofh
,
seq_name
,
seq_seq
,
min_size
=
1
):
if
len
(
seq_seq
)
==
0
:
return
...
...
@@ -20,13 +20,16 @@ def parse_seq(ofh, seq_name, seq_seq):
print
"saving chromosome"
,
seq_name
,
"len"
,
len
(
seq_seq
)
hit_num
=
1
for
m
in
re_ns
.
finditer
(
seq_seq
.
lower
()):
start_pos
=
m
.
start
()
start_pos
=
m
.
start
()
+
1
end_pos
=
m
.
end
()
match_seq
=
m
.
group
()
diff_pos
=
end_pos
-
start_pos
match_len
=
len
(
match_seq
)
#print seq_name, start_pos, end_pos, diff_pos, match_len, match_seq
if
match_len
<
min_size
:
continue
row_id
=
id_prefix
+
seq_name
+
'_'
+
str
(
hit_num
)
attributes
=
"ID=%s;Name=%s;length=%d"
%
(
row_id
,
row_id
,
diff_pos
)
...
...
@@ -38,13 +41,19 @@ def parse_seq(ofh, seq_name, seq_seq):
hit_num
+=
1
def
main
(
args
):
infasta
=
args
[
0
]
outgff
=
infasta
+
'.gff3'
infasta
=
args
[
0
]
min_size
=
1
if
len
(
args
)
>
1
:
min_size
=
int
(
args
[
1
])
outgff
=
infasta
+
'.gff3'
with
open
(
infasta
,
'r'
)
as
ifh
:
with
open
(
outgff
,
'w'
)
as
ofh
:
ofh
.
write
(
"##gff-version 3
\n
"
)
ofh
.
write
(
"#infile: %s
\n
"
%
infasta
)
ofh
.
write
(
"#infile : %s
\n
"
%
infasta
)
ofh
.
write
(
"#min_size: %d
\n
"
%
min_size
)
seq_name
=
None
seq_seq
=
""
...
...
@@ -56,7 +65,7 @@ def main(args):
if
line
[
0
]
==
">"
:
if
seq_name
is
not
None
:
parse_seq
(
ofh
,
seq_name
,
seq_seq
)
parse_seq
(
ofh
,
seq_name
,
seq_seq
,
min_size
=
min_size
)
seq_seq
=
""
seq_name
=
line
[
1
:]
...
...
@@ -65,7 +74,7 @@ def main(args):
seq_seq
+=
line
if
seq_name
is
not
None
:
parse_seq
(
ofh
,
seq_name
,
seq_seq
)
parse_seq
(
ofh
,
seq_name
,
seq_seq
,
min_size
=
min_size
)
pass
...
...
om_augmenter.py
View file @
d26ea393
...
...
@@ -72,108 +72,51 @@ def main(args):
print
"CREATING REPORT:"
,
oufile
reporter
=
open
(
oufile
,
"w"
)
linecount
=
0
data
=
[
KeyedTuple
(
x
,
labels
=
names
).
_asdict
()
for
x
in
data
]
for
RefContigID
in
sorted
(
groups
[
"RefContigID_QryContigID"
]):
QryContigIDs
=
groups
[
"RefContigID_QryContigID"
][
RefContigID
]
with
open
(
oufile
,
"w"
)
as
reporter
:
reporter
.
write
(
"
\n
"
.
join
(
headers
[:
-
2
])
+
"
\n
#
\n
"
)
reporter
.
write
(
"# FIELDS:
\n
"
)
reporter
.
write
(
"
\n
"
.
join
(
[
"# %-39s: %s"
%
(
x
,
valid_fields
[
'helps_t'
][
x
]
)
for
x
in
valid_fields
[
'names'
]
]
)
+
"
\n
#
\n
"
)
for
QryContigID
in
sorted
(
QryContigIDs
):
dataPoses
=
list
(
groups
[
"RefContigID_QryContigID"
][
RefContigID
][
QryContigID
]
)
reporter
.
write
(
"#h "
+
"
\t
"
.
join
(
[
"%-39s"
%
(
x
)
for
x
in
valid_fields
[
'names'
]
]
)
+
"
\n
"
)
reporter
.
write
(
"#f "
+
"
\t
"
.
join
(
[
"%-39s"
%
(
valid_fields
[
'types'
][
x
]
)
for
x
in
valid_fields
[
'names'
]
]
)
+
"
\n
"
)
for
x
in
dataPoses
:
if
isinstance
(
data
[
x
],
dict
):
pass
else
:
data
[
x
]
=
KeyedTuple
(
data
[
x
],
labels
=
names
).
_asdict
()
dataVals
=
[
data
[
x
]
for
x
in
dataPoses
]
ref_lens
=
[
(
x
[
"RefStartPos"
],
x
[
"RefEndPos"
]
)
for
x
in
dataVals
]
qry_lens
=
[
(
x
[
"QryStartPos"
],
x
[
"QryEndPos"
]
)
for
x
in
dataVals
]
num_qry_matches
=
len
(
groups
[
"QryContigID_RefContigID"
][
QryContigID
]
)
num_orientations
=
len
(
set
([
x
[
"Orientation"
]
for
x
in
dataVals
])
)
ref_no_gap_len
=
sum
(
[
max
(
x
)
-
min
(
x
)
for
x
in
ref_lens
]
)
ref_min_coord
=
min
(
[
min
(
x
)
for
x
in
ref_lens
]
)
ref_max_coord
=
max
(
[
max
(
x
)
for
x
in
ref_lens
]
)
ref_gap_len
=
ref_max_coord
-
ref_min_coord
qry_no_gap_len
=
sum
(
[
max
(
x
)
-
min
(
x
)
for
x
in
qry_lens
]
)
qry_min_coord
=
min
(
[
min
(
x
)
for
x
in
qry_lens
]
)
qry_max_coord
=
max
(
[
max
(
x
)
for
x
in
qry_lens
]
)
qry_gap_len
=
qry_max_coord
-
qry_min_coord
XmapEntryIDs
=
groups
[
"QryContigID_XmapEntryID"
][
QryContigID
].
keys
()
Confidences
=
[
groups
[
"XmapEntryID_Confidence"
][
x
].
keys
()[
0
]
for
x
in
XmapEntryIDs
]
max_confidence
=
max
(
Confidences
)
confidence_index
=
Confidences
.
index
(
max_confidence
)
confidence_qry
=
XmapEntryIDs
[
confidence_index
]
confidence_pos_set
=
groups
[
"XmapEntryID_Confidence"
][
confidence_qry
][
max_confidence
]
confidence_pos
=
list
(
confidence_pos_set
)[
0
]
if
isinstance
(
data
[
confidence_pos
],
dict
):
pass
for
RefContigID
in
sorted
(
groups
[
"RefContigID_QryContigID"
]):
QryContigIDs
=
groups
[
"RefContigID_QryContigID"
][
RefContigID
]
else
:
data
[
confidence_pos
]
=
KeyedTuple
(
data
[
confidence_pos
],
labels
=
names
).
_asdict
()
max_confidence_chrom
=
data
[
confidence_pos
][
"RefContigID"
]
#print "RefContigID %4d QryContigID %6d" % ( RefContigID, QryContigID )
for
data_pos
in
dataPoses
:
dataVal
=
data
[
data_pos
]
cigar
=
dataVal
[
"HitEnum"
]
cigar_matches
,
cigar_insertions
,
cigar_deletions
=
process_cigar
(
cigar
)
Alignment
=
dataVal
[
"Alignment"
]
alignment_count_queries
,
alignment_count_refs
,
alignment_count_refs_colapses
,
alignment_count_queries_colapses
=
process_alignment
(
Alignment
)
dataVal
[
"_meta_alignment_count_queries"
]
=
alignment_count_queries
dataVal
[
"_meta_alignment_count_queries_colapses"
]
=
alignment_count_refs_colapses
dataVal
[
"_meta_alignment_count_refs"
]
=
alignment_count_refs
dataVal
[
"_meta_alignment_count_refs_colapses"
]
=
alignment_count_queries_colapses
dataVal
[
"_meta_cigar_deletions"
]
=
cigar_deletions
dataVal
[
"_meta_cigar_insertions"
]
=
cigar_insertions
dataVal
[
"_meta_cigar_matches"
]
=
cigar_matches
dataVal
[
"_meta_is_max_confidence_for_qry_chrom"
]
=
max_confidence_chrom
==
RefContigID
dataVal
[
"_meta_len_ref_match_gapped"
]
=
ref_gap_len
dataVal
[
"_meta_len_ref_match_no_gap"
]
=
ref_no_gap_len
dataVal
[
"_meta_len_qry_match_gapped"
]
=
qry_gap_len
dataVal
[
"_meta_len_qry_match_no_gap"
]
=
qry_no_gap_len
dataVal
[
"_meta_max_confidence_for_qry"
]
=
max_confidence
dataVal
[
"_meta_max_confidence_for_qry_chrom"
]
=
max_confidence_chrom
dataVal
[
"_meta_num_orientations"
]
=
num_orientations
dataVal
[
"_meta_num_qry_matches"
]
=
num_qry_matches
dataVal
[
"_meta_proportion_query_len_gapped"
]
=
(
qry_gap_len
*
1.0
)
/
dataVal
[
"QryLen"
]
dataVal
[
"_meta_proportion_query_len_no_gap"
]
=
(
qry_no_gap_len
*
1.0
)
/
dataVal
[
"QryLen"
]
dataVal
[
"_meta_proportion_sizes_gapped"
]
=
(
ref_gap_len
*
1.0
)
/
qry_gap_len
dataVal
[
"_meta_proportion_sizes_no_gap"
]
=
(
ref_no_gap_len
*
1.0
)
/
qry_no_gap_len
data
[
data_pos
]
=
dataVal
if
linecount
==
0
:
reporter
.
write
(
"
\n
"
.
join
(
headers
[:
-
2
])
+
"
\n
#
\n
"
)
reporter
.
write
(
"# FIELDS:
\n
"
)
reporter
.
write
(
"
\n
"
.
join
(
[
"# %-39s: %s"
%
(
x
,
valid_fields
[
'helps_t'
][
x
]
)
for
x
in
valid_fields
[
'names'
]
]
)
+
"
\n
#
\n
"
)
for
QryContigID
in
sorted
(
QryContigIDs
):
data_poses
=
list
(
groups
[
"RefContigID_QryContigID"
][
RefContigID
][
QryContigID
])
all_data_poses
=
list
(
indexer
[
"QryContigID"
][
QryContigID
])
data_vals
=
[
data
[
x
]
for
x
in
data_poses
]
stats
=
stats_from_data_vals
(
RefContigID
,
QryContigID
,
groups
,
indexer
,
data
,
data_vals
,
all_data_poses
)
#print "RefContigID %4d QryContigID %6d" % ( RefContigID, QryContigID )
for
data_val
in
data_vals
:
cigar
=
data_val
[
"HitEnum"
]
cigar_matches
,
cigar_insertions
,
cigar_deletions
=
process_cigar
(
cigar
)
reporter
.
write
(
"#h "
+
"
\t
"
.
join
(
[
"%-39s"
%
(
x
)
for
x
in
valid_fields
[
'names'
]
]
)
+
"
\n
"
)
reporter
.
write
(
"#f "
+
"
\t
"
.
join
(
[
"%-39s"
%
(
valid_fields
[
'types'
][
x
]
)
for
x
in
valid_fields
[
'names'
]
]
)
+
"
\n
"
)
#print " ", " ".join( ["%s %s" % (x, str(dataVal[x])) for x in sorted(dataVal)] )
reporter
.
write
(
"
\t
"
.
join
(
[
str
(
dataVal
[
x
])
for
x
in
valid_fields
[
'names'
]
]
)
+
"
\n
"
)
Alignment
=
data_val
[
"Alignment"
]
alignment_count_queries
,
alignment_count_refs
,
alignment_count_refs_colapses
,
alignment_count_queries_colapses
=
process_alignment
(
Alignment
)
for
stat
in
stats
:
data_val
[
stat
]
=
stats
[
stat
]
data_val
[
"_meta_alignment_count_queries"
]
=
alignment_count_queries
data_val
[
"_meta_alignment_count_queries_colapses"
]
=
alignment_count_refs_colapses
data_val
[
"_meta_alignment_count_refs"
]
=
alignment_count_refs
data_val
[
"_meta_alignment_count_refs_colapses"
]
=
alignment_count_queries_colapses
linecount
+=
1
data_val
[
"_meta_cigar_deletions"
]
=
cigar_deletions
data_val
[
"_meta_cigar_insertions"
]
=
cigar_insertions
data_val
[
"_meta_cigar_matches"
]
=
cigar_matches
data_val
[
"_meta_proportion_query_len_gapped"
]
=
(
data_val
[
'_meta_len_qry_match_gapped'
]
*
1.0
)
/
data_val
[
"QryLen"
]
data_val
[
"_meta_proportion_query_len_no_gap"
]
=
(
data_val
[
'_meta_len_qry_match_no_gap'
]
*
1.0
)
/
data_val
[
"QryLen"
]
#print " ", " ".join( ["%s %s" % (x, str(data_val[x])) for x in sorted(data_val)] )
reporter
.
write
(
"
\t
"
.
join
(
[
str
(
data_val
[
x
])
for
x
in
valid_fields
[
'names'
]
]
)
+
"
\n
"
)
...
...
om_filter.py
View file @
d26ea393
...
...
@@ -16,6 +16,7 @@ def parse_args(args):
return
args
def
main
(
args
):
valid_fields
=
gen_valid_fields
(
valid_fields_g
)
infile
=
args
.
infile
...
...
@@ -45,6 +46,7 @@ def main(args):
print
"saving to %s"
%
oufile
data
,
headers
,
names
,
seman
,
types
,
indexer
,
groups
,
ref_maps_from
,
query_maps_from
,
filters_csv
=
parse_file
(
infile
,
valid_fields
)
data
=
[
KeyedTuple
(
x
,
labels
=
names
).
_asdict
()
for
x
in
data
]
print
"NAMES"
,
names
#print "HEADERS", "\n".join( headers )
...
...
@@ -56,122 +58,50 @@ def main(args):
print
"CREATING REPORT:"
,
oufile
+
".report.tsv"
reporter
=
open
(
oufile
+
".report.tsv"
,
"w"
)
linecount
=
0
for
RefContigID
in
sorted
(
groups
[
"RefContigID_QryContigID"
]):
QryContigIDs
=
groups
[
"RefContigID_QryContigID"
][
RefContigID
]
with
open
(
oufile
+
".report.tsv"
,
"w"
)
as
reporter
:
reporter
.
write
(
"
\n
"
.
join
(
headers
[:
-
2
])
+
"
\n
#
\n
"
)
for
QryContigID
in
sorted
(
QryContigIDs
):
dataPoses
=
list
(
groups
[
"RefContigID_QryContigID"
][
RefContigID
][
QryContigID
])
for
x
in
dataPoses
:
if
isinstance
(
data
[
x
],
dict
):
pass
else
:
data
[
x
]
=
KeyedTuple
(
data
[
x
],
labels
=
names
).
_asdict
()
dataVals
=
[
data
[
x
]
for
x
in
dataPoses
]
reporter
.
write
(
"# FILTERS:
\n
"
)
for
field_name
,
field_operator_name
,
field_operator
,
field_value_str
,
field_value
in
filters
:
reporter
.
write
(
"# FILTER : %-39s: %3s : %s
\n
"
%
(
field_name
,
field_operator_name
,
field_value_str
)
)
reporter
.
write
(
"
\n\n
"
)
reporter
.
write
(
"#h "
+
"
\t
"
.
join
(
[
"%-39s"
%
(
x
)
for
x
in
valid_fields
[
'names'
]
]
)
+
"
\n
"
)
reporter
.
write
(
"#f "
+
"
\t
"
.
join
(
[
"%-39s"
%
(
valid_fields
[
'types'
][
x
]
)
for
x
in
valid_fields
[
'names'
]
]
)
+
"
\n
"
)
ref_lens
=
[
(
x
[
"RefStartPos"
],
x
[
"RefEndPos"
]
)
for
x
in
dataVals
]
qry_lens
=
[
(
x
[
"QryStartPos"
],
x
[
"QryEndPos"
]
)
for
x
in
dataVals
]
num_qry_matches
=
len
(
groups
[
"QryContigID_RefContigID"
][
QryContigID
]
)
num_orientations
=
len
(
set
([
x
[
"Orientation"
]
for
x
in
dataVals
])
)
ref_no_gap_len
=
sum
(
[
max
(
x
)
-
min
(
x
)
for
x
in
ref_lens
]
)
ref_min_coord
=
min
(
[
min
(
x
)
for
x
in
ref_lens
]
)
ref_max_coord
=
max
(
[
max
(
x
)
for
x
in
ref_lens
]
)
ref_gap_len
=
ref_max_coord
-
ref_min_coord
qry_no_gap_len
=
sum
(
[
max
(
x
)
-
min
(
x
)
for
x
in
qry_lens
]
)
qry_min_coord
=
min
(
[
min
(
x
)
for
x
in
qry_lens
]
)
qry_max_coord
=
max
(
[
max
(
x
)
for
x
in
qry_lens
]
)
qry_gap_len
=
qry_max_coord
-
qry_min_coord
for
RefContigID
in
sorted
(
groups
[
"RefContigID_QryContigID"
]):
QryContigIDs
=
groups
[
"RefContigID_QryContigID"
][
RefContigID
]
XmapEntryIDs
=
groups
[
"QryContigID_XmapEntryID"
][
QryContigID
].
keys
()
Confidences
=
[
groups
[
"XmapEntryID_Confidence"
][
x
].
keys
()[
0
]
for
x
in
XmapEntryIDs
]
max_confidence
=
max
(
Confidences
)
confidence_index
=
Confidences
.
index
(
max_confidence
)
confidence_qry
=
XmapEntryIDs
[
confidence_index
]
confidence_pos_set
=
groups
[
"XmapEntryID_Confidence"
][
confidence_qry
][
max_confidence
]
confidence_pos
=
list
(
confidence_pos_set
)[
0
]
if
isinstance
(
data
[
confidence_pos
],
dict
):
pass
else
:
data
[
confidence_pos
]
=
KeyedTuple
(
data
[
confidence_pos
],
labels
=
names
).
_asdict
()
max_confidence_chrom
=
data
[
confidence_pos
][
"RefContigID"
]
#print "RefContigID %4d QryContigID %6d" % ( RefContigID, QryContigID )
for
data_pos
in
dataPoses
:
dataVal
=
data
[
data_pos
]
cigar
=
dataVal
[
"HitEnum"
]
cigar_matches
,
cigar_insertions
,
cigar_deletions
=
process_cigar
(
cigar
)
for
QryContigID
in
sorted
(
QryContigIDs
):
data_poses
=
list
(
groups
[
"RefContigID_QryContigID"
][
RefContigID
][
QryContigID
])
data_vals
=
[]
valid_data_poses
=
[]
Alignment
=
dataVal
[
"Alignment"
]
alignment_count_queries
,
alignment_count_refs
,
alignment_count_refs_colapses
,
alignment_count_queries_colapses
=
process_alignment
(
Alignment
)
dataVal
[
"_meta_alignment_count_queries"
]
=
alignment_count_queries
dataVal
[
"_meta_alignment_count_queries_colapses"
]
=
alignment_count_refs_colapses
dataVal
[
"_meta_alignment_count_refs"
]
=
alignment_count_refs
dataVal
[
"_meta_alignment_count_refs_colapses"
]
=
alignment_count_queries_colapses
dataVal
[
"_meta_cigar_deletions"
]
=
cigar_deletions
dataVal
[
"_meta_cigar_insertions"
]
=
cigar_insertions
dataVal
[
"_meta_cigar_matches"
]
=
cigar_matches
dataVal
[
"_meta_is_max_confidence_for_qry_chrom"
]
=
max_confidence_chrom
==
RefContigID
dataVal
[
"_meta_len_ref_match_gapped"
]
=
ref_gap_len
dataVal
[
"_meta_len_ref_match_no_gap"
]
=
ref_no_gap_len
dataVal
[
"_meta_len_qry_match_gapped"
]
=
qry_gap_len
dataVal
[
"_meta_len_qry_match_no_gap"
]
=
qry_no_gap_len
dataVal
[
"_meta_max_confidence_for_qry"
]
=
max_confidence
dataVal
[
"_meta_max_confidence_for_qry_chrom"
]
=
max_confidence_chrom
all_data_poses
=
list
(
indexer
[
"QryContigID"
][
QryContigID
])
for
data_pos
in
list
(
all_data_poses
):
data_val
=
data
[
data_pos
]
filter_res
=
all
([
field_operator
(
data_val
[
field_name
],
field_value
)
for
field_name
,
field_operator_name
,
field_operator
,
field_value_str
,
field_value
in
filters
])
if
filter_res
:
if
data_pos
in
data_poses
:
data_vals
.
append
(
data_val
)
valid_data_poses
.
append
(
data_pos
)
if
len
(
data_vals
)
>
0
:
stats
=
stats_from_data_vals
(
RefContigID
,
QryContigID
,
groups
,
indexer
,
data
,
data_vals
,
valid_data_poses
)
#print "RefContigID %4d QryContigID %6d" % ( RefContigID, QryContigID )
for
data_val
in
data_vals
:
for
stat
in
stats
:
data_val
[
stat
]
=
stats
[
stat
]
dataVal
[
"_meta_num_orientations"
]
=
num_orientations
dataVal
[
"_meta_num_qry_matches"
]
=
num_qry_matches
filter_res
=
all
([
field_operator
(
data_val
[
field_name
],
field_value
)
for
field_name
,
field_operator_name
,
field_operator
,
field_value_str
,
field_value
in
filters
])
if
not
filter_res
:
continue
dataVal
[
"_meta_proportion_query_len_gapped"
]
=
(
qry_gap_len
*
1.0
)
/
dataVal
[
"QryLen"
]
dataVal
[
"_meta_proportion_query_len_no_gap"
]
=
(
qry_no_gap_len
*
1.0
)
/
dataVal
[
"QryLen"
]
dataVal
[
"_meta_proportion_sizes_gapped"
]
=
(
ref_gap_len
*
1.0
)
/
qry_gap_len
dataVal
[
"_meta_proportion_sizes_no_gap"
]
=
(
ref_no_gap_len
*
1.0
)
/
qry_no_gap_len
data
[
data_pos
]
=
dataVal
#print filters
#for field_name, field_operator_name, field_operator, field_value_str, field_value in filters:
# print "filter", field_name, field_operator_name, field_operator, field_value_str, field_value, "val", dataVal[field_name]
# print " ", field_operator( dataVal[field_name], field_value )
filter_res
=
all
([
field_operator
(
dataVal
[
field_name
],
field_value
)
for
field_name
,
field_operator_name
,
field_operator
,
field_value_str
,
field_value
in
filters
])
if
not
filter_res
:
continue
if
linecount
==
0
:
reporter
.
write
(
"
\n
"
.
join
(
headers
[:
-
2
])
+
"
\n
#
\n
"
)
reporter
.
write
(
"# FILTERS:
\n
"
)
for
field_name
,
field_operator_name
,
field_operator
,
field_value_str
,
field_value
in
filters
:
reporter
.
write
(
"# FILTER : %-39s: %3s : %s
\n
"
%
(
field_name
,
field_operator_name
,
field_value_str
)
)
reporter
.
write
(
"
\n\n
"
)
reporter
.
write
(
"#h "
+
"
\t
"
.
join
(
[
"%-39s"
%
(
x
)
for
x
in
valid_fields
[
'names'
]
]
)
+
"
\n
"
)
reporter
.
write
(
"#f "
+
"
\t
"
.
join
(
[
"%-39s"
%
(
valid_fields
[
'types'
][
x
]
)
for
x
in
valid_fields
[
'names'
]
]
)
+
"
\n
"
)
#print " ", " ".join( ["%s %s" % (x, str(dataVal[x])) for x in sorted(dataVal)] )
reporter
.
write
(
"
\t
"
.
join
(
[
str
(
dataVal
[
x
])
for
x
in
valid_fields
[
'names'
]
]
)
+
"
\n
"
)
linecount
+=
1
#print " ", " ".join( ["%s %s" % (x, str(data_val[x])) for x in sorted(data_val)] )
reporter
.
write
(
"
\t
"
.
join
(
[
str
(
data_val
[
x
])
for
x
in
valid_fields
[
'names'
]
]
)
+
"
\n
"
)
print
...
...
om_shared.py
View file @
d26ea393
...
...
@@ -11,7 +11,7 @@ from sqlalchemy.util import KeyedTuple
1 141 1 528400.6 571697.5 10672 54237.5 + 6.65 4M2D2M 1439123.5 21805821 1 "(1,34)(2,34)(3,35)(4,36)(5,37)(6,38)(8,38)(9,39)"
"""
cols_to_index
=
[
"QryContigID"
,
"RefContigID"
,
"Orientation"
]
cols_to_index
=
[
"QryContigID"
,
"RefContigID"
,
"Orientation"
,
"XmapEntryID"
]
group_by
=
[
[
"RefContigID"
,
"QryContigID"
],
[
"RefContigID"
,
"RefStartPos"
],
...
...
@@ -281,6 +281,61 @@ def parse_file(infile, valid_fields):
def
stats_from_data_vals
(
RefContigID
,
QryContigID
,
groups
,
indexer
,
data
,
data_vals
,
valid_data_poses
):
ref_lens
=
[
(
x
[
"RefStartPos"
],
x
[
"RefEndPos"
]
)
for
x
in
data_vals
]
qry_lens
=
[
(
x
[
"QryStartPos"
],
x
[
"QryEndPos"
]
)
for
x
in
data_vals
]
num_qry_matches
=
[]
for
RefContigID_l
in
groups
[
"QryContigID_RefContigID"
][
QryContigID
]:
for
match_pos
in
groups
[
"QryContigID_RefContigID"
][
QryContigID
][
RefContigID_l
]:
if
match_pos
in
valid_data_poses
:
num_qry_matches
.
append
(
RefContigID_l
)
#num_qry_matches = len( groups["QryContigID_RefContigID"][QryContigID] )
num_qry_matches
=
len
(
set
(
num_qry_matches
)
)
num_orientations
=
len
(
set
([
x
[
"Orientation"
]
for
x
in
data_vals
])
)
ref_no_gap_len
=
sum
(
[
max
(
x
)
-
min
(
x
)
for
x
in
ref_lens
]
)
ref_min_coord
=
min
(
[
min
(
x
)
for
x
in
ref_lens
]
)
ref_max_coord
=
max
(
[
max
(
x
)
for
x
in
ref_lens
]
)
ref_gap_len
=
ref_max_coord
-
ref_min_coord
qry_no_gap_len
=
sum
(
[
max
(
x
)
-
min
(
x
)
for
x
in
qry_lens
]
)
qry_min_coord
=
min
(
[
min
(
x
)
for
x
in
qry_lens
]
)
qry_max_coord
=
max
(
[
max
(
x
)
for
x
in
qry_lens
]
)
qry_gap_len
=
qry_max_coord
-
qry_min_coord
XmapEntryIDs
=
groups
[
"QryContigID_XmapEntryID"
][
QryContigID
].
keys
()
Confidences
=
[]
for
XmapEntryID
in
XmapEntryIDs
:
data_pos
=
list
(
indexer
[
"XmapEntryID"
][
XmapEntryID
])[
0
]
if
data_pos
not
in
valid_data_poses
:
continue
Confidences
.
append
(
[
data
[
data_pos
][
"Confidence"
],
data
[
data_pos
][
"RefContigID"
]
]
)
max_confidence
=
max
([
x
[
0
]
for
x
in
Confidences
])
max_confidence_chrom
=
[
x
[
1
]
for
x
in
Confidences
if
x
[
0
]
==
max_confidence
][
0
]
stats
=
{}
stats
[
"_meta_is_max_confidence_for_qry_chrom"
]
=
max_confidence_chrom
==
RefContigID
stats
[
"_meta_len_ref_match_gapped"
]
=
ref_gap_len
stats
[
"_meta_len_ref_match_no_gap"
]
=
ref_no_gap_len
stats
[
"_meta_len_qry_match_gapped"
]
=
qry_gap_len
stats
[
"_meta_len_qry_match_no_gap"
]
=
qry_no_gap_len
stats
[
"_meta_max_confidence_for_qry"
]
=
max_confidence
stats
[
"_meta_max_confidence_for_qry_chrom"
]
=
max_confidence_chrom
stats
[
"_meta_num_orientations"
]
=
num_orientations
stats
[
"_meta_num_qry_matches"
]
=
num_qry_matches
stats
[
"_meta_qry_matches"
]
=
','
.
join
(
[
str
(
x
)
for
x
in
sorted
(
list
(
set
([
x
[
1
]
for
x
in
Confidences
])))
]
)
stats
[
"_meta_proportion_sizes_gapped"
]
=
(
ref_gap_len
*
1.0
)
/
qry_gap_len
stats
[
"_meta_proportion_sizes_no_gap"
]
=
(
ref_no_gap_len
*
1.0
)
/
qry_no_gap_len
return
stats
valid_fields_g
=
{
'data'
:
...
...
@@ -324,7 +379,8 @@ valid_fields_g = {
[
"_meta_max_confidence_for_qry_chrom"
,
'float'
,
float
,
'Which RefContigID is the highest confidence for this QryContigID'
],
[
"_meta_num_orientations"
,
'int'
,
int
,
'Number of orientations for this QryContigID'
],
[
"_meta_num_qry_matches"
,
'int'
,
int
,
'Number of matches for this QryContigID'
],
[
"_meta_num_qry_matches"
,
'int'
,
int
,
'Number of RefContigID matches for this QryContigID'
],
[
"_meta_qry_matches"
,
'string'
,
str
,
'Which chromosomes this QryContigID matches'
],
[
"_meta_proportion_query_len_gapped"
,
'float'
,
float
,
'_meta_len_qry_match_gapped / QryLen'
],
[
"_meta_proportion_query_len_no_gap"
,
'float'
,
float
,
'_meta_len_qry_match_no_gap / QryLen'
],
...
...
run.sh
View file @
d26ea393
...
...
@@ -9,30 +9,34 @@ gff=$filtered.gff3
cols_to_escape
=
info/gff_cols_to_escape.txt
chromosome_names
=
info/S_lycopersicum_chromosomes.2.50.chromosome_names.txt
#
rm $augmented || true
rm
$augmented
||
true
if
[[
!
-f
"
$augmented
"
]]
;
then
./om_augmenter.py
$infile
-g
-c
fi
#
rm $filtered || true
rm
$filtered
||
true
if
[[
!
-f
"
$filtered
"
]]
;
then
./om_filter.py
$augmented
--filter
Confidence:ge:10
--filter
_meta_num_orientations:gt:1
--filter
_meta_is_max_confidence_for_qry_chrom:eq:T
fi
exit
0
#rm $delta || true
if
[[
!
-f
"
$delta
"
]]
;
then
./om_to_delta.py
$filtered
fi
rm
$gff
||
true
#
rm $gff || true
if
[[
!
-f
"
$gff
"
]]
;
then
./om_to_gff.py
--names-from-file
$chromosome_names
--exclude-cols-from-file
$cols_to_escape
$filtered
fi
./om_to_gff.py
--names-from-file
$chromosome_names
--exclude-cols-from-file
$cols_to_escape
${
augmented
}
./om_filter.py
$augmented
--filter
Confidence:ge:10
./om_to_gff.py
--names-from-file
$chromosome_names
--exclude-cols-from-file
$cols_to_escape
${
augmented
}
_Confidence_ge_10.report.tsv
#./om_filter.py $augmented --filter Confidence:ge:10
#./om_filter.py $augmented --filter Confidence:ge:10 --filter _meta_num_qry_matches:gt:1
#./om_to_gff.py ${augmented} --names-from-file $chromosome_names --exclude-cols-from-file $cols_to_escape
#./om_to_gff.py ${augmented}_Confidence_ge_10.report.tsv --names-from-file $chromosome_names --exclude-cols-from-file $cols_to_escape
./om_to_delta.py
${
augmented
}
_Confidence_ge_10__meta_num_qry_matches_gt_1.report.tsv
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment