Aflitos, Saulo Alves / opticalmapping / Commits

Commit d13faf95 authored Mar 13, 2015 by sauloal

    first

Changes: 4 files
.gitignore  (new file, mode 100644)
*.txt
*.delta
*.tsv
README.md  (new file, mode 100644)
Parser of Bionano Genomics Optical Mapping XMap
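
For context, a minimal sketch of how the parser in om_parser.py (shown below) can be used as a library rather than as a script; the input filename here is hypothetical:

    from om_parser import parse_file

    # parse_file returns the raw rows plus lookup tables keyed by column name
    data, names, seman, types, indexer, groups = parse_file("example.xmap")  # hypothetical path
    print "rows:", len(data)
    print "query maps:", len(indexer["QryContigID"]), "reference contigs:", len(indexer["RefContigID"])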
mummer.sh  (new file, mode 100755)
~assembly/tomato150/programs/mummer/MUMmer3.23/mummerplot S_lycopersicum_chromosomes.2.50.BspQI_to_EXP_REFINEFINAL1_xmap.txt.delta
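
mummer.sh simply feeds mummerplot the .delta file that om_parser.py writes next to its input XMAP. A hedged equivalent driven from Python, using the same paths as the script (they will differ on other machines):

    import os
    import subprocess

    # paths copied from mummer.sh; adjust for the local MUMmer install and input file
    mummerplot = os.path.expanduser("~assembly/tomato150/programs/mummer/MUMmer3.23/mummerplot")
    delta      = "S_lycopersicum_chromosomes.2.50.BspQI_to_EXP_REFINEFINAL1_xmap.txt.delta"
    subprocess.call([mummerplot, delta])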
om_parser.py  (new file, mode 100755)
#!/usr/bin/python
import os
import sys

from collections import defaultdict
from sqlalchemy.util import KeyedTuple

"""
#h XmapEntryID QryContigID RefContigID QryStartPos QryEndPos RefStartPos RefEndPos Orientation Confidence HitEnum QryLen RefLen LabelChannel Alignment
#f int int int float float float float string float string float float int string
1 141 1 528400.6 571697.5 10672 54237.5 + 6.65 4M2D2M 1439123.5 21805821 1 "(1,34)(2,34)(3,35)(4,36)(5,37)(6,38)(8,38)(9,39)"
"""

PLUS  = 0
MINUS = 1


def col_parse_orientation(val):
    assert(val in ("+", "-"))
    return val
    #if val == "+":
    #    return PLUS
    #
    #elif val == "-":
    #    return MINUS


def col_parse_hit_enum(val):
    #4M2D2M
    return val


def col_parse_alignment(val):
    #"(1,34)(2,34)(3,35)(4,36)(5,37)(6,38)(8,38)(9,39)"
    val = val.strip('"')
    return val


# column-specific parsers for the "string" typed columns
col_parsers = {
    "Orientation": col_parse_orientation,
    "HitEnum"    : col_parse_hit_enum,
    "Alignment"  : col_parse_alignment
}

min_confidence = 10  # unused here; main() defines its own MIN_CONFIDENCE

cols_to_index = ["QryContigID", "RefContigID", "Orientation"]

group_by = [
    ["RefContigID", "QryContigID"],
    ["RefContigID", "RefStartPos"],
    ["QryContigID", "RefContigID"],
    ["QryContigID", "Orientation"],
    ["QryContigID", "XmapEntryID"],
    ["XmapEntryID", "Confidence" ],
]


def parse_file(infile):
    # parse an XMAP file: "#h" gives column names, "#f" gives column types,
    # data rows are converted, indexed (indexer) and cross-grouped (groups)
    data    = []
    names   = []
    seman   = {}
    types   = []
    indexer = {}
    groups  = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))

    for cti in cols_to_index:
        indexer[cti] = defaultdict(set)

    with open(infile, 'r') as fhd:
        for line in fhd:
            line = line.strip()

            if len(line) == 0:
                continue

            if line[0] == "#":
                if line[1] == "h":
                    line  = line[3:]
                    names = [x.strip() for x in line.split("\t")]
                    #print "NAMES", names
                    for p, n in enumerate(names):
                        seman[n] = p

                elif line[1] == "f":
                    line  = line[3:]
                    types = [x.strip() for x in line.split("\t")]
                    for tp in xrange(len(types)):
                        t = types[tp]
                        if t == "int":
                            types[tp] = int
                        elif t == "float":
                            types[tp] = float
                        elif t == "string":
                            types[tp] = col_parsers[names[tp]]
                    assert(len(types) == len(names))
                    #print "TYPES", types

                continue

            cols = [x.strip() for x in line.split("\t")]
            vals = [types[p](cols[p]) for p in xrange(len(cols))]

            for ind in indexer:
                indexer[ind][vals[seman[ind]]].add(len(data))

            for grp_from, grp_to in group_by:
                val_from = vals[seman[grp_from]]
                val_to   = vals[seman[grp_to]]
                groups[grp_from + '_' + grp_to][val_from][val_to].add(len(data))

            data.append(vals)

    return data, names, seman, types, indexer, groups


def main(args):
    infile = args[0]

    MIN_CONFIDENCE = 0

    data, names, seman, types, indexer, groups = parse_file(infile)

    print "NAMES", names
    print "TYPES", types
    #print "INDEX", indexer.keys()[0], indexer[indexer.keys()[0]]
    print "DATA" , data[1]

    print "file has %5d maps and %3d chromosomes" % (len(indexer["QryContigID"]), len(indexer["RefContigID"]))

    if True:
        # per-chromosome summary
        for RefContigID in sorted(groups["RefContigID_QryContigID"]):
            print "chromosome %2d has %4d maps" % (RefContigID, len(groups["RefContigID_QryContigID"][RefContigID]))

    if False:
        # per-query summary with best confidence (disabled)
        for QryContigID in sorted(groups["QryContigID_RefContigID"]):
            print "query %5d maps to %2d chromosomes" % (QryContigID, len(groups["QryContigID_RefContigID"][QryContigID]))
            XmapEntryIDs = groups["QryContigID_XmapEntryID"][QryContigID].keys()
            Confidences  = [groups["XmapEntryID_Confidence"][x].keys()[0] for x in XmapEntryIDs]
            print " confidences ", Confidences
            max_confidence = max(Confidences)
            print " max confidence ", max_confidence
            print " max confidence chrom", data[list(groups["XmapEntryID_Confidence"][XmapEntryIDs[Confidences.index(max_confidence)]][max_confidence])[0]][seman["RefContigID"]]

    if False:
        # per-alignment report with derived _meta_ columns (disabled)
        with open(infile + ".report.tsv", "w") as fhd:
            linecount = 0
            for RefContigID in sorted(groups["RefContigID_QryContigID"]):
                QryContigIDs = groups["RefContigID_QryContigID"][RefContigID]

                for QryContigID in sorted(QryContigIDs):
                    dataPoses = list(groups["RefContigID_QryContigID"][RefContigID][QryContigID])
                    dataVals  = [KeyedTuple(data[x], labels=names)._asdict() for x in dataPoses]

                    num_orientations = len(set([x["Orientation"] for x in dataVals]))
                    num_qry_matches  = len(groups["QryContigID_RefContigID"][QryContigID])
                    num_good_quals   = len(set([x["Confidence"] for x in dataVals if x["Confidence"] >= MIN_CONFIDENCE]))

                    #if num_orientations != 1:
                    #if num_qry_matches <= 3:
                    #if num_good_quals >= 1:
                    print "RefContigID %4d QryContigID %6d" % (RefContigID, QryContigID)

                    for dataVal in dataVals:
                        ref_lens       = [(x["RefStartPos"], x["RefEndPos"]) for x in dataVals]
                        ref_no_gap_len = sum([max(x) - min(x) for x in ref_lens])
                        ref_min_coord  = min([min(x) for x in ref_lens])
                        ref_max_coord  = max([max(x) for x in ref_lens])
                        ref_gap_len    = ref_max_coord - ref_min_coord

                        qry_lens       = [(x["QryStartPos"], x["QryEndPos"]) for x in dataVals]
                        qry_no_gap_len = sum([max(x) - min(x) for x in qry_lens])
                        qry_min_coord  = min([min(x) for x in qry_lens])
                        qry_max_coord  = max([max(x) for x in qry_lens])
                        qry_gap_len    = qry_max_coord - qry_min_coord

                        XmapEntryIDs = groups["QryContigID_XmapEntryID"][QryContigID].keys()
                        Confidences  = [groups["XmapEntryID_Confidence"][x].keys()[0] for x in XmapEntryIDs]

                        max_confidence       = max(Confidences)
                        max_confidence_chrom = data[list(groups["XmapEntryID_Confidence"][XmapEntryIDs[Confidences.index(max_confidence)]][max_confidence])[0]][seman["RefContigID"]]

                        dataVal["_meta_num_orientations"]                        = num_orientations
                        dataVal["_meta_num_qry_matches"]                         = num_qry_matches
                        dataVal["_meta_num_good_confidence_%d" % MIN_CONFIDENCE] = num_good_quals
                        dataVal["_meta_len_ref_match_no_gap"]                    = ref_no_gap_len
                        dataVal["_meta_len_ref_match_gapped"]                    = ref_gap_len
                        dataVal["_meta_len_qry_match_no_gap"]                    = qry_no_gap_len
                        dataVal["_meta_len_qry_match_gapped"]                    = qry_gap_len
                        dataVal["_meta_proportion_sizes_no_gap"]                 = (ref_no_gap_len * 1.0) / qry_no_gap_len
                        dataVal["_meta_proportion_sizes_gapped"]                 = (ref_gap_len    * 1.0) / qry_gap_len
                        dataVal["_meta_proportion_query_len_no_gap"]             = (qry_no_gap_len * 1.0) / dataVal["QryLen"]
                        dataVal["_meta_proportion_query_len_gapped"]             = (qry_gap_len    * 1.0) / dataVal["QryLen"]
                        dataVal["_meta_max_confidence_for_qry"]                  = max_confidence
                        dataVal["_meta_max_confidence_for_qry_chrom"]            = max_confidence_chrom
                        dataVal["_meta_is_max_confidence_for_qry_chrom"]         = max_confidence_chrom == RefContigID

                        if linecount == 0:
                            fhd.write("\t".join(sorted(dataVal)) + "\n")

                        print " ", " ".join(["%s %s" % (x, str(dataVal[x])) for x in sorted(dataVal)])
                        fhd.write("\t".join([str(dataVal[x]) for x in sorted(dataVal)]) + "\n")

                        linecount += 1
                    print

    if True:
        # write a NUCMER-style .delta next to the input (the file mummer.sh passes to mummerplot)
        with open(infile + ".delta", "w") as fhd:
            linecount = 0
            fhd.write("/home/assembly/nobackup/mummer/MUMmer3.23/1502/solanum_lycopersicum_heinz/SL2.40ch12.fa /home/assembly/nobackup/mummer/MUMmer3.23/1502/solanum_pennellii_scaffold/final.assembly.fasta\n")
            fhd.write("NUCMER\n")

            sum_ref_len = 0
            sum_qry_len = 0

            done_QryContigID = {}

            for RefContigID in sorted(groups["RefContigID_RefStartPos"]):
                RefStartPoses = groups["RefContigID_RefStartPos"][RefContigID]

                for RefStartPosG in sorted(RefStartPoses):
                    pos_rows = list(RefStartPoses[RefStartPosG])

                    for pos_row_pos in pos_rows:
                        pos_row = data[pos_row_pos]
                        pos_row = KeyedTuple(pos_row, labels=names)._asdict()

                        Confidence_row = pos_row["Confidence"]

                        if (Confidence_row < MIN_CONFIDENCE):
                            continue

                        QryContigID = pos_row["QryContigID"]

                        if QryContigID not in done_QryContigID:
                            done_QryContigID[QryContigID] = {}

                        first_time = True
                        if RefContigID in done_QryContigID[QryContigID]:
                            continue
                        else:
                            if len(done_QryContigID[QryContigID]) == 0:
                                done_QryContigID[QryContigID][RefContigID] = sum_qry_len
                            else:
                                done_QryContigID[QryContigID][RefContigID] = done_QryContigID[QryContigID][done_QryContigID[QryContigID].keys()[0]]
                                first_time = False

                        sum_qry_len_local = done_QryContigID[QryContigID][RefContigID]

                        line_count = 0
                        qry_rows   = list(groups["RefContigID_QryContigID"][RefContigID][QryContigID])

                        for qry_row_pos in qry_rows:
                            qry_row = data[qry_row_pos]
                            qry_row = KeyedTuple(qry_row, labels=names)._asdict()

                            Confidence_qry = qry_row["Confidence"]

                            if (Confidence_qry < MIN_CONFIDENCE):
                                continue

                            RefStartPos = qry_row["RefStartPos"] + sum_ref_len
                            RefEndPos   = qry_row["RefEndPos"  ] + sum_ref_len
                            RefLen      = qry_row["RefLen"     ]
                            QryStartPos = qry_row["QryStartPos"] + sum_qry_len_local
                            QryEndPos   = qry_row["QryEndPos"  ] + sum_qry_len_local
                            QryLen      = qry_row["QryLen"     ]

                            num_errors  = 0
                            sim_errors  = 0
                            stop_codons = 0

                            header = [int(RefContigID), int(QryContigID), int(RefLen), int(QryLen)]
                            line   = [int(RefStartPos), int(RefEndPos), int(QryStartPos), int(QryEndPos), num_errors, sim_errors, stop_codons]

                            if line_count == 0:
                                fhd.write(">" + " ".join([str(x) for x in header]) + "\n")

                            fhd.write(" ".join([str(x) for x in line]) + "\n0\n")

                            line_count += 1

                        if first_time:
                            sum_qry_len += QryLen

                sum_ref_len += RefLen
                #break


if __name__ == '__main__':
    if len(sys.argv) == 1:
        print "no arguments given"
        sys.exit(1)

    main(sys.argv[1:])

"""
# $ cd D:\Plextor\data\Acquisitie\BioNanoGenomics\MyLycopersicumWorkspace_31022015\Imports; C:\Program Files\BioNano Genomics\RefAligner\WindowsRefAligner.exe -f -ref D:\Plextor\data\Acquisitie\BioNanoGenomics\MyLycopersicumWorkspace_31022015\Imports\S_lycopersicum_chromosomes.2.50.BspQI-BbvCI.cmap -i D:\Plextor\data\Acquisitie\BioNanoGenomics\MyLycopersicumWorkspace_31022015\Imports\EXP_REFINEFINAL1.cmap -o S_lycopersicum_chromosomes.2.50.BspQI-BbvCI_to_EXP_REFINEFINAL1 -endoutlier 1e-2 -outlier 1e-4 -extend 1 -FN 0.08 -FP 0.8 -sf 0.2 -sd 0 -sr 0.02 -res 2.9 -resSD 0.7 -mres 2.0 -A 5 -biaswt 0 -M 1 -Mfast 0 -maxmem 2 -T 1e-6 -stdout -stderr
# r3498 $Header: http://svn.bnm.local:81/svn/informatics/RefAligner/branches/3480/RefAligner.cpp 3470 2014-12-17 19:29:21Z tanantharaman $
# FLAGS: USE_SSE=0 USE_AVX=0 USE_MIC=0 USE_PFLOAT=1 USE_RFLOAT=1 DEBUG=1 VERB=1
# XMAP File Version: 0.2
# Label Channels: 1
# Reference Maps From: S_lycopersicum_chromosomes.2.50.BspQI-BbvCI_to_EXP_REFINEFINAL1_r.cmap
# Query Maps From: S_lycopersicum_chromosomes.2.50.BspQI-BbvCI_to_EXP_REFINEFINAL1_q.cmap
#h XmapEntryID QryContigID RefContigID QryStartPos QryEndPos RefStartPos RefEndPos Orientation Confidence HitEnum QryLen RefLen LabelChannel Alignment
#f int int int float float float float string float string float float int string
1 141 1 528400.6 571697.5 10672 54237.5 + 6.65 4M2D2M 1439123.5 21805821 1 "(1,34)(2,34)(3,35)(4,36)(5,37)(6,38)(8,38)(9,39)"
2 174 1 21236.5 1568390 10672 1553561 + 79.35 2M3D1M1D1M1D4M1I2M1D2M1D1M2I2D9M3I3M1D6M1D2M2D1M1D6M1D1M1D1M2D2M2D1M1I1D1M1D5M2D4M2D1M2D2M1D2M1D3M1D1M1D2M3I3D1M1D1M3D2M3D1M2I1D1M2D1M1D1M1I2D3M2I1M1D2M1D1M1D1M2I3D3M3D1M2D1M1D1M1D5M2D12M 1568410 21805821 1 "(1,2)(2,2)(3,3)(6,4)(7,4)(9,5)(11,6)(12,7)(13,8)(14,9)(15,11)(16,12)(18,13)(19,14)(20,15)(21,15)(24,18)(25,19)(26,20)(27,21)(28,22)(29,23)(30,24)(31,25)(32,26)(33,30)(34,31)(35,32)(37,33)(38,34)(39,35)(40,36)(41,37)(42,38)(44,39)(45,40)(47,41)(48,41)(50,42)(51,43)(52,44)(53,45)(54,46)(55,47)(57,48)(59,49)(60,50)(62,50)(63,51)(66,52)(68,54)(69,55)(70,55)(71,56)(72,57)(73,58)(74,59)(76,60)(77,60)(78,61)(79,62)(80,63)(82,64)(83,64)(86,65)(87,66)(89,67)(90,68)(92,69)(93,70)(94,71)(95,72)(96,72)(98,73)(99,74)(103,78)(105,79)(109,80)(110,81)(111,82)(114,82)(116,85)(119,86)(120,87)(121,87)(124,89)(125,90)(126,91)(127,94)(128,95)(129,95)(130,96)(132,97)(134,98)(138,101)(139,102)(140,103)(143,104)(144,104)(146,105)(147,105)(149,106)(151,107)(152,108)(153,109)(154,110)(155,111)(158,112)(159,113)(160,114)(161,115)(162,116)(163,117)(164,118)(165,119)(166,120)(167,121)(168,122)(169,123)"
"""