Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Aflitos, Saulo Alves
opticalmapping
Commits
e3d73b88
Commit
e3d73b88
authored
Mar 17, 2015
by
sauloal
Browse files
agp 2 gff
parent
cb154428
Changes
3
Hide whitespace changes
Inline
Side-by-side
agp_to_gff.py
0 → 100755
View file @
e3d73b88
#!/usr/bin/python
import
os
import
sys
from
sqlalchemy.util
import
KeyedTuple
flags_gap
=
(
'N'
,
'U'
)
col_names_shared
=
[
'object_id'
,
'object_beg'
,
'object_end'
,
'part_number'
,
'component_type'
]
col_names_gap_Y
=
col_names_shared
+
[
'gap_length'
,
'gap_type'
,
'linkage'
,
'linkage_evidence'
]
col_names_gap_N
=
col_names_shared
+
[
'gap_length'
,
'gap_type'
,
'linkage'
]
col_names_contig
=
col_names_shared
+
[
'component_id'
,
'component_beg'
,
'component_end'
,
'orientation'
]
component_types
=
{
'A'
:
'Active Finishing'
,
'D'
:
'Draft HTG (often phase1 and phase2 are called Draft, whether or not they have the draft keyword)'
,
'F'
:
'Finished HTG (phase3)'
,
'G'
:
'Whole Genome Finishing'
,
'O'
:
'Other sequence (typically means no HTG keyword)'
,
'P'
:
'Pre Draft'
,
'W'
:
'WGS contig'
,
'N'
:
'gap with specified size'
,
'U'
:
'gap of unknown size, defaulting to 100 bases'
}
type_mapper
=
{
'A'
:
'contig'
,
'D'
:
'contig'
,
'F'
:
'scaffold'
,
'G'
:
'scaffold'
,
'O'
:
'contig'
,
'P'
:
'contig'
,
'W'
:
'contig'
,
'N'
:
'gap'
,
'U'
:
'gap'
}
gff_cols
=
[
'object_id'
,
'source_name'
,
'type'
,
'object_beg'
,
'object_end'
,
'score'
,
'orientation'
,
'phase'
]
source_name
=
'agp'
def
main
(
args
):
in_agp
=
args
[
0
]
ou_gff
=
in_agp
+
".gff3"
print
"SAVING TO"
,
ou_gff
with
open
(
in_agp
,
'r'
)
as
fhd_in
:
with
open
(
ou_gff
,
'w'
)
as
fhd_ou
:
fhd_ou
.
write
(
"##gff-version 3
\n
"
)
fhd_ou
.
write
(
"#infile: %s
\n
"
%
in_agp
)
for
line_in
in
fhd_in
:
line_in
=
line_in
.
strip
()
if
len
(
line_in
)
==
0
:
continue
if
line_in
[
0
]
==
"#"
:
continue
cols_in
=
line_in
.
split
(
"
\t
"
)
if
cols_in
[
4
]
in
flags_gap
:
if
cols_in
[
7
]
==
'no'
:
assert
len
(
cols_in
)
==
len
(
col_names_gap_N
),
"%d vs %d: %s"
%
(
len
(
cols_in
),
len
(
col_names_gap_N
),
" "
.
join
(
cols_in
)
)
cols_in
=
KeyedTuple
(
cols_in
,
labels
=
col_names_gap_N
).
_asdict
()
elif
cols_in
[
7
]
==
'yes'
:
assert
len
(
cols_in
)
==
len
(
col_names_gap_Y
),
"%d vs %d: %s"
%
(
len
(
cols_in
),
len
(
col_names_gap_Y
),
" "
.
join
(
cols_in
)
)
cols_in
=
KeyedTuple
(
cols_in
,
labels
=
col_names_gap_Y
).
_asdict
()
else
:
print
"unknown gap linkage evidence:"
,
cols_in
[
7
]
sys
.
exit
(
1
)
cols_in
[
'orientation'
]
=
'.'
else
:
assert
len
(
cols_in
)
==
len
(
col_names_contig
),
"%d vs %d: %s"
%
(
len
(
cols_in
),
len
(
col_names_contig
),
" "
.
join
(
cols_in
)
)
cols_in
=
KeyedTuple
(
cols_in
,
labels
=
col_names_contig
).
_asdict
()
cols_in
[
'component_type_desc'
]
=
component_types
[
cols_in
[
'component_type'
]]
cols_in
[
'type'
]
=
type_mapper
[
cols_in
[
'component_type'
]]
cols_in
[
'source_name'
]
=
source_name
cols_in
[
'score'
]
=
'.'
cols_in
[
'phase'
]
=
'.'
print
cols_in
row_id
=
cols_in
[
'object_id'
]
+
'_'
+
cols_in
[
'part_number'
]
fhd_ou
.
write
(
"
\t
"
.
join
(
[
cols_in
[
x
]
for
x
in
gff_cols
]
)
)
fhd_ou
.
write
(
"
\t
ID=%s;Name=%s;"
%
(
row_id
,
row_id
)
)
fhd_ou
.
write
(
";"
.
join
(
[
"="
.
join
([
x
,
cols_in
[
x
]])
for
x
in
sorted
(
cols_in
)
if
x
not
in
gff_cols
]
)
)
fhd_ou
.
write
(
"
\n
"
)
#1 2 3 4 5 6 7 8 9
#SL2.50ch00 1 2191949 1 W SL2.40sc05082 1 2191949 0
#SL2.50ch00 2191950 2192049 2 U 100 contig no
#SL2.50ch00 21801621 21801720 6260 U 100 contig no
#SL2.50ch00 21801721 21803721 6261 W SL2.40sc03931 1 2001 0
#SL2.50ch00 21803722 21803821 6262 U 100 contig no
#SL2.50ch00 21803822 21805821 6263 W SL2.40sc04627 1 2000 0
#SL2.50ch01 1 32987597 1 F SL2.40sc04133 1 32987597 -
#SL2.50ch01 32987598 35267597 2 N 2280000 contig no
#SL2.50ch01 35267598 36990194 3 F SL2.40sc04191 1 1722597 +
#http://www.ncbi.nlm.nih.gov/projects/genome/assembly/agp/AGP_Specification.shtml
#1 object This is the identifier for the object being assembled. This can be a
#chromosome, scaffold or contig. If an accession.version identifier is not used
#to describe the object the naming convention is to precede chromosome numbers
#with 'chr' (e.g. chr1) and linkage group numbers with 'LG' (e.g. LG3). Contigs
#or scaffolds may have any identifier that is unique within the assembly.
#2 object_beg The starting coordinates of the component/gap on the object in
#column 1. These are the location in the object's coordinate system, not the
#component's.
#3 object_end The ending coordinates of the component/gap on the object in
#column 1. These are the location in the object's coordinate system, not the
#component's.
#4 part_number The line count for the components/gaps that make up the object
#described in column 1.
#5 component_type The sequencing status of the component. These typically
#correspond to keywords in the International Sequence Database
#(GenBank/EMBL/DDBJ) submission. Current acceptable values are:
#The sequencing status of the component. These typically correspond to keywords
#in the International Sequence Database (GenBank/EMBL/DDBJ) submission. Current
#acceptable values are:
#A Active Finishing
#D Draft HTG (often phase1 and phase2 are called Draft, whether or not they have the draft keyword).
#F Finished HTG (phase3)
#G Whole Genome Finishing
#O Other sequence (typically means no HTG keyword)
#P Pre Draft
#W WGS contig
#N gap with specified size
#U gap of unknown size, defaulting to 100 bases.
#6a component_id If column 5 not equal to N or U: This is a unique identifier
#for the sequence component contributing to the object described in column 1.
#Ideally this will be a valid accession.version identifier as assigned by
#GenBank/EMBL/DDBJ. If the sequence has not been submitted to a public
#repository yet a local identifier should be used.
#6b gap_length If column 5 equal to N or U: This column represents the length of
#the gap. N type gaps can be of any length. A length of 100 must be used for all
#U type gaps.
#7a component_beg If column 5 not equal to N or U: This column specifies the
#beginning of the part of the component sequence that contributes to the object
#in column 1 (in component coordinates).
#7b gap_type
# If column 5 equal to N or U: This column specifies the gap type.
# Accepted values:
# scaffold: a gap between two sequence contigs in a scaffold (superscaffold or ultra-scaffold).
# contig: an unspanned gap between two sequence contigs.
# centromere: a gap inserted for the centromere.
# short_arm: a gap inserted at the start of an acrocentric chromosome.
# heterochromatin: a gap inserted for an especially large region of heterochromatic sequence (may also include the centromere).
# telomere: a gap inserted for the telomere.
# repeat: an unresolvable repeat.
#8a component_end If column 5 not equal to N or U: This column specifies the end
#of the part of the component that contributes to the object in column 1 (in
#component coordinates).
#8b linkage If column 5 equal to N or U: This column indicates if there is evidence of linkage between the adjacent lines.
# Values: yes, no
#9a orientation
# If column 5 not equal to N or U: This column specifies the orientation of the component relative to the object in column 1.
# Values:
# + plus
# - minus
# ? unknown
# 0 (zero) unknown (deprecated)
# na irrelevant
# By default, components with unknown orientation (?, 0 or na) are treated as if they had + orientation.
#9b Linkage evidence If column 5 equal to N or U: This specifies the type of
#evidence used to assert linkage (as indicated in column 8b).
# Accepted values:
# na used when no linkage is being asserted (column 8b is 'no')
# paired-ends paired sequences from the two ends of a DNA fragment.
# align_genus alignment to a reference genome within the same genus.
# align_xgenus alignment to a reference genome within another genus.
# align_trnscpt alignment to a transcript from the same species.
# within_clone sequence on both sides of the gap is derived from the same
# clone, but the gap is not spanned by paired-ends. The adjacent sequence
# contigs have unknown order and orientation.
# clone_contig linkage is provided by a clone contig in the tiling path
# (TPF). For example, a gap where there is a known clone, but there is
# not yet sequence for that clone.
# map linkage asserted using a non-sequence based map such as RH, linkage, fingerprint or optical.
# strobe strobe sequencing (PacBio).
# unspecified used only when converting old AGPs that lack a field for linkage evidence into the new format.
# If there are multiple lines of evidence to support linkage, all can be
# listed using a ';' delimiter (e.g. paired-ends;align_xgenus).
if
__name__
==
'__main__'
:
main
(
sys
.
argv
[
1
:])
\ No newline at end of file
om_to_gff.py
View file @
e3d73b88
...
...
@@ -86,7 +86,7 @@ def main(args):
infile
=
args
.
infile
chromosome_names
=
args
.
names
exclude_cols
=
args
.
exclude_cols
oufile
=
infile
+
".gff"
oufile
=
infile
+
".gff
3
"
source_name
=
"IrysView"
feature_name_full
=
"optical_contig"
feature_name_piece
=
"optical_contig_piece"
...
...
run.sh
View file @
e3d73b88
...
...
@@ -4,7 +4,7 @@ infile=data/S_lycopersicum_chromosomes.2.50.BspQI_to_EXP_REFINEFINAL1_xmap.txt
augmented
=
$infile
.augmented.tsv
filtered
=
${
augmented
}
_Confidence_ge_10.0__meta_num_orientations_gt_1__meta_is_max_confidence_for_qry_chrom_eq_True.report.tsv
delta
=
$filtered
.delta
gff
=
$filtered
.gff
gff
=
$filtered
.gff
3
cols_to_escape
=
info/gff_cols_to_escape.txt
chromosome_names
=
S_lycopersicum_chromosomes.2.50.chromosome_names.txt
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment