Commit df5135ad authored by sauloal
Browse files

gff with spaces

parent fbd7b5fa
......@@ -88,7 +88,12 @@ def main(args):
exclude_cols = args.exclude_cols
oufile = infile + ".gff"
source_name = "IrysView"
feature_name = "optical_contig"
feature_name_full = "optical_contig"
feature_name_piece = "optical_contig_piece"
feature_name_full1 = "gene"
feature_name_full2 = "mRNA"
feature_name_piece = "CDS"
if not os.path.exists(infile):
......@@ -138,49 +143,171 @@ def main(args):
data = [ KeyedTuple(x, labels=names)._asdict() for x in data ]
done_QryContigID = {}
for RefContigID in sorted(groups["RefContigID_RefStartPos"]):
RefStartPoses = groups["RefContigID_RefStartPos"][RefContigID]
RefLen = 0
for RefStartPosG in sorted(RefStartPoses):
pos_rows = list(RefStartPoses[RefStartPosG])
pos_rows = list(RefStartPoses[RefStartPosG])
for pos_row_pos in pos_rows:
pos_row = data[pos_row_pos]
pos_row = data[pos_row_pos]
QryContigID = pos_row["QryContigID"]
QryContigID = pos_row["QryContigID"]
RefStartPos = pos_row["RefStartPos"]
RefEndPos = pos_row["RefEndPos" ]
RefLen = pos_row["RefLen" ]
if QryContigID not in done_QryContigID:
done_QryContigID[QryContigID] = {}
QryStartPos = pos_row["QryStartPos"]
QryEndPos = pos_row["QryEndPos" ]
QryLen = pos_row["QryLen" ]
first_time = True
Confidence = pos_row["Confidence" ]
Orientation = pos_row["Orientation"]
HitEnum = pos_row["HitEnum" ]
attributes_keys = [
[ 'ID' , QryContigID ],
[ 'Name', QryContigID ],
[ 'Gap' , HitEnum ],
]
if RefContigID in done_QryContigID[QryContigID]:
continue
for k in sorted(pos_row):
if k in exclude_cols:
continue
attributes_keys.append( [ k.lower(), pos_row[k] ] )
else:
done_QryContigID[QryContigID][RefContigID] = True
qry_rows = list(groups["RefContigID_QryContigID"][RefContigID][QryContigID])
ref_lens = [ ( data[x]["RefStartPos"], data[x]["RefEndPos"] ) for x in qry_rows ]
qry_lens = [ ( data[x]["QryStartPos"], data[x]["QryEndPos"] ) for x in qry_rows ]
ref_no_gap_len = sum( [ max(x)-min(x) for x in ref_lens ] )
ref_min_coord = min( [ min(x) for x in ref_lens ] )
ref_max_coord = max( [ max(x) for x in ref_lens ] )
qry_no_gap_len = sum( [ max(x)-min(x) for x in qry_lens ] )
qry_min_coord = min( [ min(x) for x in qry_lens ] )
qry_max_coord = max( [ max(x) for x in qry_lens ] )
chromosome_name = chromosome_names[RefContigID-1]
attributes_keys_G = [
[ 'ID' , "%s_%d" % ( chromosome_name, QryContigID ) ],
[ 'Name', "%s_%d" % ( chromosome_name, QryContigID ) ]
]
for filter_data in filters:
attributes_keys.append( [ "_meta_filter_"+filter_data[0].lower(), filter_data[1] + '_' + str(filter_data[3]) ] )
attributes_keys_G.append( [ "_meta_filter_"+filter_data[0].lower(), filter_data[1] + '_' + str(filter_data[3]) ] )
attributes_G = ";".join("=".join([k,str(v)]) for k,v in attributes_keys_G)
line_G = [ chromosome_name, source_name, feature_name_full1, int(ref_min_coord), int(ref_max_coord), '.', '.', '.', attributes_G]
fhd.write( "\t".join([str(x) for x in line_G]) + "\n")
attributes = ";".join("=".join([k,str(v)]) for k,v in attributes_keys)
#http://www.ensembl.org/info/website/upload/gff.html
# seqname source feature start end score strand frame attribute
line = [ chromosome_names[RefContigID-1], source_name, feature_name, int(RefStartPos), int(RefEndPos), Confidence, Orientation, '.', attributes]
fhd.write( "\t".join([str(x) for x in line]) + "\n")
attributes_keys_G = [
[ 'ID' , "%s_%d_m" % ( chromosome_name, QryContigID ) ],
[ 'Name' , "%s_%d_m" % ( chromosome_name, QryContigID ) ],
[ 'Parent', "%s_%d" % ( chromosome_name, QryContigID ) ]
]
for filter_data in filters:
attributes_keys_G.append( [ "_meta_filter_"+filter_data[0].lower(), filter_data[1] + '_' + str(filter_data[3]) ] )
attributes_G = ";".join("=".join([k,str(v)]) for k,v in attributes_keys_G)
line_G = [ chromosome_name, source_name, feature_name_full2, int(ref_min_coord), int(ref_max_coord), '.', '.', '.', attributes_G]
fhd.write( "\t".join([str(x) for x in line_G]) + "\n")
qry_num = 1
for qry_row_pos in qry_rows:
qry_row = data[qry_row_pos]
RefStartPos = qry_row["RefStartPos"]
RefEndPos = qry_row["RefEndPos" ]
RefLen = qry_row["RefLen" ]
QryStartPos = qry_row["QryStartPos"]
QryEndPos = qry_row["QryEndPos" ]
QryLen = qry_row["QryLen" ]
Confidence = qry_row["Confidence" ]
Orientation = qry_row["Orientation"]
HitEnum = qry_row["HitEnum" ]
attributes_keys = [
[ 'ID' , "%s_%d_m_%06d_c" % ( chromosome_name, QryContigID, qry_num ) ],
[ 'Name' , "%s_%d_m_%06d_c" % ( chromosome_name, QryContigID, qry_num ) ],
[ 'Parent', "%s_%d_m" % ( chromosome_name, QryContigID ) ],
[ 'Gap' , HitEnum ],
]
for k in sorted(pos_row):
if k in exclude_cols:
continue
attributes_keys.append( [ k.lower(), pos_row[k] ] )
for filter_data in filters:
attributes_keys.append( [ "_meta_filter_"+filter_data[0].lower(), filter_data[1] + '_' + str(filter_data[3]) ] )
attributes = ";".join("=".join([k,str(v)]) for k,v in attributes_keys)
#http://www.ensembl.org/info/website/upload/gff.html
# seqname source feature start end score strand frame attribute
line = [ chromosome_name, source_name, feature_name_piece, int(RefStartPos), int(RefEndPos), Confidence, Orientation, '.', attributes]
fhd.write( "\t".join([str(x) for x in line]) + "\n")
qry_num += 1
#for RefContigID in sorted(groups["RefContigID_RefStartPos"]):
# RefStartPoses = groups["RefContigID_RefStartPos"][RefContigID]
#
# for RefStartPosG in sorted(RefStartPoses):
# pos_rows = list(RefStartPoses[RefStartPosG])
#
# for pos_row_pos in pos_rows:
# pos_row = data[pos_row_pos]
#
# QryContigID = pos_row["QryContigID"]
#
# RefStartPos = pos_row["RefStartPos"]
# RefEndPos = pos_row["RefEndPos" ]
# RefLen = pos_row["RefLen" ]
#
# QryStartPos = pos_row["QryStartPos"]
# QryEndPos = pos_row["QryEndPos" ]
# QryLen = pos_row["QryLen" ]
#
# Confidence = pos_row["Confidence" ]
# Orientation = pos_row["Orientation"]
# HitEnum = pos_row["HitEnum" ]
#
# attributes_keys = [
# [ 'ID' , QryContigID ],
# [ 'Name', QryContigID ],
# [ 'Gap' , HitEnum ],
# ]
#
# for k in sorted(pos_row):
# if k in exclude_cols:
# continue
# attributes_keys.append( [ k.lower(), pos_row[k] ] )
#
# for filter_data in filters:
# attributes_keys.append( [ "_meta_filter_"+filter_data[0].lower(), filter_data[1] + '_' + str(filter_data[3]) ] )
#
# attributes = ";".join("=".join([k,str(v)]) for k,v in attributes_keys)
#
# #http://www.ensembl.org/info/website/upload/gff.html
# # seqname source feature start end score strand frame attribute
# line = [ chromosome_names[RefContigID-1], source_name, feature_name, int(RefStartPos), int(RefEndPos), Confidence, Orientation, '.', attributes]
# fhd.write( "\t".join([str(x) for x in line]) + "\n")
print
......
......@@ -29,3 +29,6 @@ rm $gff || true
if [[ ! -f "$gff" ]]; then
./om_to_gff.py --names-from-file S_lycopersicum_chromosomes.2.50.chromosome_names.txt --exclude-cols-from-file gff_cols_to_escape.txt $filtered
fi
./om_to_gff.py --names-from-file S_lycopersicum_chromosomes.2.50.chromosome_names.txt --exclude-cols-from-file gff_cols_to_escape.txt S_lycopersicum_chromosomes.2.50.BspQI_to_EXP_REFINEFINAL1_xmap.txt.augmented.tsv
./om_to_gff.py --names-from-file S_lycopersicum_chromosomes.2.50.chromosome_names.txt --exclude-cols-from-file gff_cols_to_escape.txt S_lycopersicum_chromosomes.2.50.BspQI_to_EXP_REFINEFINAL1_xmap.txt.augmented.tsv_Confidence_ge_10.0.report.tsv
\ No newline at end of file
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment