Commit 01da4142 authored by sauloal's avatar sauloal
Browse files

to gff

parent 76c3b086
......@@ -2,5 +2,6 @@
*.delta
*.tsv
*.pyc
*.gff
.~*
out*
......@@ -7,7 +7,7 @@ from om_shared import *
def parse_args(args):
parser = argparse.ArgumentParser(description="Bionano Genomics MAP parser")
parser = argparse.ArgumentParser(description="Bionano Genomics augmented MAP filter")
parser.add_argument( 'infile', help="AUGMENTED file" )
parser.add_argument( '-l' , '--list' , action='store_true' , help="List Values" )
parser.add_argument( '-f' , '--filter' , action='append' , help="Filters [Field:Function(lt, le, eq, ne, ge, gt):Value]" )
......
......@@ -15,6 +15,7 @@ cols_to_index = ["QryContigID", "RefContigID", "Orientation"]
group_by = [
["RefContigID", "QryContigID"],
["RefContigID", "RefStartPos"],
["RefContigID", "RefEndPos" ],
["QryContigID", "RefContigID"],
["QryContigID", "Orientation"],
["QryContigID", "XmapEntryID"],
......
......@@ -7,7 +7,7 @@ from om_shared import *
def parse_args(args):
parser = argparse.ArgumentParser(description="Bionano Genomics MAP parser")
parser = argparse.ArgumentParser(description="Bionano Genomics augmented MAP to Mummerplot Delta converter")
parser.add_argument( 'infile', help="AUGMENTED file" )
args = parser.parse_args(args=args)
......
#!/usr/bin/python
import os
import sys
from om_shared import *
def parse_args(args):
parser = argparse.ArgumentParser(description="Bionano Genomics augmented MAP to GFF converter")
parser.add_argument( 'infile', help="AUGMENTED file" )
parser.add_argument( '-n' , '--names', action='store', help="Names of reference chromosome. Eg: Ch0|Ch1|Ch3 Ch0,Ch1,Ch2 Ch0:Ch1:Ch2" )
parser.add_argument( '-f' , '--names-from-file', action='store', help="File containing names of reference chromosome. One per line" )
parser.add_argument( '-s' , '--sep' , '--separator', default=",", help="Separator for chromosome names. Eg: | , :" )
##genome-build source buildName
##species NCBI_Taxonomy_URI
args = parser.parse_args(args=args)
if args.names is None and args.names_from_file is None:
print "either --names or --names-from-file has to be defined"
sys.exit(1)
if args.names_from_file is not None:
if not os.path.exists(args.names_from_file):
print "names file %s does not exists" % args.names_from_file
sys.exit(1)
if os.path.isdir(args.names_from_file):
print "names file %s is a folder" % args.names_from_file
sys.exit(1)
args.names = []
with open(args.names_from_file, 'r') as fhd:
for line in fhd:
line = line.strip()
if len(line) == 0:
continue
if line[0] == "#":
continue
args.names.append(line)
elif args.names is not None:
args.names = args.names.split( args.sep )
return args
def main(args):
valid_fields = gen_valid_fields(valid_fields_g)
infile = args.infile
chromosome_names = args.names
oufile = infile + ".gff"
source_name = "IrysView"
feature_name = "optical_contig"
if not os.path.exists(infile):
print "input file %s does not exists" % infile
sys.exit(1)
if os.path.isdir(infile):
print "input file %s is a folder" % infile
sys.exit(1)
print "saving to %s" % oufile
data, headers, names, seman, types, indexer, groups, ref_maps_from, query_maps_from = parse_file(infile, valid_fields)
print "NAMES" , names
#print "HEADERS", "\n".join( headers )
print "TYPES" , types
#print "DATA" , data[1]
#print "INDEX", indexer.keys()[0], indexer[indexer.keys()[0]]
print "file has %5d maps and %3d chromosomes" % (len(indexer["QryContigID"]), len(indexer["RefContigID"]))
assert len(indexer["RefContigID"] ) <= len(chromosome_names), "number of chromosome differ from %d to %d\n%s\n%s" % (len(indexer["RefContigID"] ), len(chromosome_names), indexer["RefContigID"].keys() , chromosome_names)
assert max(indexer["RefContigID"].keys()) <= len(chromosome_names), "number of chromosome differ from %d to %d" % (max(indexer["RefContigID"].keys()), len(chromosome_names))
print chromosome_names
print "CREATING GFF: ", oufile
with open(oufile, "w") as fhd:
linecount = 0
#fhd.write("/home/assembly/nobackup/mummer/MUMmer3.23/1502/solanum_lycopersicum_heinz/SL2.40ch12.fa /home/assembly/nobackup/mummer/MUMmer3.23/1502/solanum_pennellii_scaffold/final.assembly.fasta\n")
fhd.write("##gff-version 3\n")
for RefContigID in sorted(groups["RefContigID_RefStartPos"]):
RefStartPoses = groups["RefContigID_RefStartPos"][RefContigID]
RefEndPoses = groups["RefContigID_RefEndPos" ][RefContigID]
ref_min_pos = min( [min(RefEndPoses), min(RefStartPoses)] )
ref_max_pos = max( [max(RefEndPoses), max(RefStartPoses)] )
fhd.write("##sequence-region %s %d %d\n" % (chromosome_names[RefContigID-1], int(ref_min_pos), int(ref_max_pos)))
data = [ KeyedTuple(x, labels=names)._asdict() for x in data ]
for RefContigID in sorted(groups["RefContigID_RefStartPos"]):
RefStartPoses = groups["RefContigID_RefStartPos"][RefContigID]
for RefStartPosG in sorted(RefStartPoses):
pos_rows = list(RefStartPoses[RefStartPosG])
for pos_row_pos in pos_rows:
pos_row = data[pos_row_pos]
QryContigID = pos_row["QryContigID"]
RefStartPos = pos_row["RefStartPos"]
RefEndPos = pos_row["RefEndPos" ]
RefLen = pos_row["RefLen" ]
QryStartPos = pos_row["QryStartPos"]
QryEndPos = pos_row["QryEndPos" ]
QryLen = pos_row["QryLen" ]
Confidence = pos_row["Confidence" ]
Orientation = pos_row["Orientation"]
HitEnum = pos_row["HitEnum" ]
attributes_keys = [
[ 'ID' , QryContigID ],
[ 'Name', QryContigID ],
[ 'Gap' , HitEnum ],
]
for k in sorted(pos_row):
attributes_keys.append( [ k.lower(), pos_row[k] ] )
attributes = ";".join("=".join([k,str(v)]) for k,v in attributes_keys)
#http://www.ensembl.org/info/website/upload/gff.html
# seqname source feature start end score strand frame attribute
line = [ chromosome_names[RefContigID-1], source_name, feature_name, int(RefStartPos), int(RefEndPos), Confidence, Orientation, '.', attributes]
fhd.write( "\t".join([str(x) for x in line]) + "\n")
print
if __name__ == '__main__':
if len(sys.argv) ==1:
print "no arguments given"
sys.exit(1)
args = parse_args(sys.argv[1:])
main(args)
\ No newline at end of file
set -xeu
infile=S_lycopersicum_chromosomes.2.50.BspQI_to_EXP_REFINEFINAL1_xmap.txt
augmented=$infile.augmented.tsv
filtered=${augmented}_Confidence_ge_10.0__meta_num_orientations_gt_1__meta_is_max_confidence_for_qry_chrom_eq_True.report.tsv
delta=$filtered.delta
gff=$filtered.gff
if [[ ! -f "$augmented" ]]; then
./om_augmenter.py $infile -g -c
fi
if [[ ! -f "$filtered" ]]; then
./om_filter.py $augmented --filter Confidence:ge:10 --filter _meta_num_orientations:gt:1 --filter _meta_is_max_confidence_for_qry_chrom:eq:T
fi
rm *.delta
if [[ ! -f "$delta" ]]; then
./om_to_delta.py $filtered
fi
rm *.gff || true
if [[ ! -f "$gff" ]]; then
./om_to_gff.py $filtered
fi
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment