Commit fbd7b5fa authored by sauloal's avatar sauloal
Browse files

more info in gff

parent 01da4142
...@@ -36,7 +36,7 @@ def main(args): ...@@ -36,7 +36,7 @@ def main(args):
print "saving to %s" % oufile print "saving to %s" % oufile
data, headers, names, seman, types, indexer, groups, ref_maps_from, query_maps_from = parse_file(infile, valid_fields) data, headers, names, seman, types, indexer, groups, ref_maps_from, query_maps_from, filters_csv = parse_file(infile, valid_fields)
print "NAMES" , names print "NAMES" , names
print "TYPES" , types print "TYPES" , types
......
...@@ -34,7 +34,7 @@ def main(args): ...@@ -34,7 +34,7 @@ def main(args):
sys.exit(1) sys.exit(1)
filters = gen_filter(args, valid_fields) filters = gen_filter(args.filter, valid_fields)
oufile = infile oufile = infile
for field_name, field_operator_name, field_operator, field_value in filters: for field_name, field_operator_name, field_operator, field_value in filters:
...@@ -42,7 +42,7 @@ def main(args): ...@@ -42,7 +42,7 @@ def main(args):
print "saving to %s" % oufile print "saving to %s" % oufile
data, headers, names, seman, types, indexer, groups, ref_maps_from, query_maps_from = parse_file(infile, valid_fields) data, headers, names, seman, types, indexer, groups, ref_maps_from, query_maps_from, filters_csv = parse_file(infile, valid_fields)
print "NAMES" , names print "NAMES" , names
#print "HEADERS", "\n".join( headers ) #print "HEADERS", "\n".join( headers )
...@@ -156,7 +156,7 @@ def main(args): ...@@ -156,7 +156,7 @@ def main(args):
reporter.write("# FILTERS:\n") reporter.write("# FILTERS:\n")
for field_name, field_operator_name, field_operator, field_value in filters: for field_name, field_operator_name, field_operator, field_value in filters:
oufile += '_' + field_name + '_' + field_operator_name + '_' + str(field_value) oufile += '_' + field_name + '_' + field_operator_name + '_' + str(field_value)
reporter.write( "# %-39s: %3s : %s\n" % (field_name, field_operator_name, str(field_value) ) ) reporter.write( "# FILTER : %-39s: %3s : %s\n" % (field_name, field_operator_name, str(field_value) ) )
reporter.write( "\n\n" ) reporter.write( "\n\n" )
reporter.write("#h " + "\t".join( [ "%-39s" % ( x ) for x in valid_fields['names' ] ] ) + "\n") reporter.write("#h " + "\t".join( [ "%-39s" % ( x ) for x in valid_fields['names' ] ] ) + "\n")
......
...@@ -134,11 +134,11 @@ def gen_valid_fields(valid_fields): ...@@ -134,11 +134,11 @@ def gen_valid_fields(valid_fields):
return valid_fields return valid_fields
def gen_filter(args, valid_fields): def gen_filter(filter_datas, valid_fields):
filters = [] filters = []
if args.filter is not None: if filter_datas is not None:
for filter_data in args.filter: for filter_data in filter_datas:
filter_cols = filter_data.split(":") filter_cols = filter_data.split(":")
if len(filter_cols) != 3: if len(filter_cols) != 3:
...@@ -166,6 +166,7 @@ def parse_file(infile, valid_fields): ...@@ -166,6 +166,7 @@ def parse_file(infile, valid_fields):
types = [] types = []
indexer = {} indexer = {}
headers = [] headers = []
filters = []
groups = defaultdict(lambda: defaultdict(lambda: defaultdict(set))) groups = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
ref_maps_from = "" ref_maps_from = ""
query_maps_from = "" query_maps_from = ""
...@@ -184,10 +185,18 @@ def parse_file(infile, valid_fields): ...@@ -184,10 +185,18 @@ def parse_file(infile, valid_fields):
if line[0] == "#": if line[0] == "#":
headers.append(line) headers.append(line)
if len(line) == 1: if len(line) == 1:
continue continue
if line[1] == "h": if line[2:10] == "FILTER :":
print "PARSING FILTER"
#"Confidence : ge : 10.0"
cols = [ x.strip() for x in line[2:].split(":") ]
filter_line = ":".join(cols[1:])
filters.append( filter_line )
elif line[1] == "h":
line = line[3:] line = line[3:]
names = [x.strip() for x in line.split("\t")] names = [x.strip() for x in line.split("\t")]
#print "NAMES", names #print "NAMES", names
...@@ -235,7 +244,7 @@ def parse_file(infile, valid_fields): ...@@ -235,7 +244,7 @@ def parse_file(infile, valid_fields):
data.append(vals) data.append(vals)
return data, headers, names, seman, types, indexer, groups, ref_maps_from, query_maps_from return data, headers, names, seman, types, indexer, groups, ref_maps_from, query_maps_from, filters
......
...@@ -29,7 +29,7 @@ def main(args): ...@@ -29,7 +29,7 @@ def main(args):
print "saving to %s" % oufile print "saving to %s" % oufile
data, headers, names, seman, types, indexer, groups, ref_maps_from, query_maps_from = parse_file(infile, valid_fields) data, headers, names, seman, types, indexer, groups, ref_maps_from, query_maps_from, filters_csv = parse_file(infile, valid_fields)
print "NAMES" , names print "NAMES" , names
#print "HEADERS", "\n".join( headers ) #print "HEADERS", "\n".join( headers )
......
...@@ -8,10 +8,12 @@ from om_shared import * ...@@ -8,10 +8,12 @@ from om_shared import *
def parse_args(args): def parse_args(args):
parser = argparse.ArgumentParser(description="Bionano Genomics augmented MAP to GFF converter") parser = argparse.ArgumentParser(description="Bionano Genomics augmented MAP to GFF converter")
parser.add_argument( 'infile', help="AUGMENTED file" ) parser.add_argument( 'infile', help="AUGMENTED file" )
parser.add_argument( '-n' , '--names', action='store', help="Names of reference chromosome. Eg: Ch0|Ch1|Ch3 Ch0,Ch1,Ch2 Ch0:Ch1:Ch2" ) parser.add_argument( '-x' , '--exclude-cols', action='store', help="Exclude column from GFF" )
parser.add_argument( '-f' , '--names-from-file', action='store', help="File containing names of reference chromosome. One per line" ) parser.add_argument( '-z' , '--exclude-cols-from-file', action='store', help="File containing names of columns to exclude from GFF" )
parser.add_argument( '-s' , '--sep' , '--separator', default=",", help="Separator for chromosome names. Eg: | , :" ) parser.add_argument( '-n' , '--names', action='store', help="Names of reference chromosome. Eg: Ch0|Ch1|Ch3 Ch0,Ch1,Ch2 Ch0:Ch1:Ch2" )
parser.add_argument( '-f' , '--names-from-file', action='store', help="File containing names of reference chromosome. One per line" )
parser.add_argument( '-s' , '--sep' , '--separator', default=",", help="Separator for chromosome names. Eg: | , :" )
##genome-build source buildName ##genome-build source buildName
##species NCBI_Taxonomy_URI ##species NCBI_Taxonomy_URI
...@@ -47,12 +49,43 @@ def parse_args(args): ...@@ -47,12 +49,43 @@ def parse_args(args):
elif args.names is not None: elif args.names is not None:
args.names = args.names.split( args.sep ) args.names = args.names.split( args.sep )
if args.exclude_cols_from_file is not None:
if not os.path.exists(args.exclude_cols_from_file):
print "columns to exclude file %s does not exists" % args.exclude_cols_from_file
sys.exit(1)
if os.path.isdir(args.exclude_cols_from_file):
print "columns to exclude file %s is a folder" % args.exclude_cols_from_file
sys.exit(1)
args.exclude_cols = []
with open(args.exclude_cols_from_file, 'r') as fhd:
for line in fhd:
line = line.strip()
if len(line) == 0:
continue
if line[0] == "#":
continue
args.exclude_cols.append(line)
elif args.exclude_cols is not None:
args.exclude_cols = args.exclude_cols.split( args.sep )
else:
args.exclude_cols = []
return args return args
def main(args): def main(args):
valid_fields = gen_valid_fields(valid_fields_g) valid_fields = gen_valid_fields(valid_fields_g)
infile = args.infile infile = args.infile
chromosome_names = args.names chromosome_names = args.names
exclude_cols = args.exclude_cols
oufile = infile + ".gff" oufile = infile + ".gff"
source_name = "IrysView" source_name = "IrysView"
feature_name = "optical_contig" feature_name = "optical_contig"
...@@ -68,7 +101,9 @@ def main(args): ...@@ -68,7 +101,9 @@ def main(args):
print "saving to %s" % oufile print "saving to %s" % oufile
data, headers, names, seman, types, indexer, groups, ref_maps_from, query_maps_from = parse_file(infile, valid_fields) data, headers, names, seman, types, indexer, groups, ref_maps_from, query_maps_from, filters_csv = parse_file(infile, valid_fields)
filters = gen_filter(filters_csv, valid_fields)
print "NAMES" , names print "NAMES" , names
#print "HEADERS", "\n".join( headers ) #print "HEADERS", "\n".join( headers )
...@@ -80,6 +115,7 @@ def main(args): ...@@ -80,6 +115,7 @@ def main(args):
assert len(indexer["RefContigID"] ) <= len(chromosome_names), "number of chromosome differ from %d to %d\n%s\n%s" % (len(indexer["RefContigID"] ), len(chromosome_names), indexer["RefContigID"].keys() , chromosome_names) assert len(indexer["RefContigID"] ) <= len(chromosome_names), "number of chromosome differ from %d to %d\n%s\n%s" % (len(indexer["RefContigID"] ), len(chromosome_names), indexer["RefContigID"].keys() , chromosome_names)
assert max(indexer["RefContigID"].keys()) <= len(chromosome_names), "number of chromosome differ from %d to %d" % (max(indexer["RefContigID"].keys()), len(chromosome_names)) assert max(indexer["RefContigID"].keys()) <= len(chromosome_names), "number of chromosome differ from %d to %d" % (max(indexer["RefContigID"].keys()), len(chromosome_names))
print chromosome_names print chromosome_names
print "CREATING GFF: ", oufile print "CREATING GFF: ", oufile
...@@ -96,6 +132,10 @@ def main(args): ...@@ -96,6 +132,10 @@ def main(args):
ref_max_pos = max( [max(RefEndPoses), max(RefStartPoses)] ) ref_max_pos = max( [max(RefEndPoses), max(RefStartPoses)] )
fhd.write("##sequence-region %s %d %d\n" % (chromosome_names[RefContigID-1], int(ref_min_pos), int(ref_max_pos))) fhd.write("##sequence-region %s %d %d\n" % (chromosome_names[RefContigID-1], int(ref_min_pos), int(ref_max_pos)))
fhd.write( "#\n" + "\n".join([ "# XMAP "+x[1:] for x in headers[:-2] ]) + "\n#\n")
data = [ KeyedTuple(x, labels=names)._asdict() for x in data ] data = [ KeyedTuple(x, labels=names)._asdict() for x in data ]
for RefContigID in sorted(groups["RefContigID_RefStartPos"]): for RefContigID in sorted(groups["RefContigID_RefStartPos"]):
...@@ -128,8 +168,13 @@ def main(args): ...@@ -128,8 +168,13 @@ def main(args):
] ]
for k in sorted(pos_row): for k in sorted(pos_row):
if k in exclude_cols:
continue
attributes_keys.append( [ k.lower(), pos_row[k] ] ) attributes_keys.append( [ k.lower(), pos_row[k] ] )
for filter_data in filters:
attributes_keys.append( [ "_meta_filter_"+filter_data[0].lower(), filter_data[1] + '_' + str(filter_data[3]) ] )
attributes = ";".join("=".join([k,str(v)]) for k,v in attributes_keys) attributes = ";".join("=".join([k,str(v)]) for k,v in attributes_keys)
#http://www.ensembl.org/info/website/upload/gff.html #http://www.ensembl.org/info/website/upload/gff.html
......
...@@ -7,22 +7,25 @@ delta=$filtered.delta ...@@ -7,22 +7,25 @@ delta=$filtered.delta
gff=$filtered.gff gff=$filtered.gff
#rm $augmented || true
if [[ ! -f "$augmented" ]]; then if [[ ! -f "$augmented" ]]; then
./om_augmenter.py $infile -g -c ./om_augmenter.py $infile -g -c
fi fi
#rm $filtered || true
if [[ ! -f "$filtered" ]]; then if [[ ! -f "$filtered" ]]; then
./om_filter.py $augmented --filter Confidence:ge:10 --filter _meta_num_orientations:gt:1 --filter _meta_is_max_confidence_for_qry_chrom:eq:T ./om_filter.py $augmented --filter Confidence:ge:10 --filter _meta_num_orientations:gt:1 --filter _meta_is_max_confidence_for_qry_chrom:eq:T
fi fi
#rm $delta || true
if [[ ! -f "$delta" ]]; then if [[ ! -f "$delta" ]]; then
./om_to_delta.py $filtered ./om_to_delta.py $filtered
fi fi
rm *.gff || true rm $gff || true
if [[ ! -f "$gff" ]]; then if [[ ! -f "$gff" ]]; then
./om_to_gff.py $filtered ./om_to_gff.py --names-from-file S_lycopersicum_chromosomes.2.50.chromosome_names.txt --exclude-cols-from-file gff_cols_to_escape.txt $filtered
fi fi
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment