diff --git a/.gitignore b/.gitignore index 557d74bf61c0afc10e576221cad36e6497896c0b..a0a1d32c5ed1d5bd0e20246e910d0315529e7ee1 100644 --- a/.gitignore +++ b/.gitignore @@ -2,5 +2,6 @@ *.delta *.tsv *.pyc +*.gff .~* out* diff --git a/om_filter.py b/om_filter.py index cf6b83444b376f2c633d56465c13302b758868cd..37842d92dd7c2606b1ca821d85e18488c3b26cc2 100755 --- a/om_filter.py +++ b/om_filter.py @@ -7,7 +7,7 @@ from om_shared import * def parse_args(args): - parser = argparse.ArgumentParser(description="Bionano Genomics MAP parser") + parser = argparse.ArgumentParser(description="Bionano Genomics augmented MAP filter") parser.add_argument( 'infile', help="AUGMENTED file" ) parser.add_argument( '-l' , '--list' , action='store_true' , help="List Values" ) parser.add_argument( '-f' , '--filter' , action='append' , help="Filters [Field:Function(lt, le, eq, ne, ge, gt):Value]" ) diff --git a/om_shared.py b/om_shared.py index 17d01d8653aeb22c82e953ca9aca5837c33e797d..2c746a15d51140449ac57d6fbb5d5c1c72d1eae3 100644 --- a/om_shared.py +++ b/om_shared.py @@ -15,6 +15,7 @@ cols_to_index = ["QryContigID", "RefContigID", "Orientation"] group_by = [ ["RefContigID", "QryContigID"], ["RefContigID", "RefStartPos"], + ["RefContigID", "RefEndPos" ], ["QryContigID", "RefContigID"], ["QryContigID", "Orientation"], ["QryContigID", "XmapEntryID"], diff --git a/om_to_delta.py b/om_to_delta.py index 00c0f56c222921bc6996e16546adb1b9263f33a4..0d5e09fd58497de8bf5b6ed444201c528be1db67 100755 --- a/om_to_delta.py +++ b/om_to_delta.py @@ -7,7 +7,7 @@ from om_shared import * def parse_args(args): - parser = argparse.ArgumentParser(description="Bionano Genomics MAP parser") + parser = argparse.ArgumentParser(description="Bionano Genomics augmented MAP to Mummerplot Delta converter") parser.add_argument( 'infile', help="AUGMENTED file" ) args = parser.parse_args(args=args) diff --git a/om_to_gff.py b/om_to_gff.py new file mode 100755 index 0000000000000000000000000000000000000000..861e5311d420f60e46292db1dd625ecba9bd286c --- /dev/null +++ b/om_to_gff.py @@ -0,0 +1,151 @@ +#!/usr/bin/python + +import os +import sys + +from om_shared import * + + +def parse_args(args): + parser = argparse.ArgumentParser(description="Bionano Genomics augmented MAP to GFF converter") + parser.add_argument( 'infile', help="AUGMENTED file" ) + parser.add_argument( '-n' , '--names', action='store', help="Names of reference chromosome. Eg: Ch0|Ch1|Ch3 Ch0,Ch1,Ch2 Ch0:Ch1:Ch2" ) + parser.add_argument( '-f' , '--names-from-file', action='store', help="File containing names of reference chromosome. One per line" ) + parser.add_argument( '-s' , '--sep' , '--separator', default=",", help="Separator for chromosome names. Eg: | , :" ) + + ##genome-build source buildName + ##species NCBI_Taxonomy_URI + + args = parser.parse_args(args=args) + + if args.names is None and args.names_from_file is None: + print "either --names or --names-from-file has to be defined" + sys.exit(1) + + if args.names_from_file is not None: + if not os.path.exists(args.names_from_file): + print "names file %s does not exists" % args.names_from_file + sys.exit(1) + + if os.path.isdir(args.names_from_file): + print "names file %s is a folder" % args.names_from_file + sys.exit(1) + + args.names = [] + with open(args.names_from_file, 'r') as fhd: + for line in fhd: + line = line.strip() + + if len(line) == 0: + continue + + if line[0] == "#": + continue + + args.names.append(line) + + elif args.names is not None: + args.names = args.names.split( args.sep ) + + return args + +def main(args): + valid_fields = gen_valid_fields(valid_fields_g) + infile = args.infile + chromosome_names = args.names + oufile = infile + ".gff" + source_name = "IrysView" + feature_name = "optical_contig" + + + if not os.path.exists(infile): + print "input file %s does not exists" % infile + sys.exit(1) + + if os.path.isdir(infile): + print "input file %s is a folder" % infile + sys.exit(1) + + print "saving to %s" % oufile + + data, headers, names, seman, types, indexer, groups, ref_maps_from, query_maps_from = parse_file(infile, valid_fields) + + print "NAMES" , names + #print "HEADERS", "\n".join( headers ) + print "TYPES" , types + #print "DATA" , data[1] + #print "INDEX", indexer.keys()[0], indexer[indexer.keys()[0]] + + print "file has %5d maps and %3d chromosomes" % (len(indexer["QryContigID"]), len(indexer["RefContigID"])) + + assert len(indexer["RefContigID"] ) <= len(chromosome_names), "number of chromosome differ from %d to %d\n%s\n%s" % (len(indexer["RefContigID"] ), len(chromosome_names), indexer["RefContigID"].keys() , chromosome_names) + assert max(indexer["RefContigID"].keys()) <= len(chromosome_names), "number of chromosome differ from %d to %d" % (max(indexer["RefContigID"].keys()), len(chromosome_names)) + print chromosome_names + + print "CREATING GFF: ", oufile + with open(oufile, "w") as fhd: + linecount = 0 + #fhd.write("/home/assembly/nobackup/mummer/MUMmer3.23/1502/solanum_lycopersicum_heinz/SL2.40ch12.fa /home/assembly/nobackup/mummer/MUMmer3.23/1502/solanum_pennellii_scaffold/final.assembly.fasta\n") + + fhd.write("##gff-version 3\n") + + for RefContigID in sorted(groups["RefContigID_RefStartPos"]): + RefStartPoses = groups["RefContigID_RefStartPos"][RefContigID] + RefEndPoses = groups["RefContigID_RefEndPos" ][RefContigID] + ref_min_pos = min( [min(RefEndPoses), min(RefStartPoses)] ) + ref_max_pos = max( [max(RefEndPoses), max(RefStartPoses)] ) + fhd.write("##sequence-region %s %d %d\n" % (chromosome_names[RefContigID-1], int(ref_min_pos), int(ref_max_pos))) + + data = [ KeyedTuple(x, labels=names)._asdict() for x in data ] + + for RefContigID in sorted(groups["RefContigID_RefStartPos"]): + RefStartPoses = groups["RefContigID_RefStartPos"][RefContigID] + + for RefStartPosG in sorted(RefStartPoses): + pos_rows = list(RefStartPoses[RefStartPosG]) + + for pos_row_pos in pos_rows: + pos_row = data[pos_row_pos] + + QryContigID = pos_row["QryContigID"] + + RefStartPos = pos_row["RefStartPos"] + RefEndPos = pos_row["RefEndPos" ] + RefLen = pos_row["RefLen" ] + + QryStartPos = pos_row["QryStartPos"] + QryEndPos = pos_row["QryEndPos" ] + QryLen = pos_row["QryLen" ] + + Confidence = pos_row["Confidence" ] + Orientation = pos_row["Orientation"] + HitEnum = pos_row["HitEnum" ] + + attributes_keys = [ + [ 'ID' , QryContigID ], + [ 'Name', QryContigID ], + [ 'Gap' , HitEnum ], + ] + + for k in sorted(pos_row): + attributes_keys.append( [ k.lower(), pos_row[k] ] ) + + attributes = ";".join("=".join([k,str(v)]) for k,v in attributes_keys) + + #http://www.ensembl.org/info/website/upload/gff.html + # seqname source feature start end score strand frame attribute + line = [ chromosome_names[RefContigID-1], source_name, feature_name, int(RefStartPos), int(RefEndPos), Confidence, Orientation, '.', attributes] + fhd.write( "\t".join([str(x) for x in line]) + "\n") + + print + + + +if __name__ == '__main__': + if len(sys.argv) ==1: + print "no arguments given" + sys.exit(1) + + args = parse_args(sys.argv[1:]) + + main(args) \ No newline at end of file diff --git a/run.sh b/run.sh index 234449273f900a18531c2e36dd9b4f321e591f80..003675ffb0292897c7fd46f37920c57f6d6b496b 100755 --- a/run.sh +++ b/run.sh @@ -1,17 +1,28 @@ +set -xeu + infile=S_lycopersicum_chromosomes.2.50.BspQI_to_EXP_REFINEFINAL1_xmap.txt augmented=$infile.augmented.tsv filtered=${augmented}_Confidence_ge_10.0__meta_num_orientations_gt_1__meta_is_max_confidence_for_qry_chrom_eq_True.report.tsv delta=$filtered.delta +gff=$filtered.gff + if [[ ! -f "$augmented" ]]; then ./om_augmenter.py $infile -g -c fi + if [[ ! -f "$filtered" ]]; then ./om_filter.py $augmented --filter Confidence:ge:10 --filter _meta_num_orientations:gt:1 --filter _meta_is_max_confidence_for_qry_chrom:eq:T fi -rm *.delta + if [[ ! -f "$delta" ]]; then ./om_to_delta.py $filtered fi + + +rm *.gff || true +if [[ ! -f "$gff" ]]; then + ./om_to_gff.py $filtered +fi