Ground truth parser now includes percent identity and alignment length

535c250f · Noordijk, Ben · 8e404a0b · 535c250f
Commit 535c250f authored 3 years ago by Noordijk, Ben
--- a/tools/set_ground_truths_of_reads.py
+++ b/tools/set_ground_truths_of_reads.py
@@ -5,36 +5,45 @@ import Bio.Blast.NCBIXML
 import multiprocessing


-def get_species_and_read_id_from_xml(xml_path):
-    """given path to xml blast output file,
-     return best hit species and read id as string"""
-    with open(xml_path, 'r') as f:
-        blast_hit = Bio.Blast.NCBIXML.read(f)
-
-    read_id = blast_hit.query.split()[0]
-    # Only get species from blast hits that produced alignments
-    species = blast_hit.alignments[0].hit_def if blast_hit.alignments else None
-    return species, read_id
-
-
-def parse_one_xml(file_path):
-    """Given path to one xml, get file name of associated fast5
+def parse_blast_xml(file_path):
+    """Given path to one blast xml output, extract information that
+    will be used in ground truth table

    :param file_path: Path object to single file
    :type file_path: Path
-    :return:
+    :return: tuple with file_name, read_id, species,
+             percent identity of best hsp, alignment length of best hsp,
+             species_id
    """
-    species, read_id = get_species_and_read_id_from_xml(file_path)
+    with open(file_path, 'r') as f:
+        blast = Bio.Blast.NCBIXML.read(f)
+
    file_name = file_path.name.replace('_0.xml', '.fast5')
-    species_id = species.split()[0] if species else None
-    # print(file_name, species)
-    return file_name, read_id, species, species_id
+    read_id = blast.query.split()[0]
+
+    # Only get species from blast hits that produced alignments
+    if blast.alignments:
+        alignment = blast.alignments[0]
+
+        species = alignment.hit_def
+        species_id = species.split()[0]
+
+        # Percent identity of best hsp
+        align_len = alignment.hsps[0].align_length
+        no_identities = alignment.hsps[0].identities
+        perc_id = no_identities / align_len
+    else:
+        species = perc_id = species_id = align_len = None
+
+    return file_name, read_id, species, perc_id, align_len, species_id
+


 def main():
    parser = argparse.ArgumentParser(description="""Create csv with file names 
    and species they belong to from xml blast output""")
-    parser.add_argument('--in-path', help='Path to directory of blast xml output',
+    parser.add_argument('--in-path', help='Path to directory with blast '
+                                          'xml output files',
                        required=True)
    parser.add_argument('--out-path', help='Path to save the csv',
                        required=True)
@@ -47,7 +56,7 @@ def main():
    # # Debugging thing here (uncomment to only look at 500 files)
    # all_files = [next(all_files) for _ in range(500)]
    with multiprocessing.Pool(args.num_workers) as p:
-        ground_truths = p.map(parse_one_xml, all_files)
+        ground_truths = p.map(parse_blast_xml, all_files)
    df = pd.DataFrame.from_records(ground_truths, columns=['file name',
                                                           'read id',
                                                           'species',