Skip to content
Snippets Groups Projects
Commit 535c250f authored by Noordijk, Ben's avatar Noordijk, Ben
Browse files

Ground truth parser now includes percent identity and alignment length

parent 8e404a0b
No related branches found
No related tags found
1 merge request!3Added data preparation, hyperparameter optimisation, benchmarking code and k-mer library visualisation
......@@ -5,36 +5,45 @@ import Bio.Blast.NCBIXML
import multiprocessing
def get_species_and_read_id_from_xml(xml_path):
"""given path to xml blast output file,
return best hit species and read id as string"""
with open(xml_path, 'r') as f:
blast_hit = Bio.Blast.NCBIXML.read(f)
read_id = blast_hit.query.split()[0]
# Only get species from blast hits that produced alignments
species = blast_hit.alignments[0].hit_def if blast_hit.alignments else None
return species, read_id
def parse_one_xml(file_path):
"""Given path to one xml, get file name of associated fast5
def parse_blast_xml(file_path):
"""Given path to one blast xml output, extract information that
will be used in ground truth table
:param file_path: Path object to single file
:type file_path: Path
:return:
:return: tuple with file_name, read_id, species,
percent identity of best hsp, alignment length of best hsp,
species_id
"""
species, read_id = get_species_and_read_id_from_xml(file_path)
with open(file_path, 'r') as f:
blast = Bio.Blast.NCBIXML.read(f)
file_name = file_path.name.replace('_0.xml', '.fast5')
species_id = species.split()[0] if species else None
# print(file_name, species)
return file_name, read_id, species, species_id
read_id = blast.query.split()[0]
# Only get species from blast hits that produced alignments
if blast.alignments:
alignment = blast.alignments[0]
species = alignment.hit_def
species_id = species.split()[0]
# Percent identity of best hsp
align_len = alignment.hsps[0].align_length
no_identities = alignment.hsps[0].identities
perc_id = no_identities / align_len
else:
species = perc_id = species_id = align_len = None
return file_name, read_id, species, perc_id, align_len, species_id
def main():
parser = argparse.ArgumentParser(description="""Create csv with file names
and species they belong to from xml blast output""")
parser.add_argument('--in-path', help='Path to directory of blast xml output',
parser.add_argument('--in-path', help='Path to directory with blast '
'xml output files',
required=True)
parser.add_argument('--out-path', help='Path to save the csv',
required=True)
......@@ -47,7 +56,7 @@ def main():
# # Debugging thing here (uncomment to only look at 500 files)
# all_files = [next(all_files) for _ in range(500)]
with multiprocessing.Pool(args.num_workers) as p:
ground_truths = p.map(parse_one_xml, all_files)
ground_truths = p.map(parse_blast_xml, all_files)
df = pd.DataFrame.from_records(ground_truths, columns=['file name',
'read id',
'species',
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment