Skip to content
Snippets Groups Projects
Commit b6b7cada authored by Noordijk, Ben
Browse files

Ground truth now contains a column with the species id in addition to the full species description

parent cbba1075
No related branches found
No related tags found
1 merge request: !3 "Added data preparation, hyperparameter optimisation, benchmarking code and k-mer library visualisation"
......@@ -26,8 +26,9 @@ def parse_one_xml(file_path):
"""
species = get_species_from_xml(file_path)
file_name = file_path.name.replace('_0.xml', '.fast5')
species_id = species.split()[0] if species else None
# print(file_name, species)
return file_name, species
return file_name, species, species_id
def main():
......@@ -48,7 +49,8 @@ def main():
with multiprocessing.Pool(args.num_workers) as p:
ground_truths = p.map(parse_one_xml, all_files)
df = pd.DataFrame.from_records(ground_truths, columns=['file name',
'species'])
'species',
'species id'])
df.to_csv(args.out_path, index=False)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment