Skip to content
Snippets Groups Projects
Commit 5378f7bc authored by Noordijk, Ben
Browse files

Added basic script that plots runtime of benchmark

parent b0e5ed2f
No related branches found
No related tags found
1 merge request: !3 "Added data preparation, hyperparameter optimisation, benchmarking code and k-mer library visualisation"
......@@ -99,6 +99,7 @@ def parse_paf(paf_path, ground_truth, plot_cm=False):
def main():
# For debugging
pd.options.display.width = 0
parser = argparse.ArgumentParser(description="""Plot accuracy of
benchmarked algorithms""")
......@@ -119,17 +120,16 @@ def main():
args = parser.parse_args()
ground_truth = pd.read_csv(args.ground_truth)
# Get all files
# Parsing accuracy files
deepnano_files = args.benchmark_path.glob("output_16s_files/deepnano/*/minimap2_deepnano_fold*.paf")
uncalled_files = args.benchmark_path.glob("output_16s_files/uncalled/*/uncalled_out_fold*.paf")
guppy_files = args.benchmark_path.glob("output_16s_files/guppy/*/fold?/sequencing_summary.txt")
# Parse all files
with Pool(30) as p:
args = [[file, ground_truth] for file in chain(deepnano_files,
uncalled_files)]
uncalled_deepnano_results = p.starmap(parse_paf, args)
args_uncalled_deepnano = [[file, ground_truth] for file
in chain(deepnano_files, uncalled_files)]
uncalled_deepnano_results = p.starmap(parse_paf, args_uncalled_deepnano)
args_guppy = [[file, ground_truth] for file in guppy_files]
guppy_results = p.starmap(parse_sequencing_summary, args_guppy)
......@@ -146,10 +146,6 @@ def main():
all_species = df.species.unique()
all_species.sort()
# # Do basic plotting
# df.groupby(['species', 'tool']).mean().plot.bar()
# plt.tight_layout()
# plt.show()
sns.catplot(x='species', y='accuracy', ci='sd', data=df, hue='tool',
kind='bar', order=all_species, legend_out=False, aspect=2)
......
"""STUB"""
\ No newline at end of file
import argparse
from datetime import timedelta
from pathlib import Path
from multiprocessing import Pool
from itertools import chain
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.dates import date2num
import seaborn as sns
def parse_benchmark_file(file_path):
    """Extract runtime from a benchmark file output by snakemake

    The tool name and target species are read from the two directory
    levels above the file, i.e. the path is expected to end in
    ``<tool>/<target_species>/<benchmark file>``.

    :param file_path: path to benchmark file
    :type file_path: Path
    :return: tuple of toolname, processing step, target species, fold no,
             walltime and cpu time as timedelta object. The step is None
             for tools other than deepnano and uncalled; the fold is None
             for the uncalled index step and for file names that do not
             end in ``fold<digits>``
    :raises ValueError: if the ``s`` or ``cpu_time`` column does not hold
                        exactly one value
    """
    print(f'Processing {file_path}')
    stem = file_path.stem
    target_species = file_path.parts[-2].replace('_', ' ')
    tool = file_path.parts[-3]

    # Keep every digit after the last 'fold' marker, so that multi-digit
    # folds are not truncated to their final character.
    _, marker, suffix = stem.rpartition('fold')
    fold = suffix if marker and suffix.isdigit() else None

    step = None
    if tool == 'deepnano':
        step = 'basecall' if 'basecall' in stem else 'minimap'
    elif tool == 'uncalled':
        if 'index' in stem:
            step = 'index'
            # The index benchmark is not tied to a single fold
            fold = None
        else:
            step = 'map'

    df = pd.read_csv(file_path, sep='\t')
    # Series.item() returns the single scalar and refuses multi-row files;
    # calling float() directly on a Series is deprecated in pandas.
    walltime = timedelta(seconds=float(df['s'].item()))
    cpu_time = timedelta(seconds=float(df['cpu_time'].item()))
    return tool, step, target_species, fold, walltime, cpu_time
def main(args):
    """Parse all snakemake benchmark files and plot the CPU time per tool

    :param args: parsed command line arguments; only ``benchmark_path``
                 is read here
    :type args: argparse.Namespace
    :raises FileNotFoundError: if no benchmark file is found below
                               ``args.benchmark_path``
    """
    # For debugging:
    pd.options.display.width = 0

    # Runtime parsing
    benchmark_patterns = (
        '**/deepnano_basecall_benchmark_fold?.txt',
        '**/minimap_deepnano_benchmark_fold?.txt',
        '**/guppy_benchmark_fold?.txt',
        '**/uncalled_map_benchmark_fold?.txt',
        '**/uncalled_index_benchmark.txt',
    )
    all_files = chain.from_iterable(args.benchmark_path.glob(pattern)
                                    for pattern in benchmark_patterns)
    with Pool(30) as p:
        all_benchmarks = p.map(parse_benchmark_file, all_files)

    if not all_benchmarks:
        # Fail with a clear message instead of an opaque plotting error
        raise FileNotFoundError(
            f'No benchmark files found in {args.benchmark_path}')

    df = pd.DataFrame.from_records(all_benchmarks,
                                   columns=['tool',
                                            'step',
                                            'species',
                                            'fold',
                                            'walltime',
                                            'cpu_time'])

    # Provisional plot:
    # TODO Split by processing step
    # TODO args.out_dir is parsed by the caller but nothing is saved yet
    # Durations are plotted as plain seconds; date2num expects datetimes,
    # not timedeltas.
    df['cpu_seconds'] = pd.to_timedelta(df['cpu_time']).dt.total_seconds()
    # catplot needs a single column for x, so species and step are merged
    # into one label. The step is left out when the parser returned none.
    df['species_step'] = [
        species if pd.isna(step) else f'{species} ({step})'
        for species, step in zip(df['species'], df['step'])
    ]
    # NOTE(review): newer seaborn releases replace `ci` with `errorbar`
    # - confirm against the installed version
    sns.catplot(x='species_step', y='cpu_seconds', data=df, ci='sd',
                hue='tool', kind='bar', legend_out=False, aspect=2)
    # Rotate the labels first so that tight_layout can make room for them
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()
if __name__ == '__main__':
    # Command line entry point: both options are mandatory directory paths
    cli = argparse.ArgumentParser(description="""Plot runtime of
benchmarked algorithms""")
    required_paths = {
        '--benchmark-path': 'Path to folder that contains all benchmarks '
                            'output by the snakemake workflow. '
                            'Should contain a folder called output_16s_files',
        '--out-dir': 'Directory in which to save the figures and'
                     ' csv with model performance',
    }
    for flag, help_text in required_paths.items():
        cli.add_argument(flag, help=help_text, required=True, type=Path)
    main(cli.parse_args())
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment