Added basic script that plots runtime of benchmark

5378f7bc · Noordijk, Ben · b0e5ed2f · 5378f7bc · 5378f7bc
Commit 5378f7bc authored 3 years ago by Noordijk, Ben
--- a/compare_benchmark_performance/compare_accuracy.py
+++ b/compare_benchmark_performance/compare_accuracy.py
@@ -99,6 +99,7 @@ def parse_paf(paf_path, ground_truth, plot_cm=False):


 def main():
+    # For debugging
    pd.options.display.width = 0
    parser = argparse.ArgumentParser(description="""Plot accuracy of
     benchmarked algorithms""")
@@ -119,17 +120,16 @@ def main():
    args = parser.parse_args()

    ground_truth = pd.read_csv(args.ground_truth)
-
-    # Get all files
+    
+    # Parsing accuracy files
    deepnano_files = args.benchmark_path.glob("output_16s_files/deepnano/*/minimap2_deepnano_fold*.paf")
    uncalled_files = args.benchmark_path.glob("output_16s_files/uncalled/*/uncalled_out_fold*.paf")
    guppy_files = args.benchmark_path.glob("output_16s_files/guppy/*/fold?/sequencing_summary.txt")

-    # Parse all files
    with Pool(30) as p:
-        args = [[file, ground_truth] for file in chain(deepnano_files,
-                                                       uncalled_files)]
-        uncalled_deepnano_results = p.starmap(parse_paf, args)
+        args_uncalled_deepnano = [[file, ground_truth] for file
+                                  in chain(deepnano_files, uncalled_files)]
+        uncalled_deepnano_results = p.starmap(parse_paf, args_uncalled_deepnano)

        args_guppy = [[file, ground_truth] for file in guppy_files]
        guppy_results = p.starmap(parse_sequencing_summary, args_guppy)
@@ -146,10 +146,6 @@ def main():
    all_species = df.species.unique()
    all_species.sort()

-    # # Do basic plotting
-    # df.groupby(['species', 'tool']).mean().plot.bar()
-    # plt.tight_layout()
-    # plt.show()

    sns.catplot(x='species', y='accuracy', ci='sd', data=df, hue='tool',
                kind='bar', order=all_species, legend_out=False, aspect=2)

--- a/compare_benchmark_performance/compare_run_times.py
+++ b/compare_benchmark_performance/compare_run_times.py
-"""STUB"""
\ No newline at end of file
+import argparse
+from datetime import timedelta
+from pathlib import Path
+from multiprocessing import Pool
+from itertools import chain
+
+import pandas as pd
+import matplotlib.pyplot as plt
+from matplotlib.dates import date2num
+import seaborn as sns
+
+def parse_benchmark_file(file_path):
+    """Extract runtime from a benchmark file output by snakemake
+
+    Tool name and target species are taken from the directory layout
+    ``.../<tool>/<target species>/<benchmark file>``; the fold number is
+    the last character of the file stem.
+
+    :param file_path: path to benchmark file
+    :type file_path: Path
+    :return: tuple of toolname, processing step (None for tools other
+    than deepnano and uncalled), target species, fold no (None for the
+    uncalled index step), walltime and cpu time as timedelta objects
+    """
+    print(f'Processing {file_path}')
+    fold = file_path.stem[-1]
+    target_species = file_path.parts[-2].replace('_', ' ')
+    tool = file_path.parts[-3]
+    step = None
+    if tool == 'deepnano':
+        if 'basecall' in file_path.stem:
+            step = 'basecall'
+        else:
+            step = 'minimap'
+    elif tool == 'uncalled':
+        if 'index' in file_path.stem:
+            step = 'index'
+            # The index benchmark file name carries no fold number
+            fold = None
+        else:
+            step = 'map'
+    df = pd.read_csv(file_path, sep='\t')
+    # Pick the value of the first row explicitly: calling float() on a
+    # whole Series is deprecated in pandas and fails outright when the
+    # file holds more than one row.
+    walltime = timedelta(seconds=float(df['s'].iloc[0]))
+    cpu_time = timedelta(seconds=float(df['cpu_time'].iloc[0]))
+
+    return tool, step, target_species, fold, walltime, cpu_time
+
+
+def main(args):
+    """Plot the cpu time of every benchmarked tool per target species
+
+    :param args: parsed command line arguments, only ``benchmark_path``
+    is read here
+    :type args: argparse.Namespace
+    :raises FileNotFoundError: if no benchmark file is found below
+    ``args.benchmark_path``
+    """
+    # For debugging:
+    pd.options.display.width = 0
+
+    # Runtime parsing
+    benchmark_patterns = ('**/deepnano_basecall_benchmark_fold?.txt',
+                          '**/minimap_deepnano_benchmark_fold?.txt',
+                          '**/guppy_benchmark_fold?.txt',
+                          '**/uncalled_map_benchmark_fold?.txt',
+                          '**/uncalled_index_benchmark.txt')
+    all_files = list(chain.from_iterable(
+        args.benchmark_path.glob(pattern) for pattern in benchmark_patterns))
+    if not all_files:
+        # Fail early with a clear message instead of an obscure error
+        # from the plotting code further down
+        raise FileNotFoundError(
+            f'No benchmark files found in {args.benchmark_path}')
+
+    with Pool(30) as p:
+        all_benchmarks = p.map(parse_benchmark_file, all_files)
+
+    df = pd.DataFrame.from_records(all_benchmarks,
+                                   columns=['tool',
+                                            'step',
+                                            'species',
+                                            'fold',
+                                            'walltime',
+                                            'cpu_time'])
+    # Provisional plot:
+    # date2num is meant for calendar dates; a duration is plotted
+    # directly as a number of seconds instead.
+    df['cpu_seconds'] = df['cpu_time'].dt.total_seconds()
+    # seaborn needs a single column name for x, so the processing step is
+    # folded into the hue label. Tools without a step keep their bare name.
+    step_suffix = df['step'].map(
+        lambda step: '' if pd.isna(step) else f' ({step})')
+    df['tool_step'] = df['tool'] + step_suffix
+    sns.catplot(x='species', y='cpu_seconds', data=df, ci='sd',
+                hue='tool_step', kind='bar', legend_out=False, aspect=2)
+    # Rotate first so that tight_layout accounts for the rotated labels
+    plt.xticks(rotation=90)
+    plt.tight_layout()
+    plt.show()
+
+if __name__ == '__main__':
+    # Command line entry point: both options are mandatory directories
+    # that argparse converts to Path objects.
+    directory_option = {'required': True, 'type': Path}
+
+    parser = argparse.ArgumentParser(
+        description='Plot runtime of\n     benchmarked algorithms')
+    parser.add_argument(
+        '--benchmark-path',
+        help=('Path to folder that contains all benchmarks output by the '
+              'snakemake workflow. Should contain a folder called '
+              'output_16s_files'),
+        **directory_option)
+    parser.add_argument(
+        '--out-dir',
+        help=('Directory in which to save the figures and csv with '
+              'model performance'),
+        **directory_option)
+
+    args = parser.parse_args()
+    main(args)