compare_accuracy.py now also parses guppy output and makes proper plots

9cbec9d2 · Noordijk, Ben · 8bae9293 · 9cbec9d2
Commit 9cbec9d2 authored 3 years ago by Noordijk, Ben
--- a/compare_benchmark_performance/compare_accuracy.py
+++ b/compare_benchmark_performance/compare_accuracy.py
@@ -4,6 +4,45 @@ from itertools import chain
 from multiprocessing import Pool
 from sklearn.metrics import ConfusionMatrixDisplay, f1_score, accuracy_score
 import matplotlib.pyplot as plt
+import seaborn as sns
+
+
+def parse_sequencing_summary(file_path, ground_truth, plot_cm=False):
+    """Score guppy's sequencing_summary.txt against the ground truth
+
+    :param file_path: Path to sequencing_summary.txt. Should be in the form of
+    **/guppy/{target}/fold{ix}/sequencing_summary.txt
+    :type file_path: Path
+    :param ground_truth: Dataframe that contains ground truth, can be read from
+     csv that is output by tools.set_ground_truths_of_reads.py
+    :type ground_truth: pd.DataFrame
+    :param plot_cm: If set to true, plot confusion matrix
+    :type plot_cm: bool
+    :return: tuple with tool, target_species, fold, accuracy, f1
+    """
+    print(f'Processing {file_path}')
+    # Fold (last character of its directory name), target and tool come from path
+    fold = file_path.parts[-2][-1]
+    target_species = file_path.parts[-3].replace('_', ' ')
+    tool = file_path.parts[-4]
+    assert tool == 'guppy'
+
+    df_guppy = pd.read_csv(file_path, sep='\t')
+    df_guppy.rename(columns={'read_id': 'read id'}, inplace=True)
+    df = df_guppy.merge(ground_truth, on='read id')
+
+    # Truth is 1 if the target occurs anywhere in the species name, ignoring
+    # case; `in` also matches at index 0, which `find(...) > 0` missed
+    needle = target_species.lower()
+    y_true = df['species'].apply(lambda x: 1 if needle in x.lower() else 0)
+
+    # An alignment_genome of '*' is counted as a negative prediction
+    y_pred = df['alignment_genome'].apply(lambda x: 0 if x == '*' else 1)
+    f1 = f1_score(y_true, y_pred)
+    accuracy = accuracy_score(y_true, y_pred)
+    if plot_cm:
+        ConfusionMatrixDisplay.from_predictions(y_true, y_pred)
+        plt.show()
+    return tool, target_species, fold, accuracy, f1


 def parse_paf(paf_path, ground_truth, plot_cm=False):
@@ -17,7 +56,7 @@ def parse_paf(paf_path, ground_truth, plot_cm=False):
    :type ground_truth: pd.DataFrame
    :param plot_cm: If set to true, plot confusion matrix
    :type plot_cm: bool
-    :return: tuple with f1 score and accuracy
+    :return: tuple with tool, target_species, fold, accuracy, f1
    """
    headers = [
        "read id",
@@ -62,18 +101,39 @@ def main():
    ground_truth_path = Path('/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/ground_truth_with_read_id.csv')
    ground_truth = pd.read_csv(ground_truth_path)
    some_path = Path('/home/noord087/lustre_link/mscthesis/benchmarking')
+
+    # Get all files
    deepnano_files = some_path.glob("output_16s_files/deepnano/*/minimap2_deepnano_fold*.paf")
    uncalled_files = some_path.glob("output_16s_files/uncalled/*/uncalled_out_fold*.paf")
+    guppy_files = some_path.glob("output_16s_files/guppy/*/fold?/sequencing_summary.txt")
+
+    # Parse all files
    with Pool(30) as p:
        args = [[file, ground_truth] for file in chain(deepnano_files,
                                                       uncalled_files)]
-        all_records = p.starmap(parse_paf, args)
+        uncalled_deepnano_results = p.starmap(parse_paf, args)
+
+        args_guppy = [[file, ground_truth] for file in guppy_files]
+        guppy_results = p.starmap(parse_sequencing_summary, args_guppy)
+
+    all_records = uncalled_deepnano_results + guppy_results

    df = pd.DataFrame.from_records(all_records,
                                   columns=['tool', 'species',
                                            'fold', 'accuracy', 'f1'])
    # Do basic plotting
    df.groupby(['species', 'tool']).mean().plot.bar()
+
+    plt.tight_layout()
+    plt.show()
+
+    sns.catplot(x='species', y='accuracy', ci='sd', data=df, hue='tool', kind='bar', legend_out=True)
+    plt.xticks(rotation=90)
+    plt.tight_layout()
+    plt.show()
+
+    sns.catplot(x='species', y='f1', data=df, hue='tool', kind='bar')
+    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()