Commit 728eaa55 authored by Noordijk, Ben

Expanded uncalled accuracy evaluation script

parent 96817797
1 merge request: !3 Added data preparation, hyperparameter optimisation, benchmarking code and k-mer library visualisation
@@ -22,23 +22,37 @@ def read_paf(file_path):
     return pd.read_csv(file_path, sep='\t', names=headers, usecols=range(12))
-def main():
-    pd.options.display.width = 0
-    uncalled_paf_out_path = Path('/home/noord087/lustre_link/mscthesis/benchmarking/output_16s_files/uncalled')
-    ground_truth_path = Path('/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/ground_truth_with_read_id.csv')
+def parse_uncalled(paf_path, ground_truth_path, plot_cm=False):
+    """Parse single paf output of uncalled
+
+    :param paf_path: path to .paf file output by uncalled. Should be mapped
+        to single species reference genome. In this case Pseudomonas aeruginosa.
+    :param ground_truth_path: Path to csv that contains ground truth, this csv
+        can be output by tools.set_ground_truths_of_reads.py
+    :param plot_cm: If set to true, plot confusion matrix
+    :type plot_cm: bool
+    :return: tuple with f1 score and accuracy
+    """
     ground_truth = pd.read_csv(ground_truth_path)
-    uncalled_paf = read_paf(uncalled_paf_out_path / 'uncalled_out_fold0.paf')
+    uncalled_paf = read_paf(paf_path)
     merged_df = uncalled_paf.merge(ground_truth, on='read id')
-    merged_df['y true'] = merged_df['species'].apply(lambda x: 1 if x.find('Pseudomonas aeruginosa') > 0 else 0)
-    merged_df['y pred'] = merged_df['Target sequence name'].apply(lambda x: 0 if x == '*' else 1)
-    f1 = f1_score(merged_df['y true'], merged_df['y pred'])
+    y_true = merged_df['species'].apply(lambda x: 1 if x.find('Pseudomonas aeruginosa') > 0 else 0)
+    y_pred = merged_df['Target sequence name'].apply(lambda x: 0 if x == '*' else 1)
+    f1 = f1_score(y_true, y_pred)
+    accuracy = accuracy_score(y_true, y_pred)
     print(f'{f1=}')
-    ConfusionMatrixDisplay.from_predictions(merged_df['y true'], merged_df['y pred'])
-    plt.show()
+    print(f'{accuracy=}')
+    if plot_cm:
+        ConfusionMatrixDisplay.from_predictions(y_true, y_pred)
+        plt.show()
+    return f1, accuracy
+
+
+def main():
+    pd.options.display.width = 0
+    uncalled_paf_out_path = Path('/home/noord087/lustre_link/mscthesis/benchmarking/output_16s_files/uncalled/uncalled_out_fold0.paf')
+    ground_truth_path = Path('/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/ground_truth_with_read_id.csv')
+    parse_uncalled(uncalled_paf_out_path, ground_truth_path)
 if __name__ == '__main__':
......
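For context, a minimal sketch of how the refactored parse_uncalled could be reused across cross-validation folds rather than only fold 0. This is not part of the commit: the fold-file naming pattern (only uncalled_out_fold0.paf appears above), the fold count of 5, and the importing module name benchmark_uncalled are all assumptions.

    # Hypothetical usage sketch (not part of this commit): aggregate UNCALLED
    # benchmarking metrics over several cross-validation folds.
    from pathlib import Path

    import pandas as pd

    from benchmark_uncalled import parse_uncalled  # module name is assumed

    uncalled_dir = Path('/home/noord087/lustre_link/mscthesis/benchmarking/output_16s_files/uncalled')
    ground_truth_path = Path('/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/ground_truth_with_read_id.csv')

    scores = []
    for fold in range(5):  # number of folds is an assumption
        paf_path = uncalled_dir / f'uncalled_out_fold{fold}.paf'  # naming pattern assumed from fold 0
        if not paf_path.exists():
            continue
        f1, accuracy = parse_uncalled(paf_path, ground_truth_path)
        scores.append({'fold': fold, 'f1': f1, 'accuracy': accuracy})

    # One row per fold with its F1 score and accuracy
    print(pd.DataFrame(scores).set_index('fold'))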