Skip to content
Snippets Groups Projects
Commit 9cbec9d2 authored by Noordijk, Ben's avatar Noordijk, Ben
Browse files

compare_accuracy.py now also parses guppy output and makes proper plots

parent 8bae9293
No related branches found
No related tags found
1 merge request: !3 Added data preparation, hyperparameter optimisation, benchmarking code and k-mer library visualisation
......@@ -4,6 +4,45 @@ from itertools import chain
from multiprocessing import Pool
from sklearn.metrics import ConfusionMatrixDisplay, f1_score, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
def parse_sequencing_summary(file_path, ground_truth, plot_cm=False):
    """Parse a sequencing_summary.txt output by guppy

    Reads are labelled positive (1) when their ground truth species contains
    the target species (case-insensitive) and predicted positive (1) when
    the ``alignment_genome`` column is anything other than ``'*'``.

    :param file_path: Path to sequencing_summary.txt. Should be in the form of
                      **/guppy/{target}/fold{ix}/sequencing_summary.txt
    :type file_path: Path
    :param ground_truth: Dataframe that contains ground truth, can be read from
                         csv that is output by
                         tools.set_ground_truths_of_reads.py. Must contain
                         the columns ``read id`` and ``species``.
    :type ground_truth: pd.DataFrame
    :param plot_cm: If set to true, plot confusion matrix
    :type plot_cm: bool
    :return: tuple with tool, target_species, fold, accuracy, f1
    """
    print(f'Processing {file_path}')
    # Extract some properties from the file path
    # NOTE: only the last character of the fold directory is used, so this
    # assumes single-character fold indices (fold0 .. fold9).
    fold = file_path.parts[-2][-1]
    target_species = file_path.parts[-3].replace('_', ' ')
    tool = file_path.parts[-4]
    # Sanity check on the expected directory layout
    assert tool == 'guppy'

    df_guppy = pd.read_csv(file_path, sep='\t')
    # Align the read id column name with the one used in the ground truth
    df_guppy = df_guppy.rename(columns={'read_id': 'read id'})
    # Inner join: only reads present in both tables are evaluated
    df = df_guppy.merge(ground_truth, on='read id')

    # Set truth to 1 if ground truth species is found.
    # Use a case-insensitive membership test: str.find() returns 0 for a
    # match at the very start of the string, so comparing its result with
    # "> 0" would wrongly label those reads as negatives.
    target_lower = target_species.lower()
    y_true = df['species'].apply(
        lambda x: 1 if target_lower in x.lower() else 0)
    # guppy writes '*' when a read was not aligned to any genome
    y_pred = df['alignment_genome'].apply(lambda x: 0 if x == '*' else 1)

    f1 = f1_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    if plot_cm:
        ConfusionMatrixDisplay.from_predictions(y_true, y_pred)
        plt.show()
    return tool, target_species, fold, accuracy, f1
def parse_paf(paf_path, ground_truth, plot_cm=False):
......@@ -17,7 +56,7 @@ def parse_paf(paf_path, ground_truth, plot_cm=False):
:type ground_truth: pd.DataFrame
:param plot_cm: If set to true, plot confusion matrix
:type plot_cm: bool
:return: tuple with f1 score and accuracy
:return: tuple with tool, target_species, fold, accuracy, f1
"""
headers = [
"read id",
......@@ -62,18 +101,39 @@ def main():
ground_truth_path = Path('/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/ground_truth_with_read_id.csv')
ground_truth = pd.read_csv(ground_truth_path)
some_path = Path('/home/noord087/lustre_link/mscthesis/benchmarking')
# Get all files
deepnano_files = some_path.glob("output_16s_files/deepnano/*/minimap2_deepnano_fold*.paf")
uncalled_files = some_path.glob("output_16s_files/uncalled/*/uncalled_out_fold*.paf")
guppy_files = some_path.glob("output_16s_files/guppy/*/fold?/sequencing_summary.txt")
# Parse all files
with Pool(30) as p:
args = [[file, ground_truth] for file in chain(deepnano_files,
uncalled_files)]
all_records = p.starmap(parse_paf, args)
uncalled_deepnano_results = p.starmap(parse_paf, args)
args_guppy = [[file, ground_truth] for file in guppy_files]
guppy_results = p.starmap(parse_sequencing_summary, args_guppy)
all_records = uncalled_deepnano_results + guppy_results
df = pd.DataFrame.from_records(all_records,
columns=['tool', 'species',
'fold', 'accuracy', 'f1'])
# Do basic plotting
df.groupby(['species', 'tool']).mean().plot.bar()
plt.tight_layout()
plt.show()
sns.catplot(x='species', y='accuracy', ci='sd', data=df, hue='tool', kind='bar', legend_out=True)
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
sns.catplot(x='species', y='f1', data=df, hue='tool', kind='bar')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment