Lannoy, Carlos de / baseLess · Commit ddcc808f
Authored Nov 14, 2022 by Noordijk, Ben
Added analysis for guppy accuracy over time
Parent: f86c2cf2

tools/basecalling_classification_over_time.py · new file (mode 100644)
from compare_benchmark_performance.compare_accuracy_per_read \
    import parse_sequencing_summary, get_performance_metrics_from_predictions
import argparse
from pathlib import Path
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


def calculate_guppy_performance_over_time(df: pd.DataFrame,
                                          read_interval: int = 10):
    """Calculate guppy performance for an increasing number of reads.

    Needs a dataframe with columns y_true and y_pred, where y_pred
    indicates whether guppy assigned the read a species and y_true
    represents the ground-truth identity of the read.
    """
    # Shuffle the dataframe so read order does not bias the curves
    df = df.sample(frac=1).reset_index(drop=True)
    result_dict = {"nr_reads": [], "f1_score": [], "acc_score": []}
    for read_cutoff in range(read_interval, len(df), read_interval):
        # Get the subset of reads completed so far
        completed_reads = df.iloc[:read_cutoff]
        y_true = completed_reads['y_true']
        y_pred = completed_reads['y_pred']
        # Calculate performance on this subset of reads
        acc, _, f1 = get_performance_metrics_from_predictions(y_pred, y_true)
        result_dict['nr_reads'].append(read_cutoff)
        result_dict['f1_score'].append(f1)
        result_dict['acc_score'].append(acc)
    result_df = pd.DataFrame(result_dict)
    return result_df
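
# Example (a minimal sketch; the toy labels below are made up): with a
# 25-read dataframe and the default read_interval of 10, the function
# above evaluates performance at cutoffs 10 and 20 and returns a two-row
# dataframe with columns nr_reads, f1_score and acc_score:
#   toy_df = pd.DataFrame(
#       {'y_true': ['E. coli'] * 25,
#        'y_pred': ['E. coli'] * 20 + ['unclassified'] * 5})
#   calculate_guppy_performance_over_time(toy_df)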

def main(args):
    # Workflow for analysis of guppy on bacterial classification
    pd.options.display.width = 0
    ground_truth = pd.read_csv(args.ground_truth)
    guppy_files = args.benchmark_path.glob(
        "guppy/*/fold?/sequencing_summary.txt")
    result_df_list = []
    if args.input_performance_csv:
        # If a precalculated csv is provided, use it.
        result_df = pd.read_csv(args.input_performance_csv)
    else:
        # No precalculated performance CSV was provided: calculate it here
        for guppy_file in guppy_files:
            target_species = guppy_file.parts[-3].replace('_', ' ')
            if 'gingivalis' in target_species:
                # Was not present, so can be ignored
                continue
            # This is the bacteria-specific part. You should replace this
            # with something that works for you.
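            # A drop-in replacement only needs to return a dataframe with
            # 'y_true' and 'y_pred' columns, e.g. (hypothetical labels):
            #   df = pd.DataFrame({'y_true': true_species_labels,
            #                      'y_pred': guppy_assigned_labels})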
            df = parse_sequencing_summary(guppy_file, ground_truth,
                                          return_df=True)
            temp_result_df = calculate_guppy_performance_over_time(df)
            temp_result_df['species'] = target_species
            result_df_list.append(temp_result_df)
        # Join results of all k-fold CVs for all species
        result_df = pd.concat(result_df_list, ignore_index=True)
        result_df.to_csv(args.out_dir / 'guppy_performance_over_time.csv')

    # Plot one lineplot where different species are different hues
    sns.set_style('darkgrid')
    sns.lineplot(data=result_df, x='nr_reads', y='f1_score', hue='species')
    plt.tight_layout()
    plt.savefig(args.out_dir / 'guppy_accuracy_lineplot_hue.svg')

    # Plot one separate lineplot in a grid for each species
    all_species = result_df.species.unique()
    all_species.sort()
    sns.set_context('talk')
    sns.relplot(data=result_df, x='nr_reads', y='f1_score', col='species',
                kind='line', col_wrap=5, col_order=all_species)
    plt.tight_layout()
    plt.savefig(args.out_dir / 'guppy_accuracy_lineplot_grid.svg')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="""Plot accuracy of guppy for different numbers of
                       reads it has seen""")
    parser.add_argument('--benchmark-path',
                        help='Path to folder that should contain a folder '
                             'called "guppy" which contains sequencing '
                             'summaries',
                        required=True, type=Path)
    parser.add_argument('--ground-truth',
                        help='Path to csv with ground truth labels, '
                             'as output by set_ground_truths_of_reads.py',
                        required=True, type=Path)
    parser.add_argument('--input-performance-csv',
                        help='Path to csv that contains model performance as '
                             'output by an earlier run of this script. '
                             'If provided, the script will not recalculate '
                             'all performance statistics but will read them '
                             'from this csv.',
                        required=False, type=Path)
    parser.add_argument('--out-dir',
                        help='Directory in which to save the figures and '
                             'csv with model performance',
                        required=True, type=Path)
    args = parser.parse_args()
    main(args)
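
A minimal example invocation (the paths below are hypothetical; the script
expects sequencing summaries under --benchmark-path at
guppy/<species>/fold<n>/sequencing_summary.txt, matching the glob in main):

    python tools/basecalling_classification_over_time.py \
        --benchmark-path /path/to/benchmarks \
        --ground-truth /path/to/ground_truth.csv \
        --out-dir /path/to/figures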