Skip to content
Snippets Groups Projects
Commit b9049d1d authored by Noordijk, Ben's avatar Noordijk, Ben
Browse files

visualise_ground_truth_file.py now also shows species abundance

parent 86393885
No related branches found
No related tags found
1 merge request: !3 "Added data preparation, hyperparameter optimisation, benchmarking code and k-mer library visualisation"
......@@ -6,21 +6,49 @@ import numpy as np
def main():
    """Plot summary statistics of the mock-community ground truth file.

    Reads the ground truth CSV from a hard-coded path and shows a 2x2 grid
    of plots:

    * histogram of the ``query length`` column,
    * histogram of ``alignment length`` divided by ``query length``,
    * histogram of the ``percent identity`` column,
    * bar chart of how often each species occurs in ``species id``.

    Raises:
        KeyError: If a value in ``species id`` (with its last two characters
            removed) is not a key of the accession-to-species mapping.
    """
    # Maps sequence accession (without trailing suffix) to species name.
    id_to_species = {
        'NZ_CP053098': 'acinetobacter baumannii',
        'NZ_DS264586': 'actinomyces odontolyticus',
        # Alternative label considered for the entry above:
        # 'schaalia odontolytica'
        'AE017194': 'bacillus cereus',
        'CP000139': 'bacteroides vulgatus',
        'CP000721': 'clostridium beijerinckii',
        'AE000513': 'deinococcus radiodurans',
        'CP025020': 'enterococcus faecalis',
        'U00096': 'escherichia coli',
        'AE000511': 'helicobacter pylori',
        'CP000413': 'lactobacillus gasseri',
        'AL591824': 'listeria monocytogenes',
        # NOTE(review): spelling is likely meant to be 'meningitidis';
        # left unchanged in case other code matches on this label.
        'AE002098': 'neisseria meningitides',
        'AP009380': 'porphyromonas gingivalis',
        'AE017283': 'propionibacterium acnes',
        'AE004091': 'pseudomonas aeruginosa',
        'CP000144': 'rhodobacter sphaeroides',
        'CP000730': 'staphylococcus aureus',
        'AE015929': 'staphylococcus epidermidis',
        'AE009948': 'streptococcus agalactiae',
        'AE014133': 'streptococcus mutans',
        'AE005672': 'streptococcus pneumoniae',
    }
    ground_truth_csv = Path('/home/noord087/lustre_link/HoiCarlos/'
                            '16Sreads_mockcommunity/ground_truth_with_'
                            'read_id_and_perc_id.csv')
    df = pd.read_csv(ground_truth_csv)

    # Four panels: three read-level histograms plus the species bar chart.
    _, axs = plt.subplots(2, 2, figsize=(10, 10))

    df['query length'].plot.hist(title='Query length', ax=axs[0, 0],
                                 bins=np.linspace(0, 3000, 50))

    # Fraction of each query that is covered by its alignment.
    percent_in_alignment = df['alignment length'] / df['query length']
    percent_in_alignment.plot.hist(
        title='Relative length of alignment relative to query',
        ax=axs[0, 1], bins=np.linspace(0.5, 1, 20))

    df['percent identity'].plot.hist(
        title='Percent identity (based on alignment length)',
        ax=axs[1, 0], bins=np.linspace(0.8, 1, 20))

    species_abundance = df['species id'].value_counts()
    # The last two characters of each id are dropped before the lookup;
    # presumably a version suffix such as '.1' -- TODO confirm against data.
    species_abundance.index = [id_to_species[accession[:-2]]
                               for accession in species_abundance.index]
    species_abundance.plot.bar(title='Species abundance in ground truth',
                               ax=axs[1, 1])

    plt.tight_layout()
    plt.show()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment