Skip to content
Snippets Groups Projects
Commit 327a1d89 authored by Noordijk, Ben's avatar Noordijk, Ben
Browse files

Implemented first version of squiggle data preparation script

parent b8d346ff
No related branches found
No related tags found
1 merge request: !3 Added data preparation, hyperparameter optimisation, benchmarking code and k-mer library visualisation
"""This script takes path to list of training files as input and outputs this
into two lists: one list with positive reads (i.e. reads that map to the
target) and negative reads"""
from pathlib import Path
import pandas as pd
def main(
        path_to_train_files=Path('/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/demultiplexed_reads/fold0/train_reads.txt'),
        path_to_ground_truth=Path('/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/ground_truth_with_read_id.csv'),
        out_path=Path('/home/noord087/lustre_link/mscthesis/benchmarking/output_16s_files/squigglenet/test'),
        target='Pseudomonas aeruginosa'):
    """Write the read ids of the training files to a positive and a negative list.

    The ground truth CSV is filtered down to the rows whose 'file name' occurs
    in the list of training files. Rows whose 'species' value contains
    ``target`` as a substring are positive examples, all other rows are
    negative examples.

    Args:
        path_to_train_files: Text file with one training file name per line.
        path_to_ground_truth: CSV file with at least the columns 'file name',
            'species' and 'read id'.
        out_path: Directory that receives 'pos_reads.txt' and 'neg_reads.txt'
            (one read id per line, no header, no index). Created if missing.
        target: Species name to look for in the 'species' column.
    """
    ground_truth = pd.read_csv(path_to_ground_truth)
    with open(path_to_train_files, 'r') as f:
        train_files = f.read().splitlines()

    # Keep only the ground truth rows that belong to a training file
    train_data_ground_truth = ground_truth[
        ground_truth['file name'].isin(train_files)]

    # Split reads into positive and negative examples.
    # A plain substring test is used (regex=False) so that a match at the very
    # start of the species string counts as well; the previous
    # `x.find(target) > 0` check silently dropped those. Rows without a
    # species (NaN) are treated as negative instead of raising.
    positive_reads_idx = train_data_ground_truth['species'].str.contains(
        target, regex=False, na=False)
    pos_reads = train_data_ground_truth[positive_reads_idx]['read id']
    neg_reads = train_data_ground_truth[~positive_reads_idx]['read id']

    out_path = Path(out_path)
    # to_csv does not create missing directories itself
    out_path.mkdir(parents=True, exist_ok=True)
    pos_reads.to_csv(out_path / 'pos_reads.txt', header=False, index=False)
    neg_reads.to_csv(out_path / 'neg_reads.txt', header=False, index=False)
# Run the data preparation only when executed as a script, not on import.
if __name__ == '__main__':
    main()
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment