Skip to content
Snippets Groups Projects
Commit c19ccdf6 authored by Noordijk, Ben's avatar Noordijk, Ben
Browse files

squigglenet_data_preparation.py can now be run from command line

parent b1ce900a
No related branches found
No related tags found
1 merge request: !3 "Added data preparation, hyperparameter optimisation, benchmarking code and k-mer library visualisation"
"""This script takes path to list of training files as input and outputs this
into two lists: one list with positive reads (i.e. reads that map to the
target) and one list with negative reads."""
from pathlib import Path
import pandas as pd
import argparse
def _build_parser():
    """Create the command-line parser for this script."""
    parser = argparse.ArgumentParser(
        description='This script takes path to list of training files as '
                    'input and outputs this into two lists: one list with '
                    'positive reads (i.e. reads that map to the target) '
                    'and one list with negative reads')
    parser.add_argument('--train-files', required=True, type=Path,
                        help='Path to text file listing the training files, '
                             'one file name per line')
    parser.add_argument('--ground-truth', required=True, type=Path,
                        help='Path to csv with ground truth labels. '
                             'It is output by set_ground_truths_of_reads.py')
    parser.add_argument('--out-dir', required=True, type=Path,
                        help='Directory in which to create the pos_reads.txt'
                             ' and neg_reads.txt ')
    parser.add_argument('--target', required=True, type=str,
                        help='Target species')
    return parser


def _split_read_ids(ground_truth, train_files, target):
    """Return the (positive, negative) read ids of the training files.

    A read is positive when its 'species' value contains ``target``. The
    comparison ignores case and accepts a match at any position, including
    the very start of the string. Rows without a species count as negative.
    """
    # Keep only the ground truth rows that belong to the training files
    train_rows = ground_truth[ground_truth['file name'].isin(train_files)]
    is_target = train_rows['species'].str.contains(
        target, case=False, regex=False, na=False)
    return (train_rows.loc[is_target, 'read id'],
            train_rows.loc[~is_target, 'read id'])


def main(argv=None):
    """Split the reads of the training files into positive and negative lists.

    Reads the ground truth csv (columns 'file name', 'species' and
    'read id'), keeps the rows whose file name is listed in the train files
    list, and writes the read ids to ``pos_reads.txt`` and ``neg_reads.txt``
    in the output directory. The number of positive reads is printed.

    :param argv: list of command-line arguments; ``None`` (default) makes
        argparse read them from ``sys.argv``.
    """
    args = _build_parser().parse_args(argv)

    ground_truth = pd.read_csv(args.ground_truth)
    with open(args.train_files, 'r') as f:
        train_files = f.read().splitlines()

    # Underscores let the species be passed without quoting on the shell
    target = args.target.replace('_', ' ')
    pos_reads, neg_reads = _split_read_ids(ground_truth, train_files, target)
    print(f'{len(pos_reads)} positive examples found')

    # Create the output directory so a fresh path does not abort the run
    args.out_dir.mkdir(parents=True, exist_ok=True)
    pos_reads.to_csv(args.out_dir / 'pos_reads.txt', header=False, index=False)
    neg_reads.to_csv(args.out_dir / 'neg_reads.txt', header=False, index=False)
# Run the command-line interface only when executed as a script
if __name__ == '__main__':
    main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment