Implemented first version of squiggle data preparation script

327a1d89 · Noordijk, Ben · b8d346ff · 327a1d89
Commit 327a1d89 authored 3 years ago by Noordijk, Ben
--- a/tools/squigglenet_data_preparation.py
+++ b/tools/squigglenet_data_preparation.py
+"""Split the reads belonging to a list of training files into two lists: one
+with positive reads (i.e. reads that map to the target) and one with negative
+reads. Input and output paths are currently hard-coded in main()."""
+from pathlib import Path
+import pandas as pd
+
+
+def main():
+    """Write read-id lists of positive and negative training reads.
+
+    Loads the ground-truth CSV and the list of training file names from the
+    hard-coded paths below, keeps only the ground-truth rows whose
+    'file name' appears in the training list, and splits those rows on
+    whether their 'species' value contains ``target``. The 'read id' values
+    of each group are written without header or index to ``pos_reads.txt``
+    and ``neg_reads.txt`` inside ``out_path``.
+    """
+    path_to_train_files = Path('/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/demultiplexed_reads/fold0/train_reads.txt')
+    path_to_ground_truth = Path('/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/ground_truth_with_read_id.csv')
+    out_path = Path('/home/noord087/lustre_link/mscthesis/benchmarking/output_16s_files/squigglenet/test')
+    target = 'Pseudomonas aeruginosa'
+    ground_truth = pd.read_csv(path_to_ground_truth)
+    train_files = path_to_train_files.read_text().splitlines()
+
+    # Keep only train files ground truth
+    train_data_ground_truth = ground_truth[ground_truth['file name'].isin(train_files)]
+
+    # Split reads into positive and negative examples.
+    # A literal substring test is used on purpose: the previous
+    # `x.find(target) > 0` check rejected species names that *start* with
+    # the target, because str.find() returns 0 for a match at the beginning
+    # (and -1 only when there is no match). Rows with a missing species are
+    # counted as negative instead of raising.
+    positive_reads_idx = train_data_ground_truth['species'].str.contains(
+        target, regex=False, na=False)
+    pos_reads = train_data_ground_truth[positive_reads_idx]['read id']
+    neg_reads = train_data_ground_truth[~positive_reads_idx]['read id']
+
+    # Make sure the output directory exists before writing into it.
+    out_path.mkdir(parents=True, exist_ok=True)
+    pos_reads.to_csv(out_path / 'pos_reads.txt', header=False, index=False)
+    neg_reads.to_csv(out_path / 'neg_reads.txt', header=False, index=False)
+
+
+# Script entry point: run the preparation only when executed directly,
+# not when this module is imported.
+if __name__ == "__main__":
+    main()
\ No newline at end of file