squigglenet_data_preparation.py can now be run from command line

c19ccdf6 · Noordijk, Ben · b1ce900a · c19ccdf6
Commit c19ccdf6 authored 3 years ago by Noordijk, Ben
--- a/tools/squigglenet_data_preparation.py
+++ b/tools/squigglenet_data_preparation.py
-"""This script takes path to list of training files as input and outputs this
-into two lists: one list with positive reads (i.e. reads that map to the
-target) and negative reads"""
 from pathlib import Path
 import pandas as pd
+import argparse


 def main(argv=None):
     """Split the training reads into positive and negative read-id lists.

     Reads a text file listing the training file names (one per line) and
     the ground-truth csv, keeps only the ground-truth rows whose
     'file name' occurs in the training list, and writes the 'read id'
     values to ``pos_reads.txt`` (rows whose 'species' contains the target)
     and ``neg_reads.txt`` (all remaining rows) inside the output directory.
     The number of positive rows is printed to stdout.

     The ground-truth csv must provide the columns 'file name', 'species'
     and 'read id'.

     Args:
         argv: Optional list of command-line arguments. ``None`` (the
             default) makes argparse fall back to ``sys.argv[1:]``.
     """
     parser = argparse.ArgumentParser(
         description='This script takes path to list of training files as '
                     'input and outputs this into two lists: one list with '
                     'positive reads (i.e. reads that map to the target) '
                     'and one list with negative reads')
     parser.add_argument('--train-files', required=True, type=Path,
                         help='Path to text file listing the training '
                              'files, one file name per line')
     parser.add_argument('--ground-truth', required=True, type=Path,
                         help='Path to csv with ground truth labels. '
                              'It is output by set_ground_truths_of_reads.py')
     parser.add_argument('--out-dir', required=True, type=Path,
                         help='Directory in which to create the pos_reads.txt'
                              ' and neg_reads.txt ')
     parser.add_argument('--target', required=True, type=str,
                         help='Target species; underscores are read as '
                              'spaces and matching is case-insensitive')
     args = parser.parse_args(argv)

     ground_truth = pd.read_csv(args.ground_truth)
     with open(args.train_files, 'r') as f:
         train_files = f.read().splitlines()

     # Keep only train files ground truth
     in_train_set = ground_truth['file name'].isin(train_files)
     train_data_ground_truth = ground_truth[in_train_set]

     # Underscores let the species be passed as a single shell token. The
     # target is lowercased as well, because the species string is lowercased
     # before the comparison; otherwise a capitalised target never matches.
     target = args.target.replace('_', ' ').lower()

     def maps_to_target(species):
         # Missing species values arrive as NaN (a float): treat as negative.
         # Substring test instead of ``find(...) > 0`` so that a species
         # string that *starts* with the target is also counted.
         return isinstance(species, str) and target in species.lower()

     # Split reads into positive and negative examples.
     # astype(bool) keeps the mask usable for ``~`` and boolean indexing even
     # when there are no rows at all.
     positive_reads_idx = (
         train_data_ground_truth['species'].apply(maps_to_target).astype(bool)
     )
     pos_reads = train_data_ground_truth[positive_reads_idx]['read id']
     neg_reads = train_data_ground_truth[~positive_reads_idx]['read id']

     print(f'{int(positive_reads_idx.sum())} positive examples found')

     # Create the output directory on demand instead of failing in to_csv.
     args.out_dir.mkdir(parents=True, exist_ok=True)
     pos_reads.to_csv(args.out_dir / 'pos_reads.txt', header=False, index=False)
     neg_reads.to_csv(args.out_dir / 'neg_reads.txt', header=False, index=False)


 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()