From 818f3dd15b06cfc5d41f81c2f34beba358d39668 Mon Sep 17 00:00:00 2001
From: Ben Noordijk <ben.noordijk@wur.nl>
Date: Tue, 9 Nov 2021 17:21:06 +0100
Subject: [PATCH] Add some messy files and changes used for debugging

---
 CnnParameterFile_large.yaml | 22 +++++++++++++
 db_building/build_db.py     | 21 ++++++-----
 trash_debug_CNN.py          | 72 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 106 insertions(+), 9 deletions(-)
 create mode 100644 CnnParameterFile_large.yaml
 create mode 100644 trash_debug_CNN.py

diff --git a/CnnParameterFile_large.yaml b/CnnParameterFile_large.yaml
new file mode 100644
index 0000000..4088360
--- /dev/null
+++ b/CnnParameterFile_large.yaml
@@ -0,0 +1,22 @@
+#########################################
+#      CONVOLUTIONAL NEURAL NETWORK     #
+#            PARAMETER FILE             #
+#########################################
+
+
+# CNN ARCHITECTURE
+nn_class: Cnn_test  # Neural network class to use (nns/Cnn_test)
+batch_norm: 0  # Use batch normalization (0 = off)
+batch_size: 32  # Examples per training batch
+dropout_keep_prob: 0.69  # Keep probability for dropout
+eps_per_kmer_switch: 25
+max_sequence_length: 50000  # Maximum input sequence length
+filter_width: 1000
+filters: 6  # Number of convolutional filters
+kernel_size: 15  # Convolution kernel size
+learning_rate: 0.001  # Optimizer learning rate
+num_batches: 320
+num_kmer_switches: 1
+num_layers: 3  # Number of layers in the network
+pool_size: 8  # Pool size to use for 1d maxpool
+threshold: 0.0
diff --git a/db_building/build_db.py b/db_building/build_db.py
index f6720e0..15ea4d8 100644
--- a/db_building/build_db.py
+++ b/db_building/build_db.py
@@ -6,6 +6,7 @@ from os.path import isdir, dirname, basename, splitext
 from shutil import rmtree
 from pathlib import Path
 from random import shuffle
+import os
 
 __location__ = dirname(Path(__file__).resolve())
 sys.path.extend([__location__, f'{__location__}/..'])
@@ -19,9 +20,9 @@ def main(args):
     if isdir(out_path):
         rmtree(out_path)
     if args.read_index:
-        read_index_df = pd.read_csv(args.read_index, index_col=0)
+        read_index_df = pd.read_csv(args.read_index)
         if args.db_type == 'train':
-            file_list = list(read_index_df.query(f'fold').fn)
+            file_list = list(read_index_df.squeeze())
         else:  # test
             file_list = list(read_index_df.query('fold == False').fn)
     else:
@@ -37,16 +38,18 @@ def main(args):
     nb_files = len(file_list)
     count_pct_lim = 5
     for i, file in enumerate(file_list):
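+        # File names in the read index are relative to the fast5 input dir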
+        file = os.path.join(args.fast5_in, file)
         try:
             with h5py.File(file, 'r') as f:
-                # try:
-                    tr = TrainingRead(f, normalization=args.normalization,
-                                      hdf_path=args.hdf_path,
-                                      kmer_size=kmer_size)
-                    db.add_training_read(training_read=tr,
-                                         uncenter_kmer=args.uncenter_kmer)
+                tr = TrainingRead(f, normalization=args.normalization,
+                                  hdf_path=args.hdf_path,
+                                  kmer_size=kmer_size)
+                db.add_training_read(training_read=tr,
+                                     uncenter_kmer=args.uncenter_kmer)
             if args.store_example_reads:
-                np.savez(npz_path + splitext(basename(file))[0], base_labels=tr.events, raw=tr.raw)
+                np.savez(npz_path + splitext(basename(file))[0],
+                         base_labels=tr.events, raw=tr.raw)
             if not (i + 1) % 10:  # Every 10 reads remove history of transactions ('pack' the database) to reduce size
                 db.pack_db()
             if db.nb_pos > args.max_nb_examples:
diff --git a/trash_debug_CNN.py b/trash_debug_CNN.py
new file mode 100644
index 0000000..a3280f8
--- /dev/null
+++ b/trash_debug_CNN.py
@@ -0,0 +1,72 @@
+import h5py
+from db_building.TrainingRead import Read, TrainingRead
+from pathlib import Path
+from inference.InferenceModel import InferenceModel
+from nns.Cnn_test import NeuralNetwork
+
+def main():
+    # Hunt for positive read
+    input_length = 1000
+    target_kmer = 'AGGAGAGT'
+
+    # for file in Path('/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/demultiplexed_reads/files_for_initial_training/test').iterdir():
+    #     print(f'Scanning {file}')
+    #     with h5py.File(file, 'r') as h5_file:
+    #         try:
+    #             train_read = TrainingRead(h5_file, 'median',
+    #                                           'Analyses/RawGenomeCorrected_000', 8)
+    #             if [i for i in train_read.condensed_events
+    #                 if i[0] == target_kmer]:
+    #                 print(f"found in {file}")
+    #                 break
+    #         except KeyError as e:
+    #             print('Got keyerror, continuing')
+    #
+    # return
+
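+    # Reads used for debugging: the positive read contains the target
+    # k-mer, the negative read does not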
+    pos_read_path = Path(
+        '/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/demultiplexed_reads/files_for_initial_training/test/L0144169_20181212_FAK22428_MN19628_sequencing_run_16Srhizhome_2_99947_read_129882_ch_413_strand.fast5'
+    )
+
+    neg_read_path = Path(
+        '/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/demultiplexed_reads/files_for_initial_training/test/L0144169_20181212_FAK22428_MN19628_sequencing_run_16Srhizhome_2_99947_read_34522_ch_344_strand.fast5')
+    with h5py.File(pos_read_path, 'r') as f:
+        pos_read = Read(f, 'median')
+        pos_train_read = TrainingRead(f, 'median',
+                                      'Analyses/RawGenomeCorrected_000', 8)
+
+    with h5py.File(neg_read_path, 'r') as f:
+        neg_read = Read(f, 'median')
+        neg_train_read = TrainingRead(f, 'median',
+                                      'Analyses/RawGenomeCorrected_000', 8)
+
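+    # Split each raw signal into chunks of input_length samples (stride 10)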
+    split_pos_read = pos_read.get_split_raw_read(input_length, stride=10)
+    split_neg_read = neg_read.get_split_raw_read(input_length, stride=10)
+    split_pos_train_read = pos_train_read.get_split_raw_read(input_length,
+                                                             stride=10)
+    split_neg_train_read = neg_train_read.get_split_raw_read(input_length,
+                                                             stride=10)
+
+    # compiled_model = InferenceModel('/home/noord087/lustre_link/mscthesis/'
+    #                                 'baseless/baseless_2_on_16s/out_model_test'
+    #                                 '.tar')
+
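+    # Load the CNN for the target k-mer with previously trained weights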
+    cnn = NeuralNetwork(target=target_kmer,
+                        weights='/home/noord087/lustre_link/mscthesis/baseless/baseless_2_on_16s/nns/AGGAGAGT/nn.h5')
+
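+    # Ground truth: condensed events in the positive read that match the
+    # target k-mer, kept for inspection while debugging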
+    true_idx = [ce for ce in pos_train_read.condensed_events
+                if ce[0] == target_kmer]
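+    # Compare CNN predictions on the positive and negative reads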
+    cnn.predict(split_pos_read, target_kmer)
+    cnn.predict(split_neg_read, target_kmer)
+    cnn.predict(split_pos_train_read, target_kmer)
+    cnn.predict(split_neg_train_read, target_kmer)
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
-- 
GitLab