From 818f3dd15b06cfc5d41f81c2f34beba358d39668 Mon Sep 17 00:00:00 2001 From: Ben Noordijk <ben.noordijk@wur.nl> Date: Tue, 9 Nov 2021 17:21:06 +0100 Subject: [PATCH] Some bloody messy files and changes that were used for debugging --- CnnParameterFile_large.yaml | 22 +++++++++++++ db_building/build_db.py | 20 +++++++----- trash_debug_CNN.py | 65 +++++++++++++++++++++++++++++++++++++ 3 files changed, 98 insertions(+), 9 deletions(-) create mode 100644 CnnParameterFile_large.yaml create mode 100644 trash_debug_CNN.py diff --git a/CnnParameterFile_large.yaml b/CnnParameterFile_large.yaml new file mode 100644 index 0000000..4088360 --- /dev/null +++ b/CnnParameterFile_large.yaml @@ -0,0 +1,22 @@ +######################################### +# CONVOLUTIONAL NEURAL NETWORK # +# PARAMETER FILE # +######################################### + + +# CNN ARCHITECTURE +nn_class: Cnn_test +batch_norm: 0 +batch_size: 32 +dropout_keep_prob: 0.69 +eps_per_kmer_switch: 25 +max_sequence_length: 50000 +filter_width: 1000 +filters: 6 +kernel_size: 15 +learning_rate: 0.001 +num_batches: 320 +num_kmer_switches: 1 +num_layers: 3 +pool_size: 8 # Pool size to use for 1d maxpool +threshold: 0.0 diff --git a/db_building/build_db.py b/db_building/build_db.py index f6720e0..15ea4d8 100644 --- a/db_building/build_db.py +++ b/db_building/build_db.py @@ -6,6 +6,7 @@ from os.path import isdir, dirname, basename, splitext from shutil import rmtree from pathlib import Path from random import shuffle +import os __location__ = dirname(Path(__file__).resolve()) sys.path.extend([__location__, f'{__location__}/..']) @@ -19,9 +20,9 @@ def main(args): if isdir(out_path): rmtree(out_path) if args.read_index: - read_index_df = pd.read_csv(args.read_index, index_col=0) + read_index_df = pd.read_csv(args.read_index) if args.db_type == 'train': - file_list = list(read_index_df.query(f'fold').fn) + file_list = list(read_index_df.squeeze()) else: # test file_list = list(read_index_df.query(f'fold == False').fn) else: @@ -37,16 +38,17 @@ def main(args): nb_files = len(file_list) count_pct_lim = 5 for i, file in enumerate(file_list): + file = os.path.join(args.fast5_in, file) try: with h5py.File(file, 'r') as f: - # try: - tr = TrainingRead(f, normalization=args.normalization, - hdf_path=args.hdf_path, - kmer_size=kmer_size) - db.add_training_read(training_read=tr, - uncenter_kmer=args.uncenter_kmer) + tr = TrainingRead(f, normalization=args.normalization, + hdf_path=args.hdf_path, + kmer_size=kmer_size) + db.add_training_read(training_read=tr, + uncenter_kmer=args.uncenter_kmer) if args.store_example_reads: - np.savez(npz_path + splitext(basename(file))[0], base_labels=tr.events, raw=tr.raw) + np.savez(npz_path + splitext(basename(file))[0], + base_labels=tr.events, raw=tr.raw) if not i+1 % 10: # Every 10 reads remove history of transactions ('pack' the database) to reduce size db.pack_db() if db.nb_pos > args.max_nb_examples: diff --git a/trash_debug_CNN.py b/trash_debug_CNN.py new file mode 100644 index 0000000..a3280f8 --- /dev/null +++ b/trash_debug_CNN.py @@ -0,0 +1,65 @@ +import h5py +from db_building.TrainingRead import Read, TrainingRead +from pathlib import Path +from inference.InferenceModel import InferenceModel +from nns.Cnn_test import NeuralNetwork + +def main(): + # Hunt for positive read + input_length = 1000 + target_kmer = 'AGGAGAGT' + + # for file in Path('/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/demultiplexed_reads/files_for_initial_training/test').iterdir(): + # print(f'Scanning {file}') + # with h5py.File(file, 'r') as h5_file: + # try: + # train_read = TrainingRead(h5_file, 'median', + # 'Analyses/RawGenomeCorrected_000', 8) + # if [i for i in train_read.condensed_events + # if i[0] == target_kmer]: + # print(f"found in {file}") + # break + # except KeyError as e: + # print('Got keyerror, continuing') + # + # return + + pos_read_path = Path( + '/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/demultiplexed_reads/files_for_initial_training/test/L0144169_20181212_FAK22428_MN19628_sequencing_run_16Srhizhome_2_99947_read_129882_ch_413_strand.fast5' + ) + + neg_read_path = Path( + '/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/demultiplexed_reads/files_for_initial_training/test/L0144169_20181212_FAK22428_MN19628_sequencing_run_16Srhizhome_2_99947_read_34522_ch_344_strand.fast5') + with h5py.File(pos_read_path, 'r') as f: + pos_read = Read(f, 'median') + pos_train_read = TrainingRead(f, 'median', + 'Analyses/RawGenomeCorrected_000', 8) + + with h5py.File(neg_read_path, 'r') as f: + neg_read = Read(f, 'median') + neg_train_read = TrainingRead(f, 'median', + 'Analyses/RawGenomeCorrected_000', 8) + + split_pos_read = pos_read.get_split_raw_read(input_length, stride=10) + split_neg_read = neg_read.get_split_raw_read(input_length, stride=10) + split_pos_train_read = pos_train_read.get_split_raw_read(input_length, + stride=10) + split_neg_train_read = neg_train_read.get_split_raw_read(input_length, + stride=10) + + # compiled_model = InferenceModel('/home/noord087/lustre_link/mscthesis/' + # 'baseless/baseless_2_on_16s/out_model_test' + # '.tar') + + cnn = NeuralNetwork(target=target_kmer, weights='/home/noord087/lustre_link/mscthesis/baseless/baseless_2_on_16s/nns/AGGAGAGT/nn.h5') + + true_idx = [ce for ce in pos_train_read.condensed_events + if ce[0] == target_kmer] + cnn.predict(split_pos_read, target_kmer) + cnn.predict(split_neg_read, target_kmer) + cnn.predict(split_pos_train_read, target_kmer) + cnn.predict(split_neg_train_read, target_kmer) + + +if __name__ == '__main__': + main() \ No newline at end of file -- GitLab