Skip to content
Snippets Groups Projects
Commit 9e3f1bca authored by Carlos de Lannoy's avatar Carlos de Lannoy
Browse files

update

parent 0c86cbf5
No related branches found
No related tags found
No related merge requests found
.*
__pycache__/*
*/__pycache__/*
!/.gitignore
conda_env.yaml 100644 → 100755
File mode changed from 100644 to 100755
......@@ -27,19 +27,24 @@ def main(args):
# NOTE(review): this is the OLD side of the diff hunk (lines removed by the
# commit); the replacement version follows below. Leading indentation was lost
# when this page was scraped, so the lines appear flat.
nb_files = len(file_list)
count_pct_lim = 5
for i, file in enumerate(file_list):
with h5py.File(file, 'r') as f:
try:
tr = TrainingRead(f, normalization=args.normalization, hdf_path=args.hdf_path, kmer_size=kmer_size)
db.add_training_read(training_read=tr)
except ValueError as e:
# Failed reads are appended to an error file (filename + error) and skipped.
with open(error_fn, 'a') as efn:
efn.write('{fn}\t{err}\n'.format(err=e, fn=basename(file)))
continue
np.savez(npz_path + splitext(basename(file))[0], base_labels=tr.events, raw=tr.raw)
# NOTE(review): `not i+1 % 10` parses as `not (i + (1 % 10))` == `not (i + 1)`,
# which is never true for i >= 0 — the intended "every 10 reads" pack never fires.
if not i+1 % 10:  # Every 10 reads remove history of transactions ('pack' the database) to reduce size
db.pack_db()
percentage_processed = int( (i+1) / nb_files * 100)
if percentage_processed > count_pct_lim:
print(f'{percentage_processed}% of reads processed, {db.nb_pos} positives and {db.nb_neg} negatives in DB')
count_pct_lim += 5
try:
with h5py.File(file, 'r') as f:
# try:
tr = TrainingRead(f, normalization=args.normalization, hdf_path=args.hdf_path, kmer_size=kmer_size)
db.add_training_read(training_read=tr)
# except ValueError as e:
# with open(error_fn, 'a') as efn:
# efn.write('{fn}\t{err}\n'.format(err=e, fn=basename(file)))
# continue
np.savez(npz_path + splitext(basename(file))[0], base_labels=tr.events, raw=tr.raw)
if not i+1 % 10: # Every 10 reads remove history of transactions ('pack' the database) to reduce size
db.pack_db()
percentage_processed = int( (i+1) / nb_files * 100)
if percentage_processed > count_pct_lim:
print(f'{percentage_processed}% of reads processed, {db.nb_pos} positives and {db.nb_neg} negatives in DB')
count_pct_lim += 5
except Exception as e:
with open(error_fn, 'a') as efn:
efn.write('{fn}\t{err}\n'.format(err=e, fn=basename(file)))
continue
db.pack_db()
env.yaml 100644 → 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
from hyperopt import hp
def objective(args):
    """Toy hyperopt loss: identity for 'case 1', squared value otherwise.

    args is a (case_label, value) pair as produced by the hp.choice space.
    """
    label, value = args
    return value if label == 'case 1' else value ** 2
# Search space: branch 'a' picks one of two cases, each carrying its own
# hyperparameter (a shifted lognormal, or a uniform draw on [-10, 10]).
space = hp.choice(
    'a',
    [
        ('case 1', 1 + hp.lognormal('c1', 0, 1)),
        ('case 2', hp.uniform('c2', -10, 10)),
    ])

# Minimise the objective over the space using TPE.
from hyperopt import fmin, tpe, space_eval

best = fmin(objective, space, algo=tpe.suggest, max_evals=100)
print(best)
print(space_eval(space, best))
File mode changed from 100644 to 100755
# VARIABLE PARAMETERS
# Hyperparameter search configuration. Each entry under `variable` defines a
# search range (min/max, step granularity, and value type); entries under
# `nonvariable` are fixed for every trial.
# FIX: the scrape flattened all indentation, making the YAML invalid — the
# nesting below is restored from the obvious key structure.
variable:
  layer_size:
    min: 8
    max: 16
    step: 4
    type: int
  num_layers:
    min: 1
    max: 2
    step: 1
    type: int
  filter_width:
    min: 30
    max: 150
    step: 10
    type: int
  batch_size:
    min: 8
    max: 16
    step: 8
    type: int
  num_batches:
    min: 5
    max: 15
    step: 10
    type: int
  num_kmer_switches:
    min: 2
    max: 3
    step: 1
    type: int
  eps_per_kmer_switch:
    min: 1
    max: 5
    step: 1
    type: int
  learning_rate:
    min: 0.0001
    max: 0.01
    step: 0.0001
    type: float
  threshold:
    min: 0.5
    max: 0.95
    step: 0.05
    type: float
# FIXED PARAMETERS
nonvariable:
  max_sequence_length: 1000
  dropout_keep_prob: 0.5
  cell_type: GRU
  name_optimizer: adam
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment