Skip to content
Snippets Groups Projects
Commit c3e2abd2 authored by Carlos de Lannoy's avatar Carlos de Lannoy
Browse files

limit nb examples in dbs for validation

parent 3bd319d0
No related branches found
No related tags found
No related merge requests found
......@@ -148,6 +148,12 @@ db_dir = ('--db-dir', {
# --- parameters ---
# Option spec for the example cap: a (flag, keyword-arguments) pair.
# The integer value defaults to 10000, as stated in the help text.
max_nb_examples = (
    '--max-nb-examples',
    dict(
        type=int,
        default=10000,
        help='Maximum number of examples to store in DB [default: 10000]',
    ),
)
store_example_reads = ('--store-example-reads',{
'action': 'store_true',
'help': 'Additionally store reads as npzs'
......@@ -240,8 +246,13 @@ def get_run_production_pipeline_parser():
def get_build_db_parser():
    """Build the command-line parser for creating a ZODB training-read database.

    Every option is described by a ``(flag, kwargs)`` tuple that is unpacked
    into ``ArgumentParser.add_argument``. All specs except ``--randomize``
    are looked up by name from the enclosing module.

    :return: parser with all database-building options registered
    :rtype: argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser(description='Create ZODB database of training reads from resquiggled fast5s, '
                                                 'for given target k-mer')
    # Defined locally rather than with the other option specs.
    randomize = ('--randomize', {
        'action': 'store_true',
        'help': 'Randomize read order before adding examples'
    })
    # Single, well-formed option tuple: the stale tuple tail that closed the
    # list right after ``store_example_reads`` is dropped so that
    # ``max_nb_examples`` and ``randomize`` are actually registered.
    for arg in (fast5_in, db_dir, normalization, target, width, hdf_path,
                uncenter_kmer, read_index, db_type, silent, store_example_reads,
                max_nb_examples, randomize):
        parser.add_argument(arg[0], **arg[1])
    return parser
......
......@@ -12,12 +12,14 @@ class ExampleDb(object):
self._db = None
self.neg_kmers = dict() # Dict of lists, indices per encountered negative example k-mer
self._db_empty = True
self.nb_pos = 0
self.nb_neg = 0
if not isfile(kwargs['db_name']):
self.target = kwargs['target']
self.width = kwargs['width']
self.db_name = kwargs['db_name']
self.read_only = kwargs.get('read_only', False)
self.db = self.db_name
......@@ -33,16 +35,17 @@ class ExampleDb(object):
:type uncenter_kmer: bool
"""
with self._db.transaction() as conn:
pos_examples = training_read.get_pos(self.target, self.width,
uncenter_kmer)
for i, ex in enumerate(pos_examples):
conn.root.pos[len(conn.root.pos)] = ex
# conn.root.pos[self.nb_pos+i] = ex
# self.nb_pos = conn.root.pos.maxKey()
# TODO: arbitrarily adding 5x as much neg examples
neg_examples, neg_kmers = training_read.get_neg(self.target,
self.width,
len(pos_examples) * 5)
# --- add positive examples (if any) ---
pos_examples = training_read.get_pos(self.target, self.width, uncenter_kmer)
for i, ex in enumerate(pos_examples): conn.root.pos[len(conn.root.pos)] = ex
# --- update record nb positive examples ---
if self._db_empty:
if len(pos_examples):
self._db_empty = False
if not self._db_empty:
self.nb_pos = conn.root.pos.maxKey()
# --- add negative examples ---
neg_examples, neg_kmers = training_read.get_neg(self.target, self.width, len(pos_examples) * 5) # arbitrarily adding 5x as much neg examples
for i, ex in enumerate(neg_examples):
if neg_kmers[i] in self.neg_kmers:
self.neg_kmers[neg_kmers[i]].append(self.nb_neg + i)
......@@ -112,8 +115,6 @@ class ExampleDb(object):
:param db_name: name of new db, including path
"""
is_existing_db = isfile(db_name)
# if is_existing_db:
# print("Opening db {db_name}".format(db_name=db_name))
storage = ZODB.FileStorage.FileStorage(db_name, read_only=self.read_only)
self._db = ZODB.DB(storage)
if is_existing_db:
......@@ -129,3 +130,5 @@ class ExampleDb(object):
conn.root.width = self.width
conn.root.pos = BTrees.IOBTree.BTree()
conn.root.neg = BTrees.IOBTree.BTree()
if self.nb_pos > 0:
self._db_empty = False
......@@ -5,6 +5,7 @@ import h5py
from os.path import isdir, dirname, basename, splitext
from shutil import rmtree
from pathlib import Path
from random import shuffle
__location__ = dirname(Path(__file__).resolve())
sys.path.extend([__location__, f'{__location__}/..'])
......@@ -25,6 +26,7 @@ def main(args):
file_list = list(read_index_df.query(f'fold == False').fn)
else:
file_list = parse_input_path(args.fast5_in, pattern='*.fast5')
if args.randomize: shuffle(file_list)
db_name = out_path+'db.fs'
error_fn = out_path+'failed_reads.txt'
npz_path = out_path + 'test_squiggles/'
......@@ -47,6 +49,9 @@ def main(args):
np.savez(npz_path + splitext(basename(file))[0], base_labels=tr.events, raw=tr.raw)
if not i+1 % 10: # Every 10 reads remove history of transactions ('pack' the database) to reduce size
db.pack_db()
if db.nb_pos > args.max_nb_examples:
print('Max number of examples reached')
break
percentage_processed = int( (i+1) / nb_files * 100)
if not args.silent and percentage_processed >= count_pct_lim:
print(f'{percentage_processed}% of reads processed, {db.nb_pos} positives in DB')
......
......@@ -131,10 +131,9 @@ rule generate_training_db:
--target {wildcards.target} \
--width {filter_width} \
--hdf-path {hdf_path} \
--silent \
{% if uncenter_kmer %}
--uncenter-kmer \
{% endif %}
--randomize \
--silent{% if uncenter_kmer %} --uncenter-kmer {% endif %} \
--max-nb-examples 10000 \
&> {logs_dir}db_train_{wildcards.target}.log
"""
......@@ -157,9 +156,8 @@ rule generate_test_db:
--target {wildcards.target} \
--width {filter_width} \
--hdf-path {hdf_path} \
--silent \
{% if uncenter_kmer %}
--uncenter-kmer \
{% endif %}
--randomize \
--silent{% if uncenter_kmer %} --uncenter-kmer {% endif %} \
--max-nb-examples 1000 \
&> {logs_dir}db_test_{wildcards.target}.log
"""
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment