Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
B
baseLess
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Lannoy, Carlos de
baseLess
Commits
c3e2abd2
Commit
c3e2abd2
authored
3 years ago
by
Carlos de Lannoy
Browse files
Options
Downloads
Patches
Plain Diff
limit nb examples in dbs for validation
parent
3bd319d0
No related branches found
No related tags found
No related merge requests found
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
argparse_dicts.py
+12
-1
12 additions, 1 deletion
argparse_dicts.py
db_building/ExampleDb.py
+15
-12
15 additions, 12 deletions
db_building/ExampleDb.py
db_building/build_db.py
+5
-0
5 additions, 0 deletions
db_building/build_db.py
validation/validate_16S.sf
+6
-8
6 additions, 8 deletions
validation/validate_16S.sf
with
38 additions
and
21 deletions
argparse_dicts.py
+
12
−
1
View file @
c3e2abd2
...
...
@@ -148,6 +148,12 @@ db_dir = ('--db-dir', {
# --- parameters ---
max_nb_examples
=
(
'
--max-nb-examples
'
,
{
'
type
'
:
int
,
'
default
'
:
10000
,
'
help
'
:
'
Maximum number of examples to store in DB [default: 10000]
'
})
store_example_reads
=
(
'
--store-example-reads
'
,{
'
action
'
:
'
store_true
'
,
'
help
'
:
'
Additionally store reads as npzs
'
...
...
@@ -240,8 +246,13 @@ def get_run_production_pipeline_parser():
def
get_build_db_parser
():
parser
=
argparse
.
ArgumentParser
(
description
=
'
Create ZODB database of training reads from resquiggled fast5s,
'
'
for given target k-mer
'
)
randomize
=
(
'
--randomize
'
,
{
'
action
'
:
'
store_true
'
,
'
help
'
:
'
Randomize read order before adding examples
'
})
for
arg
in
(
fast5_in
,
db_dir
,
normalization
,
target
,
width
,
hdf_path
,
uncenter_kmer
,
read_index
,
db_type
,
silent
,
store_example_reads
):
uncenter_kmer
,
read_index
,
db_type
,
silent
,
store_example_reads
,
max_nb_examples
,
randomize
):
parser
.
add_argument
(
arg
[
0
],
**
arg
[
1
])
return
parser
...
...
This diff is collapsed.
Click to expand it.
db_building/ExampleDb.py
+
15
−
12
View file @
c3e2abd2
...
...
@@ -12,12 +12,14 @@ class ExampleDb(object):
self
.
_db
=
None
self
.
neg_kmers
=
dict
()
# Dict of lists, indices per encountered negative example k-mer
self
.
_db_empty
=
True
self
.
nb_pos
=
0
self
.
nb_neg
=
0
if
not
isfile
(
kwargs
[
'
db_name
'
]):
self
.
target
=
kwargs
[
'
target
'
]
self
.
width
=
kwargs
[
'
width
'
]
self
.
db_name
=
kwargs
[
'
db_name
'
]
self
.
read_only
=
kwargs
.
get
(
'
read_only
'
,
False
)
self
.
db
=
self
.
db_name
...
...
@@ -33,16 +35,17 @@ class ExampleDb(object):
:type uncenter_kmer: bool
"""
with
self
.
_db
.
transaction
()
as
conn
:
pos_examples
=
training_read
.
get_pos
(
self
.
target
,
self
.
width
,
uncenter_kmer
)
for
i
,
ex
in
enumerate
(
pos_examples
):
conn
.
root
.
pos
[
len
(
conn
.
root
.
pos
)]
=
ex
# conn.root.pos[self.nb_pos+i] = ex
# self.nb_pos = conn.root.pos.maxKey()
# TODO: arbitrarily adding 5x as much neg examples
neg_examples
,
neg_kmers
=
training_read
.
get_neg
(
self
.
target
,
self
.
width
,
len
(
pos_examples
)
*
5
)
# --- add positive examples (if any) ---
pos_examples
=
training_read
.
get_pos
(
self
.
target
,
self
.
width
,
uncenter_kmer
)
for
i
,
ex
in
enumerate
(
pos_examples
):
conn
.
root
.
pos
[
len
(
conn
.
root
.
pos
)]
=
ex
# --- update record nb positive examples ---
if
self
.
_db_empty
:
if
len
(
pos_examples
):
self
.
_db_empty
=
False
if
not
self
.
_db_empty
:
self
.
nb_pos
=
conn
.
root
.
pos
.
maxKey
()
# --- add negative examples ---
neg_examples
,
neg_kmers
=
training_read
.
get_neg
(
self
.
target
,
self
.
width
,
len
(
pos_examples
)
*
5
)
# arbitrarily adding 5x as much neg examples
for
i
,
ex
in
enumerate
(
neg_examples
):
if
neg_kmers
[
i
]
in
self
.
neg_kmers
:
self
.
neg_kmers
[
neg_kmers
[
i
]].
append
(
self
.
nb_neg
+
i
)
...
...
@@ -112,8 +115,6 @@ class ExampleDb(object):
:param db_name: name of new db, including path
"""
is_existing_db
=
isfile
(
db_name
)
# if is_existing_db:
# print("Opening db {db_name}".format(db_name=db_name))
storage
=
ZODB
.
FileStorage
.
FileStorage
(
db_name
,
read_only
=
self
.
read_only
)
self
.
_db
=
ZODB
.
DB
(
storage
)
if
is_existing_db
:
...
...
@@ -129,3 +130,5 @@ class ExampleDb(object):
conn
.
root
.
width
=
self
.
width
conn
.
root
.
pos
=
BTrees
.
IOBTree
.
BTree
()
conn
.
root
.
neg
=
BTrees
.
IOBTree
.
BTree
()
if
self
.
nb_pos
>
0
:
self
.
_db_empty
=
False
This diff is collapsed.
Click to expand it.
db_building/build_db.py
+
5
−
0
View file @
c3e2abd2
...
...
@@ -5,6 +5,7 @@ import h5py
from
os.path
import
isdir
,
dirname
,
basename
,
splitext
from
shutil
import
rmtree
from
pathlib
import
Path
from
random
import
shuffle
__location__
=
dirname
(
Path
(
__file__
).
resolve
())
sys
.
path
.
extend
([
__location__
,
f
'
{
__location__
}
/..
'
])
...
...
@@ -25,6 +26,7 @@ def main(args):
file_list
=
list
(
read_index_df
.
query
(
f
'
fold == False
'
).
fn
)
else
:
file_list
=
parse_input_path
(
args
.
fast5_in
,
pattern
=
'
*.fast5
'
)
if
args
.
randomize
:
shuffle
(
file_list
)
db_name
=
out_path
+
'
db.fs
'
error_fn
=
out_path
+
'
failed_reads.txt
'
npz_path
=
out_path
+
'
test_squiggles/
'
...
...
@@ -47,6 +49,9 @@ def main(args):
np
.
savez
(
npz_path
+
splitext
(
basename
(
file
))[
0
],
base_labels
=
tr
.
events
,
raw
=
tr
.
raw
)
if
not
i
+
1
%
10
:
# Every 10 reads remove history of transactions ('pack' the database) to reduce size
db
.
pack_db
()
if
db
.
nb_pos
>
args
.
max_nb_examples
:
print
(
'
Max number of examples reached
'
)
break
percentage_processed
=
int
(
(
i
+
1
)
/
nb_files
*
100
)
if
not
args
.
silent
and
percentage_processed
>=
count_pct_lim
:
print
(
f
'
{
percentage_processed
}
% of reads processed,
{
db
.
nb_pos
}
positives in DB
'
)
...
...
This diff is collapsed.
Click to expand it.
validation/validate_16S.sf
+
6
−
8
View file @
c3e2abd2
...
...
@@ -131,10 +131,9 @@ rule generate_training_db:
--target {wildcards.target} \
--width {filter_width} \
--hdf-path {hdf_path} \
--silent \
{% if uncenter_kmer %}
--uncenter-kmer \
{% endif %}
--randomize \
--silent{% if uncenter_kmer %} --uncenter-kmer {% endif %} \
--max-nb-examples 10000 \
&> {logs_dir}db_train_{wildcards.target}.log
"""
...
...
@@ -157,9 +156,8 @@ rule generate_test_db:
--target {wildcards.target} \
--width {filter_width} \
--hdf-path {hdf_path} \
--silent \
{% if uncenter_kmer %}
--uncenter-kmer \
{% endif %}
--randomize \
--silent{% if uncenter_kmer %} --uncenter-kmer {% endif %} \
--max-nb-examples 1000 \
&> {logs_dir}db_test_{wildcards.target}.log
"""
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment