Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
B
baseLess
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Lannoy, Carlos de
baseLess
Commits
b4da7db6
Commit
b4da7db6
authored
3 years ago
by
Noordijk, Ben
Browse files
Options
Downloads
Patches
Plain Diff
Some refactoring and CNN parameter improvements also merged master
parent
4ae91e05
No related branches found
Branches containing commit
No related tags found
1 merge request
!1
All MSc thesis work so far, mainly on CNNs, K-mer design and data preprocessing
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
CnnParameterFile.yaml
+5
-5
5 additions, 5 deletions
CnnParameterFile.yaml
nns/Cnn_test.py
+37
-33
37 additions, 33 deletions
nns/Cnn_test.py
with
42 additions
and
38 deletions
CnnParameterFile.yaml
+
5
−
5
View file @
b4da7db6
#########################################
#
BIDIRECTIONAL RECURRENT
NEURAL NETWORK#
#
CONVOLUTIONAL
NEURAL NETWORK
#
# PARAMETER FILE #
#########################################
...
...
@@ -7,12 +7,12 @@
nn_class
:
Cnn_test
batch_size
:
32
eps_per_kmer_switch
:
20
max_sequence_length
:
10
00
# Only for example reads
filter_width
:
10
0
max_sequence_length
:
5
00
# Only for example reads
kernel_size
:
10
threshold
:
0.95
num_batches
:
320
learning_rate
:
0.
0
01
num_kmer_switches
:
1
0
learning_rate
:
0.01
num_kmer_switches
:
1
dropout_keep_prob
:
0.5
This diff is collapsed.
Click to expand it.
nns/Cnn_test.py
+
37
−
33
View file @
b4da7db6
...
...
@@ -12,22 +12,23 @@ class NeuralNetwork(object):
:param target: The target k-mer that the NN should recognise
:type target: str
:param
filter_width: TODO what does this do? Like k
ernel size
in
CNN
:type
filter_width
: int
:param
kernel_size: K
ernel size
of
CNN
:type
kernel_size
: int
:param weights: Initial weights to use for the neural network
:param max_sequence_length: Length to which pad or truncate the sequences
:type max_sequence_length: int
:param batch_size: Batch size to use during training
:param threshold: Assign label to TRUE if probability above this threshold
:param eps_per_kmer_switch: Number of epochs to run during training(?)
when doing prediction
:param eps_per_kmer_switch: Number of epochs to run
"""
def
__init__
(
self
,
**
kwargs
):
# Ensure these attributes are in. Additional attributes may be defined
self
.
target
=
kwargs
[
'
target
'
]
self
.
filter_width
=
kwargs
[
'
filter_width
'
]
self
.
hfw
=
(
self
.
filter_width
-
1
)
//
2
# half filter width
self
.
kernel_size
=
kwargs
[
'
kernel_size
'
]
self
.
hfw
=
(
self
.
kernel_size
-
1
)
//
2
# half filter width
self
.
max_sequence_length
=
kwargs
[
'
max_sequence_length
'
]
self
.
batch_size
=
kwargs
[
'
batch_size
'
]
self
.
threshold
=
kwargs
[
'
threshold
'
]
...
...
@@ -44,8 +45,8 @@ class NeuralNetwork(object):
"""
self
.
model
=
models
.
Sequential
()
self
.
model
.
add
(
layers
.
Conv1D
(
5
,
kernel_size
=
10
,
activation
=
'
relu
'
,
input_shape
=
(
1000
,
1
)))
self
.
model
.
add
(
layers
.
Conv1D
(
5
,
kernel_size
=
self
.
kernel_size
,
activation
=
'
relu
'
,
input_shape
=
(
self
.
max_sequence_length
,
1
)))
# self.model.add(layers.MaxPool1D(2)) # Might use this later
self
.
model
.
add
(
layers
.
Flatten
())
self
.
model
.
add
(
layers
.
Dense
(
1
,
activation
=
'
sigmoid
'
))
...
...
@@ -54,11 +55,15 @@ class NeuralNetwork(object):
metrics
=
[
'
BinaryAccuracy
'
,
'
Precision
'
,
'
Recall
'
])
def
train
(
self
,
x
,
y
,
x_val
,
y_val
,
eps_per_kmer_switch
=
100
):
"""
Train the network. x_val/y_val may be used for validation/early
"""
Train the network. x_val/y_val may be used for validation/early
stopping mechanisms.
:param x: Input reads
:param y: Ground truth labels of the read
:param x_val: Input reads to use for validation
:param y_val: Ground truth reads to use for validation
"""
# Pad input sequences
TODO why post padding and truncating? Would doing it at the start work too?
# Pad input sequences
x_pad
=
np
.
expand_dims
(
pad_sequences
(
x
,
maxlen
=
self
.
max_sequence_length
,
padding
=
'
post
'
,
truncating
=
'
post
'
),
-
1
)
x_val_pad
=
np
.
expand_dims
(
pad_sequences
(
x_val
,
maxlen
=
self
.
max_sequence_length
,
...
...
@@ -69,29 +74,28 @@ class NeuralNetwork(object):
reshuffle_each_iteration
=
True
)
# Train the model
self
.
model
.
fit
(
tfd
,
epochs
=
self
.
eps_per_kmer_switch
,
validation_data
=
(
x_val_pad
,
y_val
))
\
# , validation_steps=10,
# validation_freq=max(self.eps_per_kmer_switch,
# self.eps_per_kmer_switch // 10))
for
hv
in
self
.
model
.
history
.
history
:
# TODO what does this do?
validation_data
=
(
x_val_pad
,
y_val
))
for
hv
in
self
.
model
.
history
.
history
:
self
.
history
[
hv
].
extend
(
self
.
model
.
history
.
history
[
hv
])
def
predict
(
self
,
x
,
clean_signal
=
True
,
return_probs
=
False
):
offset
=
5
ho
=
offset
//
2
lb
,
rb
=
self
.
hfw
-
ho
,
self
.
hfw
+
ho
+
1
idx
=
np
.
arange
(
self
.
filter_width
,
len
(
x
)
+
offset
,
offset
)
x_batched
=
[
x
[
si
:
ei
]
for
si
,
ei
in
zip
(
idx
-
100
,
idx
)]
x_pad
=
pad_sequences
(
x_batched
,
padding
=
'
post
'
,
dtype
=
'
float32
'
)
posteriors
=
self
.
model
.
predict
(
x_pad
)
y_hat
=
posteriors
>
self
.
threshold
y_out
=
np
.
zeros
(
len
(
x
),
dtype
=
int
)
for
i
,
yh
in
enumerate
(
y_hat
):
y_out
[
lb
+
i
*
offset
:
rb
+
i
*
offset
]
=
yh
# todo include clean signal
if
return_probs
:
posteriors_out
=
np
.
zeros
(
len
(
x
),
dtype
=
float
)
for
i
,
p
in
enumerate
(
posteriors
):
posteriors_out
[
lb
+
i
*
offset
:
rb
+
i
*
offset
]
=
p
return
y_out
,
posteriors_out
return
y_out
# TODO look into this
pass
# offset = 5
# ho = offset // 2
# lb, rb = self.hfw - ho, self.hfw + ho + 1
# idx = np.arange(self.kernel_size, len(x) + offset, offset)
# x_batched = [x[si:ei] for si, ei in zip(idx-100, idx)]
# x_pad = pad_sequences(x_batched, padding='post', dtype='float32')
# posteriors = self.model.predict(x_pad)
# y_hat = posteriors > self.threshold
# y_out = np.zeros(len(x), dtype=int)
# for i, yh in enumerate(y_hat):
# y_out[lb + i * offset :rb + i * offset] = yh
# # todo include clean signal
# if return_probs:
# posteriors_out = np.zeros(len(x), dtype=float)
# for i, p in enumerate(posteriors):
# posteriors_out[lb + i * offset :rb + i * offset] = p
# return y_out, posteriors_out
# return y_out
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment