Skip to content
Snippets Groups Projects
Commit b4da7db6 authored by Noordijk, Ben
Browse files

Some refactoring and CNN parameter improvements also merged master

parent 4ae91e05
No related branches found
No related tags found
1 merge request: !1 "All MSc thesis work so far, mainly on CNNs, K-mer design and data preprocessing"
#########################################
# BIDIRECTIONAL RECURRENT NEURAL NETWORK#
# CONVOLUTIONAL NEURAL NETWORK #
# PARAMETER FILE #
#########################################
......@@ -7,12 +7,12 @@
nn_class: Cnn_test
batch_size: 32
eps_per_kmer_switch: 20
max_sequence_length: 1000 # Only for example reads
filter_width: 100
max_sequence_length: 500 # Only for example reads
kernel_size: 10
threshold: 0.95
num_batches: 320
learning_rate: 0.001
num_kmer_switches: 10
learning_rate: 0.01
num_kmer_switches: 1
dropout_keep_prob: 0.5
......@@ -12,22 +12,23 @@ class NeuralNetwork(object):
:param target: The target k-mer that the NN should recognise
:type target: str
:param filter_width: TODO what does this do? Like kernel size in CNN
:type filter_width: int
:param kernel_size: Kernel size of CNN
:type kernel_size: int
:param weights: Initial weights to use for the neural network
:param max_sequence_length: Length to which pad or truncate the sequences
:type max_sequence_length: int
:param batch_size: Batch size to use during training
:param threshold: Assign label to TRUE if probability above this threshold
:param eps_per_kmer_switch: Number of epochs to run during training(?)
when doing prediction
:param eps_per_kmer_switch: Number of epochs to run
"""
def __init__(self, **kwargs):
# Ensure these attributes are in. Additional attributes may be defined
self.target = kwargs['target']
self.filter_width = kwargs['filter_width']
self.hfw = (self.filter_width - 1) // 2 # half filter width
self.kernel_size = kwargs['kernel_size']
self.hfw = (self.kernel_size - 1) // 2 # half filter width
self.max_sequence_length = kwargs['max_sequence_length']
self.batch_size = kwargs['batch_size']
self.threshold = kwargs['threshold']
......@@ -44,8 +45,8 @@ class NeuralNetwork(object):
"""
self.model = models.Sequential()
self.model.add(layers.Conv1D(5, kernel_size=10, activation='relu',
input_shape=(1000, 1)))
self.model.add(layers.Conv1D(5, kernel_size=self.kernel_size, activation='relu',
input_shape=(self.max_sequence_length, 1)))
# self.model.add(layers.MaxPool1D(2)) # Might use this later
self.model.add(layers.Flatten())
self.model.add(layers.Dense(1, activation='sigmoid'))
......@@ -54,11 +55,15 @@ class NeuralNetwork(object):
metrics=['BinaryAccuracy', 'Precision', 'Recall'])
def train(self, x, y, x_val, y_val, eps_per_kmer_switch=100):
"""
Train the network. x_val/y_val may be used for validation/early
"""Train the network. x_val/y_val may be used for validation/early
stopping mechanisms.
:param x: Input reads
:param y: Ground truth labels of the read
:param x_val: Input reads to use for validation
:param y_val: Ground truth reads to use for validation
"""
# Pad input sequences TODO why post padding and truncating? Would doing it at the start work too?
# Pad input sequences
x_pad = np.expand_dims(pad_sequences(x, maxlen=self.max_sequence_length,
padding='post', truncating='post'), -1)
x_val_pad = np.expand_dims(pad_sequences(x_val, maxlen=self.max_sequence_length,
......@@ -69,29 +74,28 @@ class NeuralNetwork(object):
reshuffle_each_iteration=True)
# Train the model
self.model.fit(tfd, epochs=self.eps_per_kmer_switch,
validation_data=(x_val_pad, y_val))\
# , validation_steps=10,
# validation_freq=max(self.eps_per_kmer_switch,
# self.eps_per_kmer_switch // 10))
for hv in self.model.history.history: # TODO what does this do?
validation_data=(x_val_pad, y_val))
for hv in self.model.history.history:
self.history[hv].extend(self.model.history.history[hv])
def predict(self, x, clean_signal=True, return_probs=False):
offset = 5
ho = offset // 2
lb, rb = self.hfw - ho, self.hfw + ho + 1
idx = np.arange(self.filter_width, len(x) + offset, offset)
x_batched = [x[si:ei] for si, ei in zip(idx-100, idx)]
x_pad = pad_sequences(x_batched, padding='post', dtype='float32')
posteriors = self.model.predict(x_pad)
y_hat = posteriors > self.threshold
y_out = np.zeros(len(x), dtype=int)
for i, yh in enumerate(y_hat):
y_out[lb + i * offset :rb + i * offset] = yh
# todo include clean signal
if return_probs:
posteriors_out = np.zeros(len(x), dtype=float)
for i, p in enumerate(posteriors):
posteriors_out[lb + i * offset :rb + i * offset] = p
return y_out, posteriors_out
return y_out
# TODO look into this
pass
# offset = 5
# ho = offset // 2
# lb, rb = self.hfw - ho, self.hfw + ho + 1
# idx = np.arange(self.kernel_size, len(x) + offset, offset)
# x_batched = [x[si:ei] for si, ei in zip(idx-100, idx)]
# x_pad = pad_sequences(x_batched, padding='post', dtype='float32')
# posteriors = self.model.predict(x_pad)
# y_hat = posteriors > self.threshold
# y_out = np.zeros(len(x), dtype=int)
# for i, yh in enumerate(y_hat):
# y_out[lb + i * offset :rb + i * offset] = yh
# # todo include clean signal
# if return_probs:
# posteriors_out = np.zeros(len(x), dtype=float)
# for i, p in enumerate(posteriors):
# posteriors_out[lb + i * offset :rb + i * offset] = p
# return y_out, posteriors_out
# return y_out
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment