Some refactoring and CNN parameter improvements; also merged master

b4da7db6 · Noordijk, Ben · 4ae91e05 · b4da7db6 · b4da7db6
Commit b4da7db6 authored 3 years ago by Noordijk, Ben
--- a/CnnParameterFile.yaml
+++ b/CnnParameterFile.yaml
 #########################################
-# BIDIRECTIONAL RECURRENT NEURAL NETWORK#
+#      CONVOLUTIONAL NEURAL NETWORK     #
 #            PARAMETER FILE             #
 #########################################

@@ -7,12 +7,12 @@
 nn_class: Cnn_test
 batch_size: 32
 eps_per_kmer_switch: 20
-max_sequence_length: 1000  # Only for example reads
-filter_width: 100
+max_sequence_length: 500  # Only for example reads
+kernel_size: 10
 threshold: 0.95
 num_batches: 320
-learning_rate: 0.001
-num_kmer_switches: 10
+learning_rate: 0.01
+num_kmer_switches: 1
 dropout_keep_prob: 0.5


--- a/nns/Cnn_test.py
+++ b/nns/Cnn_test.py
@@ -12,22 +12,23 @@ class NeuralNetwork(object):

    :param target: The target k-mer that the NN should recognise
    :type target: str
-    :param filter_width: TODO what does this do? Like kernel size in CNN
-    :type filter_width: int
+    :param kernel_size: Kernel size of CNN
+    :type kernel_size: int
    :param weights: Initial weights to use for the neural network
    :param max_sequence_length: Length to which pad or truncate the sequences
    :type max_sequence_length: int
    :param batch_size: Batch size to use during training
    :param threshold: Assign label to TRUE if probability above this threshold
-    :param eps_per_kmer_switch: Number of epochs to run during training(?)
+                      when doing prediction
+    :param eps_per_kmer_switch: Number of epochs to run
    """

    def __init__(self, **kwargs):
        # Ensure these attributes are in. Additional attributes may be defined

        self.target = kwargs['target']
-        self.filter_width = kwargs['filter_width']
-        self.hfw = (self.filter_width - 1) // 2  # half filter width
+        self.kernel_size = kwargs['kernel_size']
+        self.hfw = (self.kernel_size - 1) // 2  # half filter width
        self.max_sequence_length = kwargs['max_sequence_length']
        self.batch_size = kwargs['batch_size']
        self.threshold = kwargs['threshold']
@@ -44,8 +45,8 @@ class NeuralNetwork(object):
        """

        self.model = models.Sequential()
-        self.model.add(layers.Conv1D(5, kernel_size=10, activation='relu',
-                                     input_shape=(1000, 1)))
+        self.model.add(layers.Conv1D(5, kernel_size=self.kernel_size, activation='relu',
+                                     input_shape=(self.max_sequence_length, 1)))
        # self.model.add(layers.MaxPool1D(2))  # Might use this later
        self.model.add(layers.Flatten())
        self.model.add(layers.Dense(1, activation='sigmoid'))
@@ -54,11 +55,15 @@ class NeuralNetwork(object):
                           metrics=['BinaryAccuracy', 'Precision', 'Recall'])

    def train(self, x, y, x_val, y_val, eps_per_kmer_switch=100):
-        """
-        Train the network. x_val/y_val may be used for validation/early
+        """Train the network. x_val/y_val may be used for validation/early
        stopping mechanisms.
+
+        :param x: Input reads
+        :param y: Ground truth labels of the read
+        :param x_val: Input reads to use for validation
+        :param y_val: Ground truth labels to use for validation
        """
-        # Pad input sequences TODO why post padding and truncating? Would doing it at the start work too?
+        # Pad input sequences
        x_pad = np.expand_dims(pad_sequences(x, maxlen=self.max_sequence_length,
                              padding='post', truncating='post'), -1)
        x_val_pad = np.expand_dims(pad_sequences(x_val, maxlen=self.max_sequence_length,
@@ -69,29 +74,28 @@ class NeuralNetwork(object):
                                       reshuffle_each_iteration=True)
        # Train the model
        self.model.fit(tfd, epochs=self.eps_per_kmer_switch,
-                       validation_data=(x_val_pad, y_val))\
-        # , validation_steps=10,
-        #            validation_freq=max(self.eps_per_kmer_switch,
-        #                                self.eps_per_kmer_switch // 10))
-        for hv in self.model.history.history:  # TODO what does this do?
+                       validation_data=(x_val_pad, y_val))
+        for hv in self.model.history.history:
            self.history[hv].extend(self.model.history.history[hv])

    def predict(self, x, clean_signal=True, return_probs=False):
-        offset = 5
-        ho = offset // 2
-        lb, rb = self.hfw - ho, self.hfw + ho + 1
-        idx = np.arange(self.filter_width, len(x) + offset, offset)
-        x_batched = [x[si:ei] for si, ei in zip(idx-100, idx)]
-        x_pad = pad_sequences(x_batched, padding='post', dtype='float32')
-        posteriors = self.model.predict(x_pad)
-        y_hat = posteriors > self.threshold
-        y_out = np.zeros(len(x), dtype=int)
-        for i, yh in enumerate(y_hat):
-            y_out[lb + i * offset :rb + i * offset] = yh
-        # todo include clean signal
-        if return_probs:
-            posteriors_out = np.zeros(len(x), dtype=float)
-            for i, p in enumerate(posteriors):
-                posteriors_out[lb + i * offset :rb + i * offset] = p
-            return y_out, posteriors_out
-        return y_out
+        # TODO: predict is disabled (returns None); old windowed code kept below
+        pass
+        # offset = 5
+        # ho = offset // 2
+        # lb, rb = self.hfw - ho, self.hfw + ho + 1
+        # idx = np.arange(self.kernel_size, len(x) + offset, offset)
+        # x_batched = [x[si:ei] for si, ei in zip(idx-100, idx)]
+        # x_pad = pad_sequences(x_batched, padding='post', dtype='float32')
+        # posteriors = self.model.predict(x_pad)
+        # y_hat = posteriors > self.threshold
+        # y_out = np.zeros(len(x), dtype=int)
+        # for i, yh in enumerate(y_hat):
+        #     y_out[lb + i * offset :rb + i * offset] = yh
+        # # todo include clean signal
+        # if return_probs:
+        #     posteriors_out = np.zeros(len(x), dtype=float)
+        #     for i, p in enumerate(posteriors):
+        #         posteriors_out[lb + i * offset :rb + i * offset] = p
+        #     return y_out, posteriors_out
+        # return y_out