From 500c71106332a6a51cf1c0a9a470ae31c821bd28 Mon Sep 17 00:00:00 2001
From: Ben Noordijk <ben.noordijk@wur.nl>
Date: Tue, 9 Nov 2021 17:20:09 +0100
Subject: [PATCH] Raw read now also cuts off first part of initial raw signal

---
 db_building/TrainingRead.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/db_building/TrainingRead.py b/db_building/TrainingRead.py
index c6d6b3e..2809b60 100755
--- a/db_building/TrainingRead.py
+++ b/db_building/TrainingRead.py
@@ -7,7 +7,6 @@ from random import choice
 from persistent import Persistent
 from pathlib import Path
 from helper_functions import normalize_raw_signal
-import math
 sys.path.append(f'{dirname(Path(__file__).resolve())}/..')
 
 
@@ -32,13 +31,16 @@ class Read:
     def raw(self, _):
         raw_varname = self.hdf['Raw/Reads/'].visit(str)
         raw = self.hdf[f'Raw/Reads/{raw_varname}/Signal'][()]
+        # Cutoff first 1500 signals to be consistent with
+        # TrainingRead and its normalization
+        raw = raw[1500:]
         self._raw = normalize_raw_signal(raw, self.normalization)
 
     def get_split_raw_read(self, length, stride=5):
         """Split into parts of certain length, last split may be shorter"""
         nrows = ((self.raw.size - length) // stride) + 1
         n = self.raw.strides[0]
-        raw_strided = np.expand_dims(np.lib.stride_tricks.as_strided(self.raw, shape=(nrows, length), strides=(stride * n, n)), -1)
+        raw_strided = np.expand_dims(np.lib.stride_tricks.as_strided(self.raw, shape=(nrows, length), strides=(stride * n, n), writeable=False), -1)
         return raw_strided
 
         # # Uncomment to split up into separate batches
@@ -171,6 +173,8 @@ class TrainingRead(Persistent):
         raw_varname = self.hdf['Raw/Reads/'].visit(str)
         raw = self.hdf[f'Raw/Reads/{raw_varname}/Signal'][()]
         first_sample = self.hdf[f'{self.hdf_path}BaseCalled_template/Events'].attrs['read_start_rel_to_raw']
+        # #  Uncomment to print cutoff
+        # print(f'Cutting off {first_sample-1} from raw')
         raw = raw[first_sample-1:]  # NOTE: -1, or you throw away the first sample
         self._raw = normalize_raw_signal(raw, self.normalization)
 
@@ -187,7 +191,7 @@ class TrainingRead(Persistent):
         """
         # Condensed hits is list of raw reads that contain the kmer.
         # The 1st item the condensed event, the 2nd item is the index of the
-        # kmer in the basecalled sequence
+        # kmer in the raw sequence
         condensed_hits = [(ce, idx) for idx, ce
                           in enumerate(self.condensed_events) if ce[0] == kmer]
         width_l = (width + 1) // 2  # if width is even, pick point RIGHT of middle
@@ -234,3 +238,10 @@ class TrainingRead(Persistent):
                 #     raw_kmers_out.append(cur_condensed_event[0])
             idx_list.remove(cur_idx)
         return raw_hits_out, raw_kmers_out
+
+    def get_split_raw_read(self, length, stride=5):
+        """Split into parts of certain length, last split may be shorter"""
+        nrows = ((self.raw.size - length) // stride) + 1
+        n = self.raw.strides[0]
+        raw_strided = np.expand_dims(np.lib.stride_tricks.as_strided(self.raw, shape=(nrows, length), strides=(stride * n, n)), -1)
+        return raw_strided
-- 
GitLab