From 500c71106332a6a51cf1c0a9a470ae31c821bd28 Mon Sep 17 00:00:00 2001 From: Ben Noordijk <ben.noordijk@wur.nl> Date: Tue, 9 Nov 2021 17:20:09 +0100 Subject: [PATCH] Raw read now also cuts off first part of initial raw signal --- db_building/TrainingRead.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/db_building/TrainingRead.py b/db_building/TrainingRead.py index c6d6b3e..2809b60 100755 --- a/db_building/TrainingRead.py +++ b/db_building/TrainingRead.py @@ -7,7 +7,6 @@ from random import choice from persistent import Persistent from pathlib import Path from helper_functions import normalize_raw_signal -import math sys.path.append(f'{dirname(Path(__file__).resolve())}/..') @@ -32,13 +31,16 @@ class Read: def raw(self, _): raw_varname = self.hdf['Raw/Reads/'].visit(str) raw = self.hdf[f'Raw/Reads/{raw_varname}/Signal'][()] + # Cutoff first 1500 signals to be consistent with + # TrainingRead and its normalization + raw = raw[1500:] self._raw = normalize_raw_signal(raw, self.normalization) def get_split_raw_read(self, length, stride=5): """Split into parts of certain length, last split may be shorter""" nrows = ((self.raw.size - length) // stride) + 1 n = self.raw.strides[0] - raw_strided = np.expand_dims(np.lib.stride_tricks.as_strided(self.raw, shape=(nrows, length), strides=(stride * n, n)), -1) + raw_strided = np.expand_dims(np.lib.stride_tricks.as_strided(self.raw, shape=(nrows, length), strides=(stride * n, n), writeable=False), -1) return raw_strided # # Uncomment to split up into separate batches @@ -171,6 +173,8 @@ class TrainingRead(Persistent): raw_varname = self.hdf['Raw/Reads/'].visit(str) raw = self.hdf[f'Raw/Reads/{raw_varname}/Signal'][()] first_sample = self.hdf[f'{self.hdf_path}BaseCalled_template/Events'].attrs['read_start_rel_to_raw'] + # # Uncomment to print cutoff + # print(f'Cutting off {first_sample-1} from raw') raw = raw[first_sample-1:] # NOTE: -1, or you throw away the first sample self._raw = normalize_raw_signal(raw, self.normalization) @@ -187,7 +191,7 @@ class TrainingRead(Persistent): """ # Condensed hits is list of raw reads that contain the kmer. # The 1st item the condensed event, the 2nd item is the index of the - # kmer in the basecalled sequence + # kmer in the raw sequence condensed_hits = [(ce, idx) for idx, ce in enumerate(self.condensed_events) if ce[0] == kmer] width_l = (width + 1) // 2 # if width is even, pick point RIGHT of middle @@ -234,3 +238,10 @@ class TrainingRead(Persistent): # raw_kmers_out.append(cur_condensed_event[0]) idx_list.remove(cur_idx) return raw_hits_out, raw_kmers_out + + def get_split_raw_read(self, length, stride=5): + """Split into parts of certain length, last split may be shorter""" + nrows = ((self.raw.size - length) // stride) + 1 + n = self.raw.strides[0] + raw_strided = np.expand_dims(np.lib.stride_tricks.as_strided(self.raw, shape=(nrows, length), strides=(stride * n, n)), -1) + return raw_strided -- GitLab