Skip to content
Snippets Groups Projects
Commit b34a3ca0 authored by Noordijk, Ben
Browse files

Database building now more 'conservative' so we are absolutely sure about...

Database building now more 'conservative' so we are absolutely sure about correctness of positive examples
parent e518ba7c
No related branches found
No related tags found
1 merge request: !5 compare_accuracy.py now saves confusion matrices and can be called on the...
......@@ -145,21 +145,13 @@ class TrainingRead(Persistent):
event_states_sl = self.hdf[hdf_events_path]["base"]
event_states_sl = event_states_sl.astype(str)
kmer_overhang = self.kmer_size // 2 # TODO is this a problem for even k-mer sizes?
event_states_sl = np.insert(event_states_sl, 0, ['N'] * kmer_overhang)
event_states_sl = np.append(event_states_sl, ['N'] * kmer_overhang)
event_list = [''.join(event_states_sl[i:i + self.kmer_size])
for i in range(0, event_states_sl.size - self.kmer_size + 1)]
start_idx_list = np.concatenate((self.hdf[hdf_events_path]["start"],
[self.hdf[hdf_events_path]["start"][-1] + self.hdf[hdf_events_path]["length"][-1]])) # Require adding to start_idx_list as zip will otherwise cut off last value!
event_raw_list = [self.raw[b:e] for b, e in zip(start_idx_list[:-1], start_idx_list[1:])]
start_idx_list = self.hdf[hdf_events_path]["start"][:- self.kmer_size + 1]
# Todo for now we throw away the last couple of k-mers
event_raw_list = [self.raw[b:e] for b, e in zip(start_idx_list[:-1], start_idx_list[self.kmer_size:])]
event_length_list = list(self.hdf[hdf_events_path]["length"])
kmer_placeholder = ['N' * self.kmer_size]
if self.clipped_bases_start != 0:
event_list[:self.clipped_bases_start] = kmer_placeholder * self.clipped_bases_start
if self.clipped_bases_end != 0:
event_list[-self.clipped_bases_end:] = kmer_placeholder * self.clipped_bases_end
self.condensed_events = list(zip(event_list, # k-mers
start_idx_list, # index of first base in raw read
event_raw_list)) # raw data points in event
......@@ -173,7 +165,7 @@ class TrainingRead(Persistent):
raw = raw[first_sample-1:] # NOTE: -1, or you throw away the first sample
self._raw = normalize_raw_signal(raw, self.normalization)
def get_pos(self, kmers, width, uncenter_kmer=False, nb=None):
def get_pos(self, kmers, width, uncenter_kmer=True, nb=None):
"""Return raw reads of length width with the target kmer in them
:param kmers: tuple of Kmers to target
......@@ -193,16 +185,21 @@ class TrainingRead(Persistent):
width_r = width - width_l
raw_hits_out = []
for ch in condensed_hits:
for ii in range(len(ch[0][2])):
for ch, _ in condensed_hits:
# Data augmentation, works if uncenter_kmer == true:
# place every example in 10 different places
for _ in range(10):
if uncenter_kmer:
random_offset = random.randint(-width_r, width_r)
max_offset = width_r - len(ch[2]) // 2 - 1
random_offset = random.randint(-max_offset, max_offset)
width_l -= random_offset
width_r += random_offset
mid_idx = ch[0][1] + ii # todo does this return the middle of an event
raw_hits_out.append(self.raw[mid_idx - width_l:
mid_idx + width_r])
mid_idx = ch[1] + len(ch[2])//2
candidate_raw = self.raw[mid_idx - width_l:
mid_idx + width_r]
assert np.alltrue(np.isin(ch[2], candidate_raw)), 'K-mer not in positive read'
raw_hits_out.append(candidate_raw)
return raw_hits_out
def get_neg(self, kmers, width, nb):
......@@ -227,4 +224,5 @@ class TrainingRead(Persistent):
raw_kmers_out.append(cur_condensed_event[0])
idx_list.remove(cur_idx)
# todo assert in some way that the target is indeed not present
return raw_hits_out, raw_kmers_out
......@@ -56,7 +56,7 @@ def main(args):
if not args.silent and percentage_processed >= count_pct_lim:
print(f'{percentage_processed}% of reads processed, {db.nb_pos} positives in DB')
count_pct_lim += 5
except Exception as e:
except KeyError as e:
with open(error_fn, 'a') as efn:
efn.write('{fn}\t{err}\n'.format(err=e, fn=basename(file)))
continue
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment