Commit 1a46e299 authored by Carlos de Lannoy

adapt kmer counting to include 8- and 9-mers

parent 6e4a677d
@@ -12,8 +12,8 @@ from run_production_pipeline import main as rpp
 __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
 COMPLEMENT_DICT = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
-kmer_freqs_df = pd.read_parquet(f'{__location__}/../data/ncbi_16S_bacteria_archaea_kmer_counts.parquet')
-kmer_freqs_dict = dict(kmer_freqs_df.sum(axis=0))
+kmer_freqs_df = pd.read_parquet(f'{__location__}/../data/ncbi_16S_bacteria_archaea_counts_8mer_9mer.parquet')
+kmer_freqs_dict = kmer_freqs_df.kmer_count.to_dict()
 # with open(f'{__location__}/../data/list_of_kmers_after_19_rounds.csv', 'r') as fh:
 #     kmer_good_candidate_list = [km.strip() for km in fh.readlines() if len(km.strip())]
 with open(f'{__location__}/../data/intermediate_9mers.csv', 'r') as fh:
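The lookup change follows from a layout change in the counts table: the old parquet held one column per k-mer (so per-k-mer totals came from summing columns), while the new 8-/9-mer parquet holds one row per k-mer with a single kmer_count column. A minimal sketch with made-up counts; the old wide layout is inferred from dict(kmer_freqs_df.sum(axis=0)):

import pandas as pd

# old (assumed) wide layout: genomes x k-mers, totals via column sums
old_df = pd.DataFrame({'AAAAAAAA': [1, 0, 1], 'AAAAAAAC': [0, 1, 1]})
old_freqs = dict(old_df.sum(axis=0))        # {'AAAAAAAA': 2, 'AAAAAAAC': 2}

# new long layout: k-mers as index, one count column, dict read off directly
new_df = pd.DataFrame({'kmer_count': [2, 2]}, index=['AAAAAAAA', 'AAAAAAAC'])
new_freqs = new_df.kmer_count.to_dict()     # {'AAAAAAAA': 2, 'AAAAAAAC': 2}

assert old_freqs == new_freqs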
@@ -62,8 +62,11 @@ def get_kmer_candidates_16S(kmer_candidates_dict, min_nb_kmers=5, threshold=0.00
"""
kmer_candidates_selection_dict = {}
for kc in kmer_candidates_dict:
kmer_list = kmer_candidates_dict[kc]
# kmer_list = sorted(kmer_candidates_dict[kc], key=kmer_freqs_dict.get, reverse=True) # sort by frequency in 16S genomes todo switch off because data is 8-mer only
if filter_list:
prefiltered_kmer_list = [km for km in kmer_candidates_dict[kc] if km in filter_list]
kmer_freqs_dict_cur = {km: kmer_freqs_dict.get(km, 0) for km in prefiltered_kmer_list}
kmer_list = sorted(prefiltered_kmer_list, key=kmer_freqs_dict_cur.get, reverse=False)
# kmer_list = kmer_list[:30] # todo arbitrarily set to 25, change to parameter
kmer_candidates_selection_dict[kc] = kmer_list
# kmer_candidates_selection_dict[kc] = []
# sub_df = kmer_freqs_df.copy()
@@ -96,8 +99,9 @@ def get_kmer_candidates_16S(kmer_candidates_dict, min_nb_kmers=5, threshold=0.00
     # kmer_list.sort(key=sub_freqs_dict.get, reverse=True)  # re-sort kmer list
     # sub_freqs_dict = dict(sub_df.sum(axis=0))
     # cc += 1
-    if filter_list:
-        kmer_candidates_selection_dict[kc] = [km for km in kmer_candidates_selection_dict[kc] if km in filter_list]
+    # if filter_list:
+    #     kmer_candidates_selection_dict[kc] = [km for km in kmer_candidates_selection_dict[kc] if km in filter_list]
     kc_count_dict = {kc: len(kmer_candidates_selection_dict[kc]) for kc in kmer_candidates_selection_dict}
     selected_id = min(kc_count_dict, key=kc_count_dict.get)  # ID with the fewest remaining candidate k-mers
     return kmer_candidates_selection_dict[selected_id]
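Stripped of the commented-out exploration, the active logic of get_kmer_candidates_16S now reduces to: keep only k-mers on the filter list, sort them by ascending background frequency (rarest first), and return the list belonging to the ID with the fewest candidates left. A self-contained sketch with hypothetical toy data:

# toy background frequencies and candidate lists; all names and counts made up
kmer_freqs = {'AAAAAAAAC': 3, 'CCCCCCCCG': 10, 'GGGGGGGGT': 1}
allowed = {'AAAAAAAAC', 'GGGGGGGGT'}
candidates = {'species_A': ['AAAAAAAAC', 'CCCCCCCCG', 'GGGGGGGGT'],
              'species_B': ['GGGGGGGGT', 'CCCCCCCCG']}

selection = {}
for species, kmers in candidates.items():
    kept = [km for km in kmers if km in allowed]
    # ascending background frequency: rarest (most discriminative) k-mers first
    selection[species] = sorted(kept, key=lambda km: kmer_freqs.get(km, 0))

best = min(selection, key=lambda sp: len(selection[sp]))  # fewest remaining candidates
print(best, selection[best])                              # species_B ['GGGGGGGGT']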
import argparse, subprocess, re
import multiprocessing as mp
import pandas as pd
from datetime import datetime
from tempfile import TemporaryDirectory
# def count_for_fa(seq):
#     header = re.search('(?<=>)[^ ]+', seq).group(0)
#     with TemporaryDirectory() as tdo:
#         single_fa_fn = f'{tdo}/seq.fa'
#         with open(single_fa_fn, 'w') as fh: fh.write(seq)
#         subprocess.run(
#             f'jellyfish count -m {kmer_size} -s {4 ** kmer_size} -C -o {tdo}/mer_counts.jf {single_fa_fn}',
#             shell=True)
#         kmer_dump = subprocess.run(f'jellyfish dump -c {tdo}/mer_counts.jf', shell=True, capture_output=True)
#         kmer_list = [km.split(' ')[0] for km in kmer_dump.stdout.decode('utf-8').split('\n')]
#         kmer_list = [km for km in kmer_list if len(km)]
#     print(f'{datetime.now()}: {header} done')
#     return (header, kmer_list)
parser = argparse.ArgumentParser(description='Return k-mer count table for species in multifasta')
parser.add_argument('--fa-fn', type=str, required=True)
parser.add_argument('--out-parquet', type=str, required=True)
parser.add_argument('--kmer-size', type=int, nargs='+', default=[8,9])
parser.add_argument('--cores', type=int, default=4)
args = parser.parse_args()
# fa_fn = '/home/carlos/local_data/baseLess/ncbi_16S_bacteria_archaea_subset.fasta'  # leftover hard-coded test path; the script reads args.fa_fn below
with open(args.fa_fn, 'r') as fh:
    fa_txt = fh.read()
# split multifasta into one string per record; only used by the disabled count_for_fa path below
fa_txt_list = ['>' + t for t in fa_txt.split('>') if t]
df_list = []
for ks in args.kmer_size:
    print(f'{datetime.now()}: Counting kmers of size {ks}')
    with TemporaryDirectory() as tdo:
        # count canonical k-mers (-C) of length ks across the whole multifasta
        count_output = subprocess.run(
            f'jellyfish count -m {ks} -s {4 ** ks} -C -o {tdo}/mer_counts.jf {args.fa_fn} -t {args.cores}',
            shell=True, capture_output=True)
        # dump as "<kmer> <count>" lines and parse into a dict
        kmer_dump = subprocess.run(f'jellyfish dump -c {tdo}/mer_counts.jf', shell=True, capture_output=True)
    kmer_list = [km.split(' ') for km in kmer_dump.stdout.decode('utf-8').split('\n') if len(km.split(' ')) == 2]
    km_dict = {kv[0]: int(kv[1]) for kv in kmer_list if len(kv[0]) == ks}
    df_list.append(pd.DataFrame.from_dict(km_dict, orient='index', columns=['kmer_count']))
print(f'{datetime.now()}: Done! Merging...')
count_df = pd.concat(df_list)
count_df.sort_values('kmer_count', inplace=True)
count_df.to_parquet(args.out_parquet)
# with mp.Pool(args.cores) as pool:
#     count_tups = pool.map(count_for_fa, fa_txt_list)
#
# headers, _ = zip(*count_tups)
# count_df = pd.DataFrame(False, index=headers, columns=kmer_list)
# for header, km_list in count_tups:
#     count_df.loc[header, km_list] = True
# count_df.to_parquet(args.out_csv)
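Assuming the new script is saved as, say, count_16S_kmers.py (name hypothetical), it would be invoked as: python count_16S_kmers.py --fa-fn <16S multifasta> --out-parquet <path> --kmer-size 8 9 --cores 4. The output can then be sanity-checked against the loader at the top of this commit:

import pandas as pd

# sketch of a sanity check; file name assumed to match the parquet loaded above
df = pd.read_parquet('ncbi_16S_bacteria_archaea_counts_8mer_9mer.parquet')
assert set(df.index.str.len()) <= {8, 9}  # 8- and 9-mers share one string index
assert (df.kmer_count > 0).all()          # jellyfish dump only reports observed k-mers
print(df.head())                          # sorted ascending, so rarest k-mers first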
 import os
 import pandas as pd
+import numpy as np
 baseless_location = "{{ baseless_location }}"
@@ -35,7 +36,7 @@ rule parse_inference_results:
             out_df.loc[os.path.basename(fn), 'predicted'] = True
         out_df.loc[:, 'pos'] = gt_df.loc[out_df.index, 'pos']
         out_df.loc[:, 'tp'] = False
-        out_df.loc[out_df.query('predicted and pos').index, 'tp'] = True
+        out_df.loc[np.logical_and(out_df.predicted, out_df.pos), 'tp'] = True
         recall = out_df.tp.sum() / out_df.pos.sum()
         precision = out_df.tp.sum() / out_df.predicted.sum()
         f1 = ((precision ** -1 + recall ** -1) / 2) ** -1
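The f1 line computes the harmonic mean of precision and recall via reciprocals; it is algebraically identical to the usual F1 formula, as this small check (example values only) shows:

precision, recall = 0.8, 0.5                          # example values
f1_reciprocal = ((precision ** -1 + recall ** -1) / 2) ** -1
f1_standard = 2 * precision * recall / (precision + recall)
assert abs(f1_reciprocal - f1_standard) < 1e-12       # both ~0.6154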
@@ -46,7 +47,7 @@ rule parse_inference_results:
 rule run_inference:
     input:
-        fast5_in='{{ test_read_dir }}',
+        fast5_in='{{ read_dir }}',
         model='{{ out_dir }}compiled_model.h5'
     # threads: workflow.cores
     threads: 3