diff --git a/inference/compile_model.py b/inference/compile_model.py
index 27c1f2961c42a30a86e64ad9fc5c51215200da0e..a0f828004a2b43fb2789be064b225ac25303c17c 100644
--- a/inference/compile_model.py
+++ b/inference/compile_model.py
@@ -19,7 +19,7 @@ with open(f'/lustre/BIF/nobackup/noord087/mscthesis/9mer_library_excluding_10_pe
 def reverse_complement(km):
     return ''.join([COMPLEMENT_DICT[k] for k in km][::-1])
 
-def fa2kmers(fa_fn, kmer_size):
+def fa2kmers(fa_fn, kmer_sizes):
     """
     take multifasta file, return dict of lists of k-mers present in each sequence
     """
@@ -32,17 +32,18 @@ def fa2kmers(fa_fn, kmer_size):
     for td in targets_dict:
         with open(tdo + '/target.fasta', 'w') as fh:
             fh.write(targets_dict[td])
-        subprocess.run(
-            f'jellyfish count -m {kmer_size} -s {4 ** kmer_size} -C -o {tdo}/mer_counts.jf {fa_fn}',
-            shell=True)
-        kmer_dump = subprocess.run(f'jellyfish dump -c {tdo}/mer_counts.jf', shell=True, capture_output=True)
-        kmer_list = [km.split(' ')[0] for km in kmer_dump.stdout.decode('utf-8').split('\n')]
-        kmer_list = [km for km in kmer_list if len(km) == kmer_size and km in kmer_good_candidate_list] # retain k-mers found to score well in discernability
-        kmer_list_curated = [] # remove reverse complements
-        for km in kmer_list:
-            if reverse_complement(km) in kmer_list_curated: continue
-            kmer_list_curated.append(km)
-        kmer_dict[td] = kmer_list_curated
+        for kmer_size in kmer_sizes:
+            subprocess.run(
+                f'jellyfish count -m {kmer_size} -s {4 ** kmer_size} -C -o {tdo}/mer_counts.jf {fa_fn}',
+                shell=True)
+            kmer_dump = subprocess.run(f'jellyfish dump -c {tdo}/mer_counts.jf', shell=True, capture_output=True)
+            kmer_list = [km.split(' ')[0] for km in kmer_dump.stdout.decode('utf-8').split('\n')]
+            kmer_list = [km for km in kmer_list if len(km) == kmer_size and km in kmer_good_candidate_list] # retain k-mers found to score well in discernability
+            kmer_list_curated = [] # remove reverse complements
+            for km in kmer_list:
+                if reverse_complement(km) in kmer_list_curated: continue
+                kmer_list_curated.append(km)
+            kmer_dict[td] = kmer_list_curated
 
     return kmer_dict
 
@@ -92,8 +93,13 @@ def get_kmer_candidates_16S(kmer_candidates_dict, min_nb_kmers=5, threshold=0.00
     return kmer_candidates_selection_dict[selected_id]
 
 
-def main(args):
+def expandpath(path_pattern):
+    p = Path(path_pattern).expanduser()
+    parts = p.parts[p.is_absolute():]
+    return Path(p.root).glob(str(Path(*parts)))
+
+def main(args):
 
     # List for which k-mers models are available
     if args.nn_directory:
         nn_directory = args.nn_directory
@@ -103,7 +109,7 @@ def main(args):
         raise ValueError('Either provide --nn-directory or --target-16S')
     # List models for which k-mer is available
     available_mod_kmers = {pth.name: str(pth) for pth in Path(nn_directory).iterdir() if pth.is_dir()}
-
+    available_mod_kmers = {model_pth.parent.name: str(model_pth.parent) for model_pth in expandpath(nn_directory)}
     # Parse target k-mers
     if args.kmer_list:
         with open(args.kmer_list, 'r') as fh:
@@ -111,8 +117,8 @@ def main(args):
         requested_kmers = [km for km in kmers if len(km)]
         kmers = [km for km in requested_kmers if km in available_mod_kmers]
     elif args.target_16S:
-        kmer_size = 9
-        kmer_candidates_dict = fa2kmers(args.target_16S, kmer_size) # Collect k-mers per sequence in target fasta
+        kmer_sizes = [8, 9]
+        kmer_candidates_dict = fa2kmers(args.target_16S, kmer_sizes) # Collect k-mers per sequence in target fasta
         if args.train_required:
             kmers = get_kmer_candidates_16S(kmer_candidates_dict, args.min_nb_models, 0.0001)
             kmers_no_models = [km for km in kmers if km not in available_mod_kmers]