Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
B
baseLess
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Lannoy, Carlos de
baseLess
Commits
1a46e299
Commit
1a46e299
authored
2 years ago
by
Lannoy, Carlos de
Browse files
Options
Downloads
Patches
Plain Diff
adapt kmer counting to include 8- and 9-mers
parent
6e4a677d
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
inference/compile_model.py
+10
-6
10 additions, 6 deletions
inference/compile_model.py
tools/count_kmer_frequencies.py
+57
-0
57 additions, 0 deletions
tools/count_kmer_frequencies.py
validation/quick_benchmark_16s_noTrainRequired.sf
+3
-2
3 additions, 2 deletions
validation/quick_benchmark_16s_noTrainRequired.sf
with
70 additions
and
8 deletions
inference/compile_model.py
+
10
−
6
View file @
1a46e299
...
...
@@ -12,8 +12,8 @@ from run_production_pipeline import main as rpp
__location__
=
os
.
path
.
realpath
(
os
.
path
.
join
(
os
.
getcwd
(),
os
.
path
.
dirname
(
__file__
)))
COMPLEMENT_DICT
=
{
'
A
'
:
'
T
'
,
'
T
'
:
'
A
'
,
'
C
'
:
'
G
'
,
'
G
'
:
'
C
'
}
kmer_freqs_df
=
pd
.
read_parquet
(
f
'
{
__location__
}
/../data/ncbi_16S_bacteria_archaea_
kmer_
counts.parquet
'
)
kmer_freqs_dict
=
dict
(
kmer_freqs_df
.
sum
(
axis
=
0
)
)
kmer_freqs_df
=
pd
.
read_parquet
(
f
'
{
__location__
}
/../data/ncbi_16S_bacteria_archaea_counts
_8mer_9mer
.parquet
'
)
kmer_freqs_dict
=
kmer_freqs_df
.
kmer_count
.
to_dict
(
)
# with open(f'{__location__}/../data/list_of_kmers_after_19_rounds.csv', 'r') as fh:
# kmer_good_candidate_list = [km.strip() for km in fh.readlines() if len(km.strip())]
with
open
(
f
'
{
__location__
}
/../data/intermediate_9mers.csv
'
,
'
r
'
)
as
fh
:
...
...
@@ -62,8 +62,11 @@ def get_kmer_candidates_16S(kmer_candidates_dict, min_nb_kmers=5, threshold=0.00
"""
kmer_candidates_selection_dict
=
{}
for
kc
in
kmer_candidates_dict
:
kmer_list
=
kmer_candidates_dict
[
kc
]
# kmer_list = sorted(kmer_candidates_dict[kc], key=kmer_freqs_dict.get, reverse=True) # sort by frequency in 16S genomes todo switch off because data is 8-mer only
if
filter_list
:
prefiltered_kmer_list
=
[
km
for
km
in
kmer_candidates_dict
[
kc
]
if
km
in
filter_list
]
kmer_freqs_dict_cur
=
{
km
:
kmer_freqs_dict
.
get
(
km
,
0
)
for
km
in
prefiltered_kmer_list
}
kmer_list
=
sorted
(
prefiltered_kmer_list
,
key
=
kmer_freqs_dict_cur
.
get
,
reverse
=
False
)
# kmer_list = kmer_list[:30] # todo arbitrarily set to 25, change to parameter
kmer_candidates_selection_dict
[
kc
]
=
kmer_list
# kmer_candidates_selection_dict[kc] = []
# sub_df = kmer_freqs_df.copy()
...
...
@@ -96,8 +99,9 @@ def get_kmer_candidates_16S(kmer_candidates_dict, min_nb_kmers=5, threshold=0.00
# kmer_list.sort(key=sub_freqs_dict.get, reverse=True) # re-sort kmer list
# sub_freqs_dict = dict(sub_df.sum(axis=0))
# cc += 1
if
filter_list
:
kmer_candidates_selection_dict
[
kc
]
=
[
km
for
km
in
kmer_candidates_selection_dict
[
kc
]
if
km
in
filter_list
]
# if filter_list:
# kmer_candidates_selection_dict[kc] = [km for km in kmer_candidates_selection_dict[kc] if km in filter_list]
kc_count_dict
=
{
kc
:
len
(
kmer_candidates_selection_dict
[
kc
])
for
kc
in
kmer_candidates_selection_dict
}
selected_id
=
min
(
kc_count_dict
,
key
=
kc_count_dict
.
get
)
return
kmer_candidates_selection_dict
[
selected_id
]
...
...
This diff is collapsed.
Click to expand it.
tools/count_kmer_frequencies.py
0 → 100644
+
57
−
0
View file @
1a46e299
import argparse, subprocess, re
import multiprocessing as mp  # NOTE(review): unused since the per-sequence counting path was removed; kept to avoid touching file-level imports
import pandas as pd

from datetime import datetime
from tempfile import TemporaryDirectory


def parse_args():
    """Parse CLI arguments for the k-mer counting script."""
    parser = argparse.ArgumentParser(description='Return k-mer count table for species in multifasta')
    parser.add_argument('--fa-fn', type=str, required=True,
                        help='Input (multi-)fasta file to count k-mers in.')
    parser.add_argument('--out-parquet', type=str, required=True,
                        help='Output parquet file with one row per k-mer and a kmer_count column.')
    parser.add_argument('--kmer-size', type=int, nargs='+', default=[8, 9],
                        help='One or more k-mer sizes to count (default: 8 9).')
    parser.add_argument('--cores', type=int, default=4,
                        help='Number of threads passed to jellyfish count (default: 4).')
    return parser.parse_args()


def parse_jf_dump(dump_txt, kmer_size):
    """Parse `jellyfish dump -c` text output into a dict.

    Each valid line has the form "<kmer> <count>"; lines that do not split
    into exactly two space-separated fields are skipped, as are k-mers whose
    length differs from the requested size (guards against mixed dumps).

    :param dump_txt: decoded stdout of `jellyfish dump -c`
    :param kmer_size: expected k-mer length; other lengths are dropped
    :return: dict mapping kmer (str) -> count (int)
    """
    pairs = [line.split(' ') for line in dump_txt.split('\n')
             if len(line.split(' ')) == 2]
    return {kmer: int(count) for kmer, count in pairs if len(kmer) == kmer_size}


def count_kmers(fa_fn, kmer_size, cores):
    """Count canonical k-mers of one size in a fasta file with jellyfish.

    Runs `jellyfish count` into a temporary directory, dumps the counts as
    text and parses them. List-form subprocess invocation (no shell=True)
    prevents shell injection through file names; check=True surfaces
    jellyfish failures instead of silently yielding an empty table.

    :param fa_fn: path to the input fasta file
    :param kmer_size: k-mer length to count
    :param cores: thread count for jellyfish
    :return: dict mapping kmer -> count
    :raises subprocess.CalledProcessError: if either jellyfish call fails
    """
    with TemporaryDirectory() as tdo:
        jf_fn = f'{tdo}/mer_counts.jf'
        # -C: canonical counting (strand-collapsed); -s: initial hash size
        subprocess.run(
            ['jellyfish', 'count', '-m', str(kmer_size), '-s', str(4 ** kmer_size),
             '-C', '-o', jf_fn, fa_fn, '-t', str(cores)],
            capture_output=True, check=True)
        kmer_dump = subprocess.run(['jellyfish', 'dump', '-c', jf_fn],
                                   capture_output=True, check=True)
    return parse_jf_dump(kmer_dump.stdout.decode('utf-8'), kmer_size)


def main():
    """Count k-mers of every requested size and store the merged table as parquet."""
    args = parse_args()
    df_list = []
    for ks in args.kmer_size:
        print(f'{datetime.now()}: Counting kmers of size {ks}')
        km_dict = count_kmers(args.fa_fn, ks, args.cores)
        df_list.append(pd.DataFrame.from_dict(km_dict, orient='index', columns=['kmer_count']))
    print(f'{datetime.now()}: Done! Merging...')
    count_df = pd.concat(df_list)
    count_df.sort_values('kmer_count', inplace=True)
    count_df.to_parquet(args.out_parquet)


if __name__ == '__main__':
    main()
This diff is collapsed.
Click to expand it.
validation/quick_benchmark_16s_noTrainRequired.sf
+
3
−
2
View file @
1a46e299
import os
import pandas as pd
import numpy as np
baseless_location = "{{ baseless_location }}"
...
...
@@ -35,7 +36,7 @@ rule parse_inference_results:
out_df.loc[os.path.basename(fn), 'predicted'] = True
out_df.loc[:,'pos'] = gt_df.loc[out_df.index, 'pos']
out_df.loc[:, 'tp'] = False
out_df.loc[
out_df.query('predicted and pos').index
, 'tp'] = True
out_df.loc[
np.logical_and(out_df.predicted, out_df.pos)
, 'tp'] = True
recall = out_df.tp.sum() / out_df.pos.sum()
precision = out_df.tp.sum() / out_df.predicted.sum()
f1 = ((precision ** -1 + recall ** -1) / 2) ** -1
...
...
@@ -46,7 +47,7 @@ rule parse_inference_results:
rule run_inference:
input:
fast5_in='{{
test_
read_dir }}',
fast5_in='{{ read_dir }}',
model='{{ out_dir }}compiled_model.h5'
# threads: workflow.cores
threads: 3
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment