Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
scripts
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
This is an archived project. Repository and other project resources are read-only.
Show more breadcrumbs
UNLOCK
scripts
Commits
59b50172
Commit
59b50172
authored
3 years ago
by
Jasper Koehorst
Browse files
Options
Downloads
Patches
Plain Diff
biom creation for picrust data
parent
27198b8f
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
picrust_to_biom.py
+257
-0
257 additions, 0 deletions
picrust_to_biom.py
with
257 additions
and
0 deletions
picrust_to_biom.py
0 → 100755
+
257
−
0
View file @
59b50172
#!/usr/bin/python3
"""Convert PICRUSt prediction output stored in iRODS into BIOM files.

Downloads *pred_metagenome_unstrat.tsv predictions per functional category
(COG/EC/KO/PFAM/TIGRFAM), merges and pivots them into sample-by-function
matrices, attaches RDF-derived metadata and writes JSON BIOM files.
"""
__author__ = "Bart Nijsse and Jasper Koehorst"
__copyright__ = "Copyright 2021, UNLOCK"
__credits__ = ["Bart Nijsse", "Jasper Koehorst"]
__license__ = "CC0"
__version__ = "1.0.0"
__status__ = "Development"
import gzip
import json
from irods.session import iRODSSession
import ssl
import os
import argparse
# NOTE(review): this jedi import looks like an accidental IDE auto-import —
# the `file_name` it imports is never used (the script uses its own local
# variable of the same name); confirm and remove.
from jedi.api import file_name
import pandas as pd
import rdflib
# NOTE(review): base64 appears unused in this file — confirm and remove.
import base64
import irods.keywords as kw
# Connection coordinates and credentials are taken from the environment so
# the script runs unchanged across UNLOCK deployments.
host = os.getenv('irodsHost')
port = os.getenv('irodsPort')
zone = os.getenv('irodsZone')
user = os.getenv('irodsUserName')
password = os.getenv('irodsPassword')

# SSL settings: a default client-side context for server authentication.
context = ssl.create_default_context(
    purpose=ssl.Purpose.SERVER_AUTH, cafile=None, capath=None, cadata=None
)

# Negotiation and encryption parameters passed to iRODSSession.
ssl_settings = dict(
    irods_client_server_negotiation='request_server_negotiation',
    irods_client_server_policy='CS_NEG_REQUIRE',
    irods_encryption_algorithm='AES-256-CBC',
    irods_encryption_key_size=32,
    irods_encryption_num_hash_rounds=16,
    irods_encryption_salt_size=8,
    ssl_context=context,
)
# Obtain the data for the biom files
def process_job_file():
    """Drive the whole conversion, reading remote paths from ``args.job``.

    For each line of the job file: a ``.ttl`` path is downloaded and fed to
    :func:`process_rdf_files`; any other path is treated as an iRODS
    collection that is walked for ``pred_metagenome_unstrat.tsv`` prediction
    files, which are merged per functional category into local
    ``combined_*_pred_metagenome_unstrat.tsv`` files. The merged three-column
    files are then pivoted to sample-by-function matrices and converted to
    BIOM via :func:`tsv_to_biom`.

    Side effects: writes the combined TSV files, ``metadata.picrust.tsv``
    (through process_rdf_files) and ``*.biom`` files in the working
    directory. Reads the module-level globals ``args``, ``host``, ``port``,
    ``user``, ``password``, ``zone`` and ``ssl_settings``.
    """
    # Connect with irods
    with iRODSSession(host=host, port=port, user=user, password=password,
                      zone=zone, **ssl_settings) as session:
        # Force overwrite when downloading data objects
        options = {kw.FORCE_FLAG_KW: True}
        # One merged output file per functional annotation category
        file_names = [
            "combined_COG_pred_metagenome_unstrat.tsv",
            "combined_EC_pred_metagenome_unstrat.tsv",
            "combined_KO_pred_metagenome_unstrat.tsv",
            "combined_PFAM_pred_metagenome_unstrat.tsv",
            "combined_TIGRFAM_pred_metagenome_unstrat.tsv",
        ]
        writers = [open(name, "w") for name in file_names]
        # Path markers mapped positionally onto the writers above; replaces
        # the original chain of five copy-pasted if-statements. Iterating all
        # markers (instead of breaking on first hit) preserves the original
        # last-match-wins behaviour.
        category_markers = [
            "COG_metagenome_out",
            "EC_metagenome_out",
            "KO_metagenome_out",
            "PFAM_metagenome_out",
            "TIGRFAM_metagenome_out",
        ]
        # For line in the job file
        with open(args.job) as job_reader:
            for line in job_reader:
                line = line.strip()
                print("Processing", line)
                # Metadata file
                if line.endswith(".ttl"):
                    rdf_file = session.data_objects.get(line)
                    name = rdf_file.path.split("/")[-1]
                    with open(name, "w") as output:
                        for raw_line in rdf_file.open():
                            print(raw_line.decode("UTF-8").strip(), file=output)
                    process_rdf_files(name)
                    continue
                # List the files in this folder using the walk function
                if not session.collections.exists(line):
                    print("Path does not exists", line)
                    continue
                walk = session.collections.get(line).walk()
                # Just loop over the files as they are irods objects
                for root, dirs, files in walk:
                    # For each file in this directory / sub directories
                    for data_object in files:
                        if "pred_metagenome_unstrat.tsv" not in data_object.path:
                            continue
                        # Obtain output writer; stays None when no category
                        # marker occurs in the remote path
                        output = None
                        for writer_index, marker in enumerate(category_markers):
                            if marker in data_object.path:
                                output = writers[writer_index]
                        if output is None:
                            continue
                        # Download the prediction file next to the script
                        file_name = data_object.path.split("/")[-1]
                        session.data_objects.get(data_object.path, file_name,
                                                 **options)
                        # Sample identifier: path element directly after "/A_"
                        sample_id = data_object.path.split("/A_")[-1].split("/")[0]
                        # NOTE(review): the downloaded files are assumed to be
                        # gzip-compressed despite the .tsv suffix — confirm.
                        with gzip.open(file_name, mode='r') as prediction:
                            content = prediction.readlines()
                        for line_file in content:
                            line_file = line_file.decode("UTF-8").strip()
                            # Skip function (header) line
                            if "function" in line_file:
                                continue
                            print(sample_id + "\t" + line_file, file=output)
        # Close and flush all writers
        for writer in writers:
            writer.close()
    # Three columns (sample, function, value) to matrix
    for merged_file in file_names:
        with open(merged_file) as reader:
            content = [row.strip().split() for row in reader]
        df = pd.DataFrame(content, columns=['Y', 'X', 'Z'])
        df = df.pivot(index='X', columns='Y', values='Z')
        df.to_csv(merged_file, sep="\t", index_label=False)
    # Same pivot for the metadata table; set() deduplicates repeated rows
    metadata = "metadata.picrust.tsv"
    with open(metadata) as reader:
        lines = set(reader.readlines())
    content = [row.strip().split("\t") for row in lines]
    df = pd.DataFrame(content, columns=['Y', 'X', 'Z'])
    df = df.pivot(index='X', columns='Y', values='Z')
    df = df.fillna("None")
    df.to_csv(metadata, sep="\t", index_label=False)
    # Create biom files
    for merged_file in file_names:
        tsv_to_biom(merged_file)
def process_rdf_files(rdf_file):
    """Extract assay and sample metadata from a turtle file and append it to
    ``metadata.picrust.tsv`` as (identifier, predicate, value) rows.

    Assay predicates are prefixed ``assay_``, sample predicates ``sample_``;
    for samples the human-readable rdfs:label is preferred when present
    (spaces replaced by underscores).

    :param rdf_file: path to a local turtle (.ttl) file.
    """
    g = rdflib.Graph()
    g.parse(rdf_file, format="turtle")
    # Open in append mode so metadata from multiple RDF files accumulates;
    # the with-block closes (flushes) the handle before tsv_to_biom reads the
    # file back — the original left it open, risking truncated metadata.
    with open("metadata.picrust.tsv", "a") as output:
        # Literal-valued properties attached directly to each AmpliconAssay
        qres = g.query(
            """
            PREFIX gbol:<http://gbol.life/0.1/>
            PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
            PREFIX ssb: <http://ssb.wur.nl/>
            PREFIX ns1: <http://ssb.wur.nl/model/>
            PREFIX unlock: <http://m-unlock.nl/ontology/>
            PREFIX jerm: <http://jermontology.org/ontology/JERMOntology#>
            PREFIX schema: <http://schema.org/>
            SELECT DISTINCT ?id ?predicate ?object
            WHERE {
                ?assay ?predicate ?object .
                ?assay a unlock:AmpliconAssay .
                ?assay schema:identifier ?id .
                FILTER(!ISIRI(?object))
            }
            """
        )
        for row in qres:
            predicate = "assay_" + row.predicate.split("/")[-1]
            identifier = row.id
            print(identifier + "\t" + predicate + f"\t{row.object}", file=output)
        # Literal-valued properties of the Sample linked to each assay
        # (rdfs: is resolved through rdflib's predefined namespaces)
        qres = g.query(
            """
            PREFIX gbol:<http://gbol.life/0.1/>
            PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
            PREFIX ssb: <http://ssb.wur.nl/>
            PREFIX ns1: <http://ssb.wur.nl/model/>
            PREFIX unlock: <http://m-unlock.nl/ontology/>
            PREFIX jerm: <http://jermontology.org/ontology/JERMOntology#>
            PREFIX schema: <http://schema.org/>
            SELECT DISTINCT ?id ?predicate ?object ?predicate_label
            WHERE {
                ?sample a jerm:Sample .
                ?sample ?predicate ?object .
                OPTIONAL { ?predicate rdfs:label ?predicate_label}
                ?sample jerm:hasPart ?assay .
                ?assay a unlock:AmpliconAssay .
                ?assay schema:identifier ?id .
                FILTER(!ISIRI(?object))
            }
            """
        )
        for row in qres:
            predicate = "sample_" + row.predicate.split("/")[-1]
            # isinstance instead of `type(...) ==`; prefer the label when the
            # OPTIONAL pattern bound one
            if isinstance(row.predicate_label, rdflib.term.Literal):
                predicate = "sample_" + row.predicate_label.replace(" ", "_")
            identifier = row.id
            print(identifier + "\t" + predicate + f"\t{row.object}", file=output)
def tsv_to_biom(input_file):
    """Convert a pivoted sample-by-function TSV matrix to a JSON BIOM file.

    Rewrites ``input_file`` in place into biom's classic TSV layout (comment
    header plus "#OTU ID" column), loads it with the biom library, attaches
    the metadata from ``metadata.picrust.tsv`` and writes the result as
    ``<input_file with .tsv -> .biom>``.

    :param input_file: path to the TSV matrix produced by process_job_file.
    """
    from biom import load_table
    # Formatting: missing counts become 0 and values are rounded
    result = pd.read_table(input_file)
    result.fillna(0, inplace=True)
    result = result.round(0)
    result.to_csv(input_file, sep="\t")
    # Transform to biom classic tsv format
    with open(input_file) as reader:
        lines = reader.readlines()
    lines.insert(0, "# Automatically generated input file for biom conversion")
    lines[1] = "#OTU ID\t" + lines[1]
    # with-block closes (flushes) the handle before load_table re-reads the
    # file — the original kept it open, so buffered lines could be missing
    with open(input_file, "w") as output:
        for line in lines:
            print(line.strip(), file=output)
    biom_content = load_table(input_file)
    # Load RDF metadata file and attach it
    metadata = pd.read_table("metadata.picrust.tsv").to_dict()
    biom_content.add_metadata(metadata)
    biom_content.type = "Function table"
    json_data = biom_content.to_json(generated_by="UNLOCK conversion module")
    # Create Python object from JSON string data
    obj = json.loads(json_data)
    # TODO find out where this 0 element comes from (pandas?)
    obj['columns'] = obj['columns'][1:]
    # Give every row placeholder taxonomy metadata (expected by consumers)
    for row in obj['rows']:
        row['metadata'] = {"taxonomy": ["k__NA", "p__NA", "c__NA", "o__NA",
                                        "f__NA", "g__NA", "s__NA"]}
    # Dropping the first column shrinks the column dimension by one as well
    obj['shape'][1] = int(obj['shape'][1]) - 1
    # Pretty Print JSON
    json_formatted_str = json.dumps(obj, indent=4, sort_keys=True)
    biom_file = input_file.replace(".tsv", ".biom")
    print("Writing biom file to", biom_file)
    # with-block instead of the original's leaked open() handle
    with open(biom_file, "w") as biom_writer:
        print(json_formatted_str, file=biom_writer)
if __name__ == "__main__":
    # argparse is already imported at module level; the original re-imported
    # it here redundantly
    parser = argparse.ArgumentParser(description='Biom file creation')
    parser.add_argument('-j', '--job', help='input Job file', required=True)
    # args is intentionally module-level: process_job_file reads args.job
    args = parser.parse_args()
    # Start from a clean metadata file; process_rdf_files appends to it
    if os.path.isfile("metadata.picrust.tsv"):
        os.remove("metadata.picrust.tsv")
    # Process the job file to create a biom json file
    process_job_file()
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment