Commit 0e02a472, authored 3 years ago by Jasper Koehorst

ontology update

Parent: 7a4418f2
Changes: 3 changed files with 58 additions and 43 deletions

  ngtax_to_biom.py         +22 −12
  ngtax_to_tsv-fasta.py     +2 −2
  picrust_to_biom.py       +34 −29
ngtax_to_biom.py (+22 −12)
@@ -45,18 +45,23 @@ def process_job_file():
     # Obtain the job file
     # job_file = session.data_objects.get(args.job)
     # Create store directory
     if not os.path.isdir("irods"):
         os.mkdir("irods")
     destinations = {"seq.tsv": open("seq.tsv", "w"), "asv.tsv": open("asv.tsv", "w"), "tax.tsv": open("tax.tsv", "w"), "met.tsv": open("met.tsv", "w")}
     # For line in the job file
-    for line in open(args.job):
+    lines = open(args.job).readlines()
+    for index, line in enumerate(lines):
         line = line.strip()  # .decode("UTF-8").strip()
-        print("Processing", line)
+        print("Processing", index, "of", len(lines), line)
         # Old version of job file had ttl project file but with latest workflow not needed
         if line.endswith(".ttl"):
             continue
         # List the files in this folder using the walk function

@@ -65,14 +70,19 @@ def process_job_file():
         for root, dirs, files in walk:
             # For each file in this directory / sub directories?
             for file in files:
+                if os.path.exists("./irods/" + file.name):
+                    continue
                 # Extra check if there is no accidental file created in there e.g. .DS_Store
                 if file.path.split("_")[-1] in destinations:
-                    # Obtain output writer
-                    output = destinations[file.path.split("_")[-1]]
-                    # For each line in this file
-                    for line_file in file.open():
-                        line_file = line_file.decode("UTF-8").strip()
-                        print(line_file, file=output)
+                    # Download the file
+                    session.data_objects.get(file.path, "./irods/" + file.name)
+    for file_name in os.listdir("./irods"):
+        # Obtain output writer
+        output = destinations[file_name.split("_")[-1]]
+        for line_file in open("./irods/" + file_name):
+            line_file = line_file.strip()
+            print(line_file, file=output)
     # Close and flush all writers
     for writer in destinations:
         destinations[writer].close()
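
The net effect of this hunk: files are first downloaded into ./irods, and the suffix-to-writer routing then runs over the local copies instead of streaming from iRODS. A minimal local sketch of that routing pattern, with no iRODS dependency and an illustrative cache directory name:

    import os

    # Writers keyed by filename suffix, mirroring the destinations dict above
    destinations = {name: open(name, "w") for name in ("seq.tsv", "asv.tsv", "tax.tsv", "met.tsv")}

    cache_dir = "./cache"  # stands in for ./irods
    os.makedirs(cache_dir, exist_ok=True)

    for file_name in os.listdir(cache_dir):
        suffix = file_name.split("_")[-1]  # e.g. "SAMPLE1_seq.tsv" -> "seq.tsv"
        if suffix not in destinations:     # skip strays such as .DS_Store
            continue
        output = destinations[suffix]
        with open(os.path.join(cache_dir, file_name)) as handle:
            for line in handle:
                print(line.strip(), file=output)

    # Close and flush all writers
    for writer in destinations.values():
        writer.close()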

@@ -103,8 +113,8 @@ def biom_preformatter():
     df.index = asv_index
     for index, line in enumerate(lines):
-        if index % 1000 == 0:
-            print(index)
+        if index % 10000 == 0:
+            print(index, len(lines))
         sample, asv, value = line
         df.loc[asv, sample] = value
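
biom_preformatter() now reports progress every 10,000 lines (previously 1,000) and includes the total line count. A self-contained sketch of the fill loop, with made-up (sample, asv, value) triples:

    import pandas as pd

    lines = [("S1", "ASV_1", 10), ("S1", "ASV_2", 3), ("S2", "ASV_1", 7)]
    asv_index = sorted({asv for _, asv, _ in lines})
    samples = sorted({sample for sample, _, _ in lines})

    df = pd.DataFrame(0, index=asv_index, columns=samples)
    for index, line in enumerate(lines):
        if index % 10000 == 0:
            print(index, len(lines))  # progress: current position of total
        sample, asv, value = line
        df.loc[asv, sample] = value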
ngtax_to_tsv-fasta.py (+2 −2)
@@ -153,7 +153,7 @@ def generate_met():
         SELECT DISTINCT ?id ?predicate ?object
         WHERE {
             ?assay ?predicate ?object .
-            ?assay a unlock:AmpliconAssay .
+            ?assay a jerm:Assay .
             ?assay schema:identifier ?id .
             FILTER(!ISIRI(?object))
         }
     """)

@@ -176,7 +176,7 @@ def generate_met():
             ?sample ?predicate ?object .
             OPTIONAL { ?predicate rdfs:label ?predicate_label}
             ?sample jerm:hasPart ?assay .
-            ?assay a unlock:AmpliconAssay .
+            ?assay a jerm:Assay .
             ?assay schema:identifier ?id .
             FILTER(!ISIRI(?object))
         }
     """)
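
Both queries now match the generic jerm:Assay type instead of unlock:AmpliconAssay. A runnable rdflib sketch of the updated pattern against a tiny in-memory graph; the example IRI and identifier are invented:

    from rdflib import Graph

    g = Graph()
    g.parse(data="""
        @prefix jerm:   <http://jermontology.org/ontology/JERMOntology#> .
        @prefix schema: <http://schema.org/> .
        <http://example.org/assay/1> a jerm:Assay ;
            schema:identifier "A_001" .
    """, format="turtle")

    qres = g.query("""
        PREFIX jerm:   <http://jermontology.org/ontology/JERMOntology#>
        PREFIX schema: <http://schema.org/>
        SELECT DISTINCT ?id ?predicate ?object
        WHERE {
            ?assay ?predicate ?object .
            ?assay a jerm:Assay .
            ?assay schema:identifier ?id .
            FILTER(!ISIRI(?object))
        }
    """)
    for row in qres:
        print(row.id, row.predicate, row.object)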
picrust_to_biom.py (+34 −29)
@@ -7,6 +7,7 @@ __license__ = "CC0"
 __version__ = "1.0.0"
 __status__ = "Development"
 
 import argparse
+import gzip
 import json
 from irods.session import iRODSSession

@@ -14,8 +15,12 @@ import ssl
 import os
 import argparse
 import pandas as pd
-import rdflib
+from rdflib.namespace import Namespace
+from rdflib import Graph
 import irods.keywords as kw
+from rdflib.term import Literal
+from biom import load_table
+from collections import OrderedDict
 
 host = os.getenv('irodsHost')
 port = os.getenv('irodsPort')
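
host and port are read from the environment. For context, this is roughly how such variables combine into a python-irodsclient session; the user, password, and zone variable names below are assumptions, not taken from this script:

    import os
    from irods.session import iRODSSession

    host = os.getenv('irodsHost')
    port = os.getenv('irodsPort')
    user = os.getenv('irodsUserName')      # assumed variable name
    password = os.getenv('irodsPassword')  # assumed variable name
    zone = os.getenv('irodsZone')          # assumed variable name

    with iRODSSession(host=host, port=port, user=user, password=password, zone=zone) as session:
        pass  # e.g. session.data_objects.get(remote_path, local_path)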

@@ -36,6 +41,9 @@ ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',
 # Obtain the data for the biom files
 def process_job_file():
+    if not os.path.isdir("./irods"):
+        os.mkdir("irods")
+
     # Connect with irods
     with iRODSSession(host=host,

@@ -59,18 +67,16 @@ def process_job_file():
             writers.append(writer)
         # For line in the job file
-        for line in open(args.job):
+        lines = open(args.job).readlines()
+        for index, line in enumerate(lines):
             line = line.strip()  # .decode("UTF-8").strip()
-            print("Processing", line)
+            print("Processing", index, "of", len(lines), line)
             # Metadata file
             if line.endswith(".ttl"):
-                rdf_file = session.data_objects.get(line)
-                name = rdf_file.path.split("/")[-1]
-                output = open(name, "w")
-                for line in rdf_file.open():
-                    print(line.decode("UTF-8").strip(), file=output)
-                output.close()
-                process_rdf_files(name)
+                file_name = "./irods/" + line.split("/")[-1]
+                if not os.path.isfile(file_name):
+                    session.data_objects.get(line, file_name, **options)
+                process_rdf_files(file_name)
             else:
                 # List the files in this folder using the walk function
                 if not session.collections.exists(line):
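
The rewritten .ttl branch caches downloads: the object is fetched into ./irods only when it is not already on disk, so re-runs reuse the cache. The same guard in isolation, with fetch() as a stand-in for session.data_objects.get:

    import os

    def fetch(remote_path, local_path):
        # placeholder for session.data_objects.get(remote_path, local_path, **options)
        raise NotImplementedError

    def ensure_local(line, cache_dir="./irods"):
        os.makedirs(cache_dir, exist_ok=True)
        file_name = os.path.join(cache_dir, line.split("/")[-1])
        if not os.path.isfile(file_name):  # download only when missing
            fetch(line, file_name)
        return file_name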

@@ -99,11 +105,20 @@ def process_job_file():
                     output = writers[5]
                 # If no match was found output is still None
                 if output == None:
                     continue
+                # Assay split
+                if "/A_" in file.path:
+                    sample_id = file.path.split("/A_")[-1].split("/")[0]
+                if "/ASY_" in file.path:
+                    sample_id = file.path.split("/ASY_")[-1].split("/")[0]
                 # For each line in this file
-                file_name = file.path.split("/")[-1]
-                session.data_objects.get(file.path, file_name, **options)
-                sample_id = file.path.split("/A_")[-1].split("/")[0]
+                file_name = "./irods/" + sample_id + "_" + file.path.split("/")[-1]
+                # Download when not available
+                if not os.path.isfile(file_name):
+                    session.data_objects.get(file.path, file_name, **options)
+                content = gzip.open(file_name, mode='r').readlines()
+                for line_file in content:
+                    line_file = line_file.decode("UTF-8").strip()
                 # Skip function line
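
This hunk is what the new gzip import is for: the per-sample files are now read as gzip archives and decoded line by line. A standalone sketch that writes a throwaway archive first so it runs anywhere:

    import gzip

    # Create a small gzipped TSV to read back
    with gzip.open("example.tsv.gz", mode="w") as handle:
        handle.write(b"pathway\tsample1\nPWY-101\t4\n")

    content = gzip.open("example.tsv.gz", mode="r").readlines()
    for line_file in content:
        line_file = line_file.decode("UTF-8").strip()
        print(line_file)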

@@ -112,6 +127,7 @@ def process_job_file():
                     print(line, file=output)
     # Close and flush all writers
+    print("Closing all writers")
     for writer in writers:
         writer.close()

@@ -129,7 +145,6 @@ def process_job_file():
     df = df.pivot(index='X', columns='Y', values='Z')
     df.to_csv(file, sep="\t", index_label=False)
+    metadata_file = "metadata.picrust.tsv"
+    remove_duplicates(metadata_file)
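
The long-format table (columns literally named X, Y and Z in this script) is pivoted into a matrix before writing. With toy data:

    import pandas as pd

    df = pd.DataFrame({
        "X": ["PWY-101", "PWY-101", "PWY-102"],  # row labels (index)
        "Y": ["S1", "S2", "S1"],                 # column labels
        "Z": [4, 0, 7],                          # cell values
    })
    df = df.pivot(index='X', columns='Y', values='Z')
    df.to_csv("matrix.tsv", sep="\t", index_label=False)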

@@ -177,18 +192,13 @@ def remove_duplicates(input_file):
     output.close()
 
 def process_rdf_files(rdf_file):
-    g = rdflib.Graph()
+    print("Processing rdf file", rdf_file)
+    g = Graph()
+    g.bind("unlock", Namespace("http://m-unlock.nl/ontology/"))
+    g.bind("schema", Namespace("http://schema.org/"))
     g.parse(rdf_file, format="turtle")
     output = open("metadata.picrust.tsv", "a")
     qres = g.query("""
-        PREFIX gbol:<http://gbol.life/0.1/>
-        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
-        PREFIX ssb: <http://ssb.wur.nl/>
-        PREFIX ns1: <http://ssb.wur.nl/model/>
+        PREFIX unlock: <http://m-unlock.nl/ontology/>
+        PREFIX jerm: <http://jermontology.org/ontology/JERMOntology#>
+        PREFIX schema: <http://schema.org/>
         SELECT DISTINCT ?id ?predicate ?object
         WHERE {
             ?assay ?predicate ?object .

@@ -228,7 +238,7 @@ def process_rdf_files(rdf_file):
     for row in qres:
         predicate = "sample_" + row.predicate.split("/")[-1]
-        if type(row.predicate_label) == rdflib.term.Literal:
+        if type(row.predicate_label) == Literal:
             predicate = "sample_" + row.predicate_label.replace(" ", "_")
         identifier = row.id
         obj = row.object
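
With Literal imported at module level, the label check drops the rdflib.term prefix. Both spellings below behave identically; isinstance would be the more idiomatic test, though the commit keeps type() ==:

    from rdflib.term import Literal

    label = Literal("sample name")
    if type(label) == Literal:      # the commit's style
        print("sample_" + label.replace(" ", "_"))
    if isinstance(label, Literal):  # equivalent and more idiomatic
        print("sample_" + label.replace(" ", "_"))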

@@ -243,9 +253,6 @@ def process_rdf_files(rdf_file):
 def tsv_to_biom(input_file):
-    from biom import load_table
-    from collections import OrderedDict
-
     # Formatting
     try:
         result = pd.read_table(input_file)
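
tsv_to_biom() starts from pd.read_table; with the biom imports now at module level, a dense matrix can be converted roughly as follows. A sketch against the biom-format package, reusing matrix.tsv from the pivot example above, not the repository's exact code:

    import pandas as pd
    from biom.table import Table

    result = pd.read_table("matrix.tsv", index_col=0).fillna(0)  # missing combinations as zero counts
    table = Table(result.values,
                  observation_ids=list(result.index),
                  sample_ids=list(result.columns))
    with open("matrix.biom", "w") as handle:
        handle.write(table.to_json("picrust_to_biom"))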

@@ -291,8 +298,6 @@ def tsv_to_biom(input_file):
 if __name__ == "__main__":
-    import argparse
-
     parser = argparse.ArgumentParser(description='Biom file creation')
     parser.add_argument('-j', '--job', help='input Job file', required=True)
     parser.add_argument('-i', '--identifier', help='Prefix identifier', required=True)
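
Both arguments are required, so an invocation looks like this (job.txt and RUN1 are placeholder values):

    python picrust_to_biom.py -j job.txt -i RUN1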
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment