Commit 0e02a472 authored by Jasper Koehorst

ontology update

parent 7a4418f2
@@ -45,18 +45,23 @@ def process_job_file():
# Obtain the job file
# job_file = session.data_objects.get(args.job)
# Create store directory
if not os.path.isdir("irods"):
os.mkdir("irods")
destinations = {
"seq.tsv" : open("seq.tsv", "w"),
"asv.tsv" : open("asv.tsv", "w"),
"tax.tsv" : open("tax.tsv", "w"),
"met.tsv" : open("met.tsv", "w")
}
# For line in the job file
for line in open(args.job):
lines = open(args.job).readlines()
for index, line in enumerate(lines):
line = line.strip() # .decode("UTF-8").strip()
print("Processing", line)
print("Processing", index,"of",len(lines), line)
# Older versions of the job file included a ttl project file, which is no longer needed with the latest workflow
if line.endswith(".ttl"): continue
# List the files in this folder using the walk function
@@ -65,14 +70,19 @@ def process_job_file():
for root, dirs, files in walk:
# For each file in this directory and its subdirectories
for file in files:
if os.path.exists("./irods/" + file.name): continue
# Extra check to skip files accidentally created there, e.g. .DS_Store
if file.path.split("_")[-1] in destinations:
# Obtain output writer
output = destinations[file.path.split("_")[-1]]
# For each line in this file
for line_file in file.open():
line_file = line_file.decode("UTF-8").strip()
print(line_file, file=output)
# Download the file
session.data_objects.get(file.path, "./irods/" + file.name)
for file_name in os.listdir("./irods"):
# Obtain output writer
output = destinations[file_name.split("_")[-1]]
for line_file in open("./irods/" + file_name):
line_file = line_file.strip()
print(line_file, file=output)
# Close and flush all writers
for writer in destinations:
destinations[writer].close()
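The updated loop no longer streams each data object with file.open(); it first downloads the object into a local ./irods cache and then reads the plain-text copy. A minimal sketch of that download-then-read pattern with python-irodsclient, assuming an open session, the destinations mapping above, and a placeholder collection path:

import os

# Sketch only: "/tempZone/home/project" is a placeholder collection path.
os.makedirs("irods", exist_ok=True)
collection = session.collections.get("/tempZone/home/project")
for root, dirs, files in collection.walk():
    for obj in files:
        local_path = os.path.join("irods", obj.name)
        if not os.path.exists(local_path):
            # Download once; reruns reuse the cached copy
            session.data_objects.get(obj.path, local_path)
        suffix = obj.path.split("_")[-1]
        if suffix in destinations:
            with open(local_path) as handle:
                for line in handle:
                    print(line.strip(), file=destinations[suffix])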
@@ -103,8 +113,8 @@ def biom_preformatter():
df.index = asv_index
for index, line in enumerate(lines):
if index % 1000 == 0:
print(index)
if index % 10000 == 0:
print(index, len(lines))
sample, asv, value = line
df.loc[asv, sample]=value
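After this change biom_preformatter logs progress every 10,000 rows while filling the ASV-by-sample matrix one cell at a time. A small self-contained sketch of the same fill pattern, using toy data and illustrative names:

import pandas as pd

# Toy (sample, asv, value) triples; the script derives these from the merged TSV files.
lines = [("S1", "ASV_1", 10), ("S1", "ASV_2", 3), ("S2", "ASV_1", 7)]
asv_index = sorted({asv for _, asv, _ in lines})
samples = sorted({sample for sample, _, _ in lines})
df = pd.DataFrame(0, index=asv_index, columns=samples)
for index, (sample, asv, value) in enumerate(lines):
    if index % 10000 == 0:
        print(index, len(lines))  # progress logging, as in the diff
    df.loc[asv, sample] = value
print(df)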
@@ -153,7 +153,7 @@ def generate_met():
SELECT DISTINCT ?id ?predicate ?object
WHERE {
?assay ?predicate ?object .
?assay a unlock:AmpliconAssay .
?assay a jerm:Assay .
?assay schema:identifier ?id .
FILTER(!ISIRI(?object))
}""")
@@ -176,7 +176,7 @@ def generate_met():
?sample ?predicate ?object .
OPTIONAL { ?predicate rdfs:label ?predicate_label}
?sample jerm:hasPart ?assay .
?assay a unlock:AmpliconAssay .
?assay a jerm:Assay .
?assay schema:identifier ?id .
FILTER(!ISIRI(?object))
}""")
@@ -7,6 +7,7 @@ __license__ = "CC0"
__version__ = "1.0.0"
__status__ = "Development"
import argparse
import gzip
import json
from irods.session import iRODSSession
@@ -14,8 +15,12 @@ import ssl
import os
import argparse
import pandas as pd
import rdflib
from rdflib.namespace import Namespace
from rdflib import Graph
import irods.keywords as kw
from rdflib.term import Literal
from biom import load_table
from collections import OrderedDict
host = os.getenv('irodsHost')
port = os.getenv('irodsPort')
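The connection settings come from irods* environment variables combined with an SSL settings dictionary. A minimal sketch of opening the session with python-irodsclient; the user, password and zone variable names are assumptions, and the remaining SSL keys fall outside the shown hunk:

import os
import ssl
from irods.session import iRODSSession

host = os.getenv('irodsHost')
port = os.getenv('irodsPort')
# Assumed variable names; the full set is not visible in this hunk.
user = os.getenv('irodsUserName')
password = os.getenv('irodsPassword')
zone = os.getenv('irodsZone')

ssl_context = ssl.create_default_context()
ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',
                'ssl_context': ssl_context}  # remaining keys omitted in this sketch

with iRODSSession(host=host, port=port, user=user, password=password,
                  zone=zone, **ssl_settings) as session:
    print(session.server_version)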
@@ -36,6 +41,9 @@ ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',
# Obtain the data for the biom files
def process_job_file():
if not os.path.isdir("./irods"):
os.mkdir("irods")
# Connect with irods
with iRODSSession(
host = host,
@@ -59,18 +67,16 @@ def process_job_file():
writers.append(writer)
# For line in the job file
for line in open(args.job):
lines = open(args.job).readlines()
for index, line in enumerate(lines):
line = line.strip() # .decode("UTF-8").strip()
print("Processing", line)
print("Processing", index, "of", len(lines), line)
# Metadata file
if line.endswith(".ttl"):
rdf_file = session.data_objects.get(line)
name = rdf_file.path.split("/")[-1]
output = open(name, "w")
for line in rdf_file.open():
print(line.decode("UTF-8").strip(), file=output)
output.close()
process_rdf_files(name)
file_name = "./irods/" + line.split("/")[-1]
if not os.path.isfile(file_name):
session.data_objects.get(line, file_name, **options)
process_rdf_files(file_name)
else:
# List the files in this folder using the walk function
if not session.collections.exists(line):
@@ -99,11 +105,20 @@ def process_job_file():
output = writers[5]
# If no match was found output is still None
if output == None: continue
# Derive the sample identifier from the assay folder (A_ or ASY_ prefix) in the path
if "/A_" in file.path:
sample_id = file.path.split("/A_")[-1].split("/")[0]
if "/ASY_" in file.path:
sample_id = file.path.split("/ASY_")[-1].split("/")[0]
# For each line in this file
file_name = file.path.split("/")[-1]
session.data_objects.get(file.path, file_name, **options)
sample_id = file.path.split("/A_")[-1].split("/")[0]
file_name = "./irods/" + sample_id + "_" + file.path.split("/")[-1]
# Download when not available
if not os.path.isfile(file_name):
session.data_objects.get(file.path, file_name, **options)
content = gzip.open(file_name, mode='r').readlines()
for line_file in content:
line_file = line_file.decode("UTF-8").strip()
# Skip function line
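The per-assay tables are gzip-compressed, so the new code downloads them once and reads their lines through gzip, decoding each line before writing it out. A minimal sketch of that read loop; the file path is a placeholder, and treating the header row as starting with "function" (suggested by the "Skip function line" comment) is an assumption:

import gzip

file_name = "./irods/sample1_pred_metagenome.tsv.gz"  # placeholder path
with gzip.open(file_name, mode='r') as handle:
    for raw_line in handle:
        text = raw_line.decode("UTF-8").strip()
        if text.startswith("function"):  # assumed header row, skipped
            continue
        print(text)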
@@ -112,6 +127,7 @@ def process_job_file():
print(line, file=output)
# Close and flush all writers
print("Closing all writers")
for writer in writers:
writer.close()
@@ -129,7 +145,6 @@ def process_job_file():
df = df.pivot(index='X', columns='Y', values='Z')
df.to_csv(file, sep="\t", index_label=False )
metadata_file = "metadata.picrust.tsv"
remove_duplicates(metadata_file)
@@ -177,18 +192,13 @@ def remove_duplicates(input_file):
output.close()
def process_rdf_files(rdf_file):
g = rdflib.Graph()
print("Processing rdf file", rdf_file)
g = Graph()
g.bind("unlock", Namespace("http://m-unlock.nl/ontology/"))
g.bind("schema", Namespace("http://schema.org/"))
g.parse(rdf_file, format="turtle")
output = open("metadata.picrust.tsv", "a")
qres = g.query("""
PREFIX gbol:<http://gbol.life/0.1/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX ssb: <http://ssb.wur.nl/>
PREFIX ns1: <http://ssb.wur.nl/model/>
PREFIX unlock: <http://m-unlock.nl/ontology/>
PREFIX jerm: <http://jermontology.org/ontology/JERMOntology#>
PREFIX schema: <http://schema.org/>
SELECT DISTINCT ?id ?predicate ?object
WHERE {
?assay ?predicate ?object .
@@ -228,7 +238,7 @@ def process_rdf_files(rdf_file):
for row in qres:
predicate = "sample_" + row.predicate.split("/")[-1]
if type(row.predicate_label) == rdflib.term.Literal:
if type(row.predicate_label) == Literal:
predicate = "sample_" + row.predicate_label.replace(" ","_")
identifier = row.id
obj = row.object
@@ -243,9 +253,6 @@ def tsv_to_biom(input_file):
def tsv_to_biom(input_file):
from biom import load_table
from collections import OrderedDict
# Formatting
try:
result = pd.read_table(input_file)
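tsv_to_biom now relies on the biom imports moved to module level in this commit (load_table, OrderedDict). A minimal sketch of converting a counts table into a BIOM file with the biom package, using toy data instead of the script's merged TSV:

import pandas as pd
from biom.table import Table

# Toy counts; the script builds its table from pd.read_table(input_file) instead.
df = pd.DataFrame([[10, 0], [3, 7]], index=["ASV_1", "ASV_2"], columns=["S1", "S2"])
table = Table(df.values, observation_ids=list(df.index), sample_ids=list(df.columns))
with open("table.biom", "w") as out:
    out.write(table.to_json("tsv_to_biom sketch"))

Reading the result back with load_table("table.biom"), imported at the top of the module in this commit, verifies the round trip.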
@@ -291,8 +298,6 @@ def tsv_to_biom(input_file):
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description='Biom file creation')
parser.add_argument('-j', '--job', help='input Job file', required=True)
parser.add_argument('-i', '--identifier', help='Prefix identifier', required=True)