Commit 0e02a472 authored by Jasper Koehorst

ontology update

parent 7a4418f2
@@ -45,18 +45,23 @@ def process_job_file():
# Obtain the job file
# job_file = session.data_objects.get(args.job)
# Create store directory
if not os.path.isdir("irods"):
os.mkdir("irods")
destinations = {
"seq.tsv" : open("seq.tsv", "w"),
"asv.tsv" : open("asv.tsv", "w"),
"tax.tsv" : open("tax.tsv", "w"),
"met.tsv" : open("met.tsv", "w")
}
# For line in the job file
for line in open(args.job):
lines = open(args.job).readlines()
for index, line in enumerate(lines):
line = line.strip() # .decode("UTF-8").strip()
print("Processing", line)
print("Processing", index,"of",len(lines), line)
# Older versions of the job file included a ttl project file, which is no longer needed with the latest workflow
if line.endswith(".ttl"): continue
# List the files in this folder using the walk function
@@ -65,14 +70,19 @@ def process_job_file():
for root, dirs, files in walk:
# For each file in this directory and its subdirectories
for file in files:
if os.path.exists("./irods/" + file.name): continue
# Extra check to skip files accidentally created there, e.g. .DS_Store
if file.path.split("_")[-1] in destinations:
# Obtain output writer
output = destinations[file.path.split("_")[-1]]
# For each line in this file
for line_file in file.open():
line_file = line_file.decode("UTF-8").strip()
print(line_file, file=output)
# Download the file
session.data_objects.get(file.path, "./irods/" + file.name)
for file_name in os.listdir("./irods"):
# Obtain output writer
output = destinations[file_name.split("_")[-1]]
for line_file in open("./irods/" + file_name):
line_file = line_file.strip()
print(line_file, file=output)
# Close and flush all writers
for writer in destinations:
destinations[writer].close()
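The updated loop no longer streams each data object with file.open(); it first downloads the object into a local ./irods cache and then reads the plain-text copy. A minimal sketch of that download-then-read pattern with python-irodsclient, assuming an open session, the destinations mapping above, and a placeholder collection path:

import os

# Sketch only: "/tempZone/home/project" is a placeholder collection path.
os.makedirs("irods", exist_ok=True)
collection = session.collections.get("/tempZone/home/project")
for root, dirs, files in collection.walk():
    for obj in files:
        local_path = os.path.join("irods", obj.name)
        if not os.path.exists(local_path):
            # Download once; reruns reuse the cached copy
            session.data_objects.get(obj.path, local_path)
        suffix = obj.path.split("_")[-1]
        if suffix in destinations:
            with open(local_path) as handle:
                for line in handle:
                    print(line.strip(), file=destinations[suffix])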
@@ -103,8 +113,8 @@ def biom_preformatter():
df.index = asv_index
for index, line in enumerate(lines):
if index % 1000 == 0:
print(index)
if index % 10000 == 0:
print(index, len(lines))
sample, asv, value = line
df.loc[asv, sample]=value
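After this change biom_preformatter logs progress every 10,000 rows while filling the ASV-by-sample matrix one cell at a time. A small self-contained sketch of the same fill pattern, using toy data and illustrative names:

import pandas as pd

# Toy (sample, asv, value) triples; the script derives these from the merged TSV files.
lines = [("S1", "ASV_1", 10), ("S1", "ASV_2", 3), ("S2", "ASV_1", 7)]
asv_index = sorted({asv for _, asv, _ in lines})
samples = sorted({sample for sample, _, _ in lines})
df = pd.DataFrame(0, index=asv_index, columns=samples)
for index, (sample, asv, value) in enumerate(lines):
    if index % 10000 == 0:
        print(index, len(lines))  # progress logging, as in the diff
    df.loc[asv, sample] = value
print(df)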
@@ -153,7 +153,7 @@ def generate_met():
SELECT DISTINCT ?id ?predicate ?object
WHERE {
?assay ?predicate ?object .
?assay a unlock:AmpliconAssay .
?assay a jerm:Assay .
?assay schema:identifier ?id .
FILTER(!ISIRI(?object))
}""")
@@ -176,7 +176,7 @@ def generate_met():
?sample ?predicate ?object .
OPTIONAL { ?predicate rdfs:label ?predicate_label}
?sample jerm:hasPart ?assay .
?assay a unlock:AmpliconAssay .
?assay a jerm:Assay .
?assay schema:identifier ?id .
FILTER(!ISIRI(?object))
}""")
@@ -7,6 +7,7 @@ __license__ = "CC0"
__version__ = "1.0.0"
__status__ = "Development"
import argparse
import gzip
import json
from irods.session import iRODSSession
@@ -14,8 +15,12 @@ import ssl
import os
import argparse
import pandas as pd
import rdflib
from rdflib.namespace import Namespace
from rdflib import Graph
import irods.keywords as kw
from rdflib.term import Literal
from biom import load_table
from collections import OrderedDict
host = os.getenv('irodsHost')
port = os.getenv('irodsPort')
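The connection settings come from irods* environment variables combined with an SSL settings dictionary. A minimal sketch of opening the session with python-irodsclient; the user, password and zone variable names are assumptions, and the remaining SSL keys fall outside the shown hunk:

import os
import ssl
from irods.session import iRODSSession

host = os.getenv('irodsHost')
port = os.getenv('irodsPort')
# Assumed variable names; the full set is not visible in this hunk.
user = os.getenv('irodsUserName')
password = os.getenv('irodsPassword')
zone = os.getenv('irodsZone')

ssl_context = ssl.create_default_context()
ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',
                'ssl_context': ssl_context}  # remaining keys omitted in this sketch

with iRODSSession(host=host, port=port, user=user, password=password,
                  zone=zone, **ssl_settings) as session:
    print(session.server_version)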
@@ -36,6 +41,9 @@ ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',
# Obtain the data for the biom files
def process_job_file():
if not os.path.isdir("./irods"):
os.mkdir("irods")
# Connect with irods
with iRODSSession(
host = host,
@@ -59,18 +67,16 @@ def process_job_file():
writers.append(writer)
# For line in the job file
for line in open(args.job):
lines = open(args.job).readlines()
for index, line in enumerate(lines):
line = line.strip() # .decode("UTF-8").strip()
print("Processing", line)
print("Processing", index, "of", len(lines), line)
# Metadata file
if line.endswith(".ttl"):
rdf_file = session.data_objects.get(line)
name = rdf_file.path.split("/")[-1]
output = open(name, "w")
for line in rdf_file.open():
print(line.decode("UTF-8").strip(), file=output)
output.close()
process_rdf_files(name)
file_name = "./irods/" + line.split("/")[-1]
if not os.path.isfile(file_name):
session.data_objects.get(line, file_name, **options)
process_rdf_files(file_name)
else:
# List the files in this folder using the walk function
if not session.collections.exists(line):
@@ -99,11 +105,20 @@ def process_job_file():
output = writers[5]
# If no match was found output is still None
if output == None: continue
# Derive the sample identifier from the assay folder (A_ or ASY_ prefix) in the path
if "/A_" in file.path:
sample_id = file.path.split("/A_")[-1].split("/")[0]
if "/ASY_" in file.path:
sample_id = file.path.split("/ASY_")[-1].split("/")[0]
# For each line in this file
file_name = file.path.split("/")[-1]
session.data_objects.get(file.path, file_name, **options)
sample_id = file.path.split("/A_")[-1].split("/")[0]
file_name = "./irods/" + sample_id + "_" + file.path.split("/")[-1]
# Download when not available
if not os.path.isfile(file_name):
session.data_objects.get(file.path, file_name, **options)
content = gzip.open(file_name, mode='r').readlines()
for line_file in content:
line_file = line_file.decode("UTF-8").strip()
# Skip function line
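The per-assay tables are gzip-compressed, so the new code downloads them once and reads their lines through gzip, decoding each line before writing it out. A minimal sketch of that read loop; the file path is a placeholder, and treating the header row as starting with "function" (suggested by the "Skip function line" comment) is an assumption:

import gzip

file_name = "./irods/sample1_pred_metagenome.tsv.gz"  # placeholder path
with gzip.open(file_name, mode='r') as handle:
    for raw_line in handle:
        text = raw_line.decode("UTF-8").strip()
        if text.startswith("function"):  # assumed header row, skipped
            continue
        print(text)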
@@ -112,6 +127,7 @@ def process_job_file():
print(line, file=output)
# Close and flush all writers
print("Closing all writers")
for writer in writers:
writer.close()
@@ -129,7 +145,6 @@ def process_job_file():
df = df.pivot(index='X', columns='Y', values='Z')
df.to_csv(file, sep="\t", index_label=False )
metadata_file = "metadata.picrust.tsv"
remove_duplicates(metadata_file)
@@ -177,18 +192,13 @@ def remove_duplicates(input_file):
output.close()
def process_rdf_files(rdf_file):
g = rdflib.Graph()
print("Processing rdf file", rdf_file)
g = Graph()
g.bind("unlock", Namespace("http://m-unlock.nl/ontology/"))
g.bind("schema", Namespace("http://schema.org/"))
g.parse(rdf_file, format="turtle")
output = open("metadata.picrust.tsv", "a")
qres = g.query("""
PREFIX gbol:<http://gbol.life/0.1/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX ssb: <http://ssb.wur.nl/>
PREFIX ns1: <http://ssb.wur.nl/model/>
PREFIX unlock: <http://m-unlock.nl/ontology/>
PREFIX jerm: <http://jermontology.org/ontology/JERMOntology#>
PREFIX schema: <http://schema.org/>
SELECT DISTINCT ?id ?predicate ?object
WHERE {
?assay ?predicate ?object .
@@ -228,7 +238,7 @@ def process_rdf_files(rdf_file):
for row in qres:
predicate = "sample_" + row.predicate.split("/")[-1]
if type(row.predicate_label) == rdflib.term.Literal:
if type(row.predicate_label) == Literal:
predicate = "sample_" + row.predicate_label.replace(" ","_")
identifier = row.id
obj = row.object
@@ -243,9 +253,6 @@ def tsv_to_biom(input_file):
def tsv_to_biom(input_file):
from biom import load_table
from collections import OrderedDict
# Formatting
try:
result = pd.read_table(input_file)
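tsv_to_biom now relies on the biom imports moved to module level in this commit (load_table, OrderedDict). A minimal sketch of converting a counts table into a BIOM file with the biom package, using toy data instead of the script's merged TSV:

import pandas as pd
from biom.table import Table

# Toy counts; the script builds its table from pd.read_table(input_file) instead.
df = pd.DataFrame([[10, 0], [3, 7]], index=["ASV_1", "ASV_2"], columns=["S1", "S2"])
table = Table(df.values, observation_ids=list(df.index), sample_ids=list(df.columns))
with open("table.biom", "w") as out:
    out.write(table.to_json("tsv_to_biom sketch"))

Reading the result back with load_table("table.biom"), imported at the top of the module in this commit, verifies the round trip.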
@@ -291,8 +298,6 @@ def tsv_to_biom(input_file):
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description='Biom file creation')
parser.add_argument('-j', '--job', help='input Job file', required=True)
parser.add_argument('-i', '--identifier', help='Prefix identifier', required=True)