Skip to content
Snippets Groups Projects
Commit 59b50172 authored by Jasper Koehorst's avatar Jasper Koehorst
Browse files

biom creation for picrust data

parent 27198b8f
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/python3
# Builds BIOM files from PICRUSt2 "pred_metagenome_unstrat" output stored in iRODS.
__author__ = "Bart Nijsse and Jasper Koehorst"
__copyright__ = "Copyright 2021, UNLOCK"
__credits__ = ["Bart Nijsse", "Jasper Koehorst"]
__license__ = "CC0"
__version__ = "1.0.0"
__status__ = "Development"
import gzip
import json
from irods.session import iRODSSession
import ssl
import os
import argparse
# NOTE(review): `jedi` is an editor auto-completion library; this import looks like
# an accidental IDE insertion and the imported `file_name` is shadowed locally
# in process_job_file — TODO confirm and remove.
from jedi.api import file_name
import pandas as pd
import rdflib
import base64
import irods.keywords as kw
# iRODS connection settings are taken from the environment (set by the UNLOCK runtime).
host = os.getenv('irodsHost')
port = os.getenv('irodsPort')
zone = os.getenv('irodsZone')
user = os.getenv('irodsUserName')
password = os.getenv('irodsPassword')
# SSL settings
context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, cafile=None, capath=None, cadata=None)
# Client/server negotiation options passed into iRODSSession (CS_NEG_REQUIRE forces SSL).
ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',
'irods_client_server_policy': 'CS_NEG_REQUIRE',
'irods_encryption_algorithm': 'AES-256-CBC',
'irods_encryption_key_size': 32,
'irods_encryption_num_hash_rounds': 16,
'irods_encryption_salt_size': 8,
'ssl_context': context}
# Obtain the data for the biom files
def process_job_file():
    """Collect PICRUSt2 prediction tables from iRODS and convert them to BIOM.

    Reads the job file named by the global ``args.job``; every line is either
    an iRODS path to a ``.ttl`` metadata file (downloaded and fed to
    :func:`process_rdf_files`) or an iRODS collection that is walked for
    ``pred_metagenome_unstrat.tsv`` files.  Per-sample rows are appended to one
    combined three-column TSV per database (COG/EC/KO/PFAM/TIGRFAM), each
    combined TSV is pivoted into a sample-by-function matrix, and finally
    converted to BIOM via :func:`tsv_to_biom`.

    Side effects: writes the combined TSVs, ``metadata.picrust.tsv`` (via
    process_rdf_files) and one ``.biom`` file per combined TSV in the CWD.
    """
    # Connect with irods
    with iRODSSession(
            host=host,
            port=port,
            user=user,
            password=password,
            zone=zone,
            **ssl_settings) as session:
        # Overwrite local copies if they already exist.
        options = {kw.FORCE_FLAG_KW: True}
        file_names = ["combined_COG_pred_metagenome_unstrat.tsv", "combined_EC_pred_metagenome_unstrat.tsv", "combined_KO_pred_metagenome_unstrat.tsv", "combined_PFAM_pred_metagenome_unstrat.tsv", "combined_TIGRFAM_pred_metagenome_unstrat.tsv"]
        # Path markers identifying which database a prediction file belongs to,
        # in the same order as file_names (replaces the former if-chain).
        markers = ["COG_metagenome_out", "EC_metagenome_out", "KO_metagenome_out",
                   "PFAM_metagenome_out", "TIGRFAM_metagenome_out"]
        writers = [open(name, "w") for name in file_names]
        try:
            # For line in the job file (was: handle leaked; now closed via with)
            with open(args.job) as job_reader:
                for line in job_reader:
                    line = line.strip()
                    print("Processing", line)
                    # Metadata file
                    if line.endswith(".ttl"):
                        rdf_file = session.data_objects.get(line)
                        name = rdf_file.path.split("/")[-1]
                        # Download the turtle file locally, then parse it.
                        with open(name, "w") as output:
                            for raw in rdf_file.open():
                                print(raw.decode("UTF-8").strip(), file=output)
                        process_rdf_files(name)
                    else:
                        # List the files in this folder using the walk function
                        if not session.collections.exists(line):
                            print("Path does not exists", line)
                            continue
                        # Just loop over the files as they are irods objects
                        for root, dirs, files in session.collections.get(line).walk():
                            for file in files:
                                if "pred_metagenome_unstrat.tsv" not in file.path:
                                    continue
                                # Pick the combined writer matching this file's database
                                output = None
                                for marker, writer in zip(markers, writers):
                                    if marker in file.path:
                                        output = writer
                                        break
                                # If no match was found output is still None (was: == None)
                                if output is None:
                                    continue
                                # Download the gzipped prediction table locally
                                file_name = file.path.split("/")[-1]
                                session.data_objects.get(file.path, file_name, **options)
                                # Sample id is encoded in the iRODS path as /A_<sample>/...
                                sample_id = file.path.split("/A_")[-1].split("/")[0]
                                # was: gzip handle leaked; now closed via with
                                with gzip.open(file_name, mode='r') as gz:
                                    for line_file in gz:
                                        line_file = line_file.decode("UTF-8").strip()
                                        # Skip function line (header)
                                        if "function" in line_file:
                                            continue
                                        print(sample_id + "\t" + line_file, file=output)
        finally:
            # Close and flush all writers before the pivot step re-reads them
            for writer in writers:
                writer.close()
    # Three columns (sample, function, value) to matrix method
    for file in file_names:
        content = []
        with open(file) as reader:
            for line in reader:
                content.append(line.strip().split())
        df = pd.DataFrame(content, columns=['Y', 'X', 'Z'])
        df = df.pivot(index='X', columns='Y', values='Z')
        df.to_csv(file, sep="\t", index_label=False)
    # Pivot the (deduplicated) metadata triples the same way
    metadata = "metadata.picrust.tsv"
    content = []
    with open(metadata) as reader:
        lines = set(reader.readlines())
    for line in lines:
        content.append(line.strip().split("\t"))
    df = pd.DataFrame(content, columns=['Y', 'X', 'Z'])
    df = df.pivot(index='X', columns='Y', values='Z')
    df = df.fillna("None")
    df.to_csv(metadata, sep="\t", index_label=False)
    # Create biom files
    for file in file_names:
        tsv_to_biom(file)
def process_rdf_files(rdf_file):
    """Append assay and sample metadata from a turtle file to metadata.picrust.tsv.

    Parses ``rdf_file`` with rdflib and runs two SPARQL queries: one for the
    literal properties of each AmpliconAssay, one for the literal properties of
    the Sample linked to that assay.  Each result is appended as a
    ``identifier<TAB>predicate<TAB>value`` row; assay predicates are prefixed
    ``assay_`` and sample predicates ``sample_`` (using the rdfs:label, with
    spaces underscored, when one exists).

    :param rdf_file: path to a local turtle (.ttl) file
    """
    g = rdflib.Graph()
    g.parse(rdf_file, format="turtle")
    # was: append handle never closed/flushed; now guaranteed via with
    with open("metadata.picrust.tsv", "a") as output:
        # Literal properties attached directly to each AmpliconAssay
        qres = g.query("""
    PREFIX gbol:<http://gbol.life/0.1/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    PREFIX ssb: <http://ssb.wur.nl/>
    PREFIX ns1: <http://ssb.wur.nl/model/>
    PREFIX unlock: <http://m-unlock.nl/ontology/>
    PREFIX jerm: <http://jermontology.org/ontology/JERMOntology#>
    PREFIX schema: <http://schema.org/>
    SELECT DISTINCT ?id ?predicate ?object
    WHERE {
    ?assay ?predicate ?object .
    ?assay a unlock:AmpliconAssay .
    ?assay schema:identifier ?id .
    FILTER(!ISIRI(?object))
    }""")
        for row in qres:
            predicate = "assay_" + row.predicate.split("/")[-1]
            identifier = row.id
            print(identifier + "\t" + predicate + f"\t{row.object}", file=output)
        # Literal properties of the Sample that hasPart the assay.
        # NOTE: rdfs prefix is now declared explicitly instead of relying on
        # rdflib's default namespace bindings.
        qres = g.query("""
    PREFIX gbol:<http://gbol.life/0.1/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX ssb: <http://ssb.wur.nl/>
    PREFIX ns1: <http://ssb.wur.nl/model/>
    PREFIX unlock: <http://m-unlock.nl/ontology/>
    PREFIX jerm: <http://jermontology.org/ontology/JERMOntology#>
    PREFIX schema: <http://schema.org/>
    SELECT DISTINCT ?id ?predicate ?object ?predicate_label
    WHERE {
    ?sample a jerm:Sample .
    ?sample ?predicate ?object .
    OPTIONAL { ?predicate rdfs:label ?predicate_label}
    ?sample jerm:hasPart ?assay .
    ?assay a unlock:AmpliconAssay .
    ?assay schema:identifier ?id .
    FILTER(!ISIRI(?object))
    }""")
        for row in qres:
            predicate = "sample_" + row.predicate.split("/")[-1]
            # Prefer a human-readable label when the ontology provides one
            if isinstance(row.predicate_label, rdflib.term.Literal):
                predicate = "sample_" + row.predicate_label.replace(" ", "_")
            identifier = row.id
            print(identifier + "\t" + predicate + f"\t{row.object}", file=output)
def tsv_to_biom(input_file):
    """Convert a pivoted sample-by-function TSV into a pretty-printed BIOM JSON file.

    Rounds the counts, rewrites the TSV in classic biom format ("#OTU ID"
    header), loads it with the biom package, attaches the sample metadata from
    ``metadata.picrust.tsv`` plus placeholder taxonomy per row, and writes the
    result next to the input as ``<input>.biom``.

    :param input_file: path to a pivoted TSV produced by process_job_file
    """
    from biom import load_table
    # Formatting: zero-fill gaps and round to whole counts
    result = pd.read_table(input_file)
    result.fillna(0, inplace=True)
    result = result.round(0)
    result.to_csv(input_file, sep="\t")
    # Transform to biom tsv format (was: reader handle leaked)
    with open(input_file) as reader:
        lines = reader.readlines()
    lines.insert(0, "# Automatically generated input file for biom conversion")
    lines[1] = "#OTU ID\t" + lines[1]
    # was: writer never closed before load_table re-read the same path, so
    # buffered data could be missing; the with-block guarantees a flush first.
    with open(input_file, "w") as output:
        for line in lines:
            print(line.strip(), file=output)
    biom_content = load_table(input_file)
    # Load RDF metadata file
    metadata = pd.read_table("metadata.picrust.tsv").to_dict()
    # Add metadata
    biom_content.add_metadata(metadata)
    biom_content.type = "Function table"
    json_data = biom_content.to_json(generated_by="UNLOCK conversion module")
    # Create Python object from JSON string data
    obj = json.loads(json_data)
    # TODO find out where this 0 element comes from (pandas?)
    obj['columns'] = obj['columns'][1:]
    # Placeholder taxonomy so downstream biom consumers find the expected key
    for row in obj['rows']:
        row['metadata'] = {"taxonomy": ["k__NA", "p__NA", "c__NA", "o__NA", "f__NA", "g__NA", "s__NA"]}
    obj['shape'][1] = int(obj['shape'][1]) - 1
    # Pretty Print JSON
    json_formatted_str = json.dumps(obj, indent=4, sort_keys=True)
    biom_file = input_file.replace(".tsv", ".biom")
    print("Writing biom file to", biom_file)
    with open(biom_file, "w") as handle:
        print(json_formatted_str, file=handle)
if __name__ == "__main__":
    # argparse is already imported at module level; the former duplicate
    # function-scope import and the unused `job` alias were removed.
    parser = argparse.ArgumentParser(description='Biom file creation')
    parser.add_argument('-j', '--job', help='input Job file', required=True)
    args = parser.parse_args()
    # Start from a clean metadata file: process_rdf_files appends to it.
    if os.path.isfile("metadata.picrust.tsv"):
        os.remove("metadata.picrust.tsv")
    # Process the job file to create a biom json file
    process_job_file()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment