Skip to content
Snippets Groups Projects
Commit 59b50172 authored by Jasper Koehorst's avatar Jasper Koehorst
Browse files

biom creation for picrust data

parent 27198b8f
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/python3
# Builds BIOM files from PICRUSt2 "pred_metagenome_unstrat" output stored in iRODS.
__author__ = "Bart Nijsse and Jasper Koehorst"
__copyright__ = "Copyright 2021, UNLOCK"
__credits__ = ["Bart Nijsse", "Jasper Koehorst"]
__license__ = "CC0"
__version__ = "1.0.0"
__status__ = "Development"
import gzip
import json
from irods.session import iRODSSession
import ssl
import os
import argparse
# NOTE(review): `jedi` is an editor auto-completion library; this import looks like
# an accidental IDE insertion and the imported `file_name` is shadowed locally
# in process_job_file — TODO confirm and remove.
from jedi.api import file_name
import pandas as pd
import rdflib
import base64
import irods.keywords as kw
# iRODS connection settings are taken from the environment (set by the UNLOCK runtime).
host = os.getenv('irodsHost')
port = os.getenv('irodsPort')
zone = os.getenv('irodsZone')
user = os.getenv('irodsUserName')
password = os.getenv('irodsPassword')
# SSL settings
context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, cafile=None, capath=None, cadata=None)
# Client/server negotiation options passed into iRODSSession (CS_NEG_REQUIRE forces SSL).
ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',
'irods_client_server_policy': 'CS_NEG_REQUIRE',
'irods_encryption_algorithm': 'AES-256-CBC',
'irods_encryption_key_size': 32,
'irods_encryption_num_hash_rounds': 16,
'irods_encryption_salt_size': 8,
'ssl_context': context}
# Obtain the data for the biom files
def process_job_file():
    """Collect PICRUSt2 prediction tables from iRODS and convert them to BIOM.

    Reads the job file named by the global ``args.job``; every line is either
    an iRODS path to a ``.ttl`` metadata file (downloaded and fed to
    :func:`process_rdf_files`) or an iRODS collection that is walked for
    ``pred_metagenome_unstrat.tsv`` files.  Per-sample rows are appended to one
    combined three-column TSV per database (COG/EC/KO/PFAM/TIGRFAM), each
    combined TSV is pivoted into a sample-by-function matrix, and finally
    converted to BIOM via :func:`tsv_to_biom`.

    Side effects: writes the combined TSVs, ``metadata.picrust.tsv`` (via
    process_rdf_files) and one ``.biom`` file per combined TSV in the CWD.
    """
    # Connect with irods
    with iRODSSession(
            host=host,
            port=port,
            user=user,
            password=password,
            zone=zone,
            **ssl_settings) as session:
        # Overwrite local copies if they already exist.
        options = {kw.FORCE_FLAG_KW: True}
        file_names = ["combined_COG_pred_metagenome_unstrat.tsv", "combined_EC_pred_metagenome_unstrat.tsv", "combined_KO_pred_metagenome_unstrat.tsv", "combined_PFAM_pred_metagenome_unstrat.tsv", "combined_TIGRFAM_pred_metagenome_unstrat.tsv"]
        # Path markers identifying which database a prediction file belongs to,
        # in the same order as file_names (replaces the former if-chain).
        markers = ["COG_metagenome_out", "EC_metagenome_out", "KO_metagenome_out",
                   "PFAM_metagenome_out", "TIGRFAM_metagenome_out"]
        writers = [open(name, "w") for name in file_names]
        try:
            # For line in the job file (was: handle leaked; now closed via with)
            with open(args.job) as job_reader:
                for line in job_reader:
                    line = line.strip()
                    print("Processing", line)
                    # Metadata file
                    if line.endswith(".ttl"):
                        rdf_file = session.data_objects.get(line)
                        name = rdf_file.path.split("/")[-1]
                        # Download the turtle file locally, then parse it.
                        with open(name, "w") as output:
                            for raw in rdf_file.open():
                                print(raw.decode("UTF-8").strip(), file=output)
                        process_rdf_files(name)
                    else:
                        # List the files in this folder using the walk function
                        if not session.collections.exists(line):
                            print("Path does not exists", line)
                            continue
                        # Just loop over the files as they are irods objects
                        for root, dirs, files in session.collections.get(line).walk():
                            for file in files:
                                if "pred_metagenome_unstrat.tsv" not in file.path:
                                    continue
                                # Pick the combined writer matching this file's database
                                output = None
                                for marker, writer in zip(markers, writers):
                                    if marker in file.path:
                                        output = writer
                                        break
                                # If no match was found output is still None (was: == None)
                                if output is None:
                                    continue
                                # Download the gzipped prediction table locally
                                file_name = file.path.split("/")[-1]
                                session.data_objects.get(file.path, file_name, **options)
                                # Sample id is encoded in the iRODS path as /A_<sample>/...
                                sample_id = file.path.split("/A_")[-1].split("/")[0]
                                # was: gzip handle leaked; now closed via with
                                with gzip.open(file_name, mode='r') as gz:
                                    for line_file in gz:
                                        line_file = line_file.decode("UTF-8").strip()
                                        # Skip function line (header)
                                        if "function" in line_file:
                                            continue
                                        print(sample_id + "\t" + line_file, file=output)
        finally:
            # Close and flush all writers before the pivot step re-reads them
            for writer in writers:
                writer.close()
    # Three columns (sample, function, value) to matrix method
    for file in file_names:
        content = []
        with open(file) as reader:
            for line in reader:
                content.append(line.strip().split())
        df = pd.DataFrame(content, columns=['Y', 'X', 'Z'])
        df = df.pivot(index='X', columns='Y', values='Z')
        df.to_csv(file, sep="\t", index_label=False)
    # Pivot the (deduplicated) metadata triples the same way
    metadata = "metadata.picrust.tsv"
    content = []
    with open(metadata) as reader:
        lines = set(reader.readlines())
    for line in lines:
        content.append(line.strip().split("\t"))
    df = pd.DataFrame(content, columns=['Y', 'X', 'Z'])
    df = df.pivot(index='X', columns='Y', values='Z')
    df = df.fillna("None")
    df.to_csv(metadata, sep="\t", index_label=False)
    # Create biom files
    for file in file_names:
        tsv_to_biom(file)
def process_rdf_files(rdf_file):
    """Append assay and sample metadata from a turtle file to metadata.picrust.tsv.

    Parses ``rdf_file`` with rdflib and runs two SPARQL queries: one for the
    literal properties of each AmpliconAssay, one for the literal properties of
    the Sample linked to that assay.  Each result is appended as a
    ``identifier<TAB>predicate<TAB>value`` row; assay predicates are prefixed
    ``assay_`` and sample predicates ``sample_`` (using the rdfs:label, with
    spaces underscored, when one exists).

    :param rdf_file: path to a local turtle (.ttl) file
    """
    g = rdflib.Graph()
    g.parse(rdf_file, format="turtle")
    # was: append handle never closed/flushed; now guaranteed via with
    with open("metadata.picrust.tsv", "a") as output:
        # Literal properties attached directly to each AmpliconAssay
        qres = g.query("""
    PREFIX gbol:<http://gbol.life/0.1/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    PREFIX ssb: <http://ssb.wur.nl/>
    PREFIX ns1: <http://ssb.wur.nl/model/>
    PREFIX unlock: <http://m-unlock.nl/ontology/>
    PREFIX jerm: <http://jermontology.org/ontology/JERMOntology#>
    PREFIX schema: <http://schema.org/>
    SELECT DISTINCT ?id ?predicate ?object
    WHERE {
    ?assay ?predicate ?object .
    ?assay a unlock:AmpliconAssay .
    ?assay schema:identifier ?id .
    FILTER(!ISIRI(?object))
    }""")
        for row in qres:
            predicate = "assay_" + row.predicate.split("/")[-1]
            identifier = row.id
            print(identifier + "\t" + predicate + f"\t{row.object}", file=output)
        # Literal properties of the Sample that hasPart the assay.
        # NOTE: rdfs prefix is now declared explicitly instead of relying on
        # rdflib's default namespace bindings.
        qres = g.query("""
    PREFIX gbol:<http://gbol.life/0.1/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX ssb: <http://ssb.wur.nl/>
    PREFIX ns1: <http://ssb.wur.nl/model/>
    PREFIX unlock: <http://m-unlock.nl/ontology/>
    PREFIX jerm: <http://jermontology.org/ontology/JERMOntology#>
    PREFIX schema: <http://schema.org/>
    SELECT DISTINCT ?id ?predicate ?object ?predicate_label
    WHERE {
    ?sample a jerm:Sample .
    ?sample ?predicate ?object .
    OPTIONAL { ?predicate rdfs:label ?predicate_label}
    ?sample jerm:hasPart ?assay .
    ?assay a unlock:AmpliconAssay .
    ?assay schema:identifier ?id .
    FILTER(!ISIRI(?object))
    }""")
        for row in qres:
            predicate = "sample_" + row.predicate.split("/")[-1]
            # Prefer a human-readable label when the ontology provides one
            if isinstance(row.predicate_label, rdflib.term.Literal):
                predicate = "sample_" + row.predicate_label.replace(" ", "_")
            identifier = row.id
            print(identifier + "\t" + predicate + f"\t{row.object}", file=output)
def tsv_to_biom(input_file):
    """Convert a pivoted sample-by-function TSV into a pretty-printed BIOM JSON file.

    Rounds the counts, rewrites the TSV in classic biom format ("#OTU ID"
    header), loads it with the biom package, attaches the sample metadata from
    ``metadata.picrust.tsv`` plus placeholder taxonomy per row, and writes the
    result next to the input as ``<input>.biom``.

    :param input_file: path to a pivoted TSV produced by process_job_file
    """
    from biom import load_table
    # Formatting: zero-fill gaps and round to whole counts
    result = pd.read_table(input_file)
    result.fillna(0, inplace=True)
    result = result.round(0)
    result.to_csv(input_file, sep="\t")
    # Transform to biom tsv format (was: reader handle leaked)
    with open(input_file) as reader:
        lines = reader.readlines()
    lines.insert(0, "# Automatically generated input file for biom conversion")
    lines[1] = "#OTU ID\t" + lines[1]
    # was: writer never closed before load_table re-read the same path, so
    # buffered data could be missing; the with-block guarantees a flush first.
    with open(input_file, "w") as output:
        for line in lines:
            print(line.strip(), file=output)
    biom_content = load_table(input_file)
    # Load RDF metadata file
    metadata = pd.read_table("metadata.picrust.tsv").to_dict()
    # Add metadata
    biom_content.add_metadata(metadata)
    biom_content.type = "Function table"
    json_data = biom_content.to_json(generated_by="UNLOCK conversion module")
    # Create Python object from JSON string data
    obj = json.loads(json_data)
    # TODO find out where this 0 element comes from (pandas?)
    obj['columns'] = obj['columns'][1:]
    # Placeholder taxonomy so downstream biom consumers find the expected key
    for row in obj['rows']:
        row['metadata'] = {"taxonomy": ["k__NA", "p__NA", "c__NA", "o__NA", "f__NA", "g__NA", "s__NA"]}
    obj['shape'][1] = int(obj['shape'][1]) - 1
    # Pretty Print JSON
    json_formatted_str = json.dumps(obj, indent=4, sort_keys=True)
    biom_file = input_file.replace(".tsv", ".biom")
    print("Writing biom file to", biom_file)
    with open(biom_file, "w") as handle:
        print(json_formatted_str, file=handle)
if __name__ == "__main__":
    # argparse is already imported at module level; the former duplicate
    # function-scope import and the unused `job` alias were removed.
    parser = argparse.ArgumentParser(description='Biom file creation')
    parser.add_argument('-j', '--job', help='input Job file', required=True)
    args = parser.parse_args()
    # Start from a clean metadata file: process_rdf_files appends to it.
    if os.path.isfile("metadata.picrust.tsv"):
        os.remove("metadata.picrust.tsv")
    # Process the job file to create a biom json file
    process_job_file()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment