Skip to content
Snippets Groups Projects
Commit 7490d6f6 authored by Koehorst, Jasper's avatar Koehorst, Jasper
Browse files

kraken prepare and local endpoint loader added. biom rmd updates

parent a0ad0ae9
Branches
No related tags found
No related merge requests found
%% Cell type:markdown id: tags:
# SPARQL Endpoint loader
%% Cell type:code id: tags:
``` python
import getpass
# Ask for the account name on standard input; `username` is later passed to
# iRODSSession(user=...) and to HTTPBasicAuth in curl().
print("Username")
username=input()
```
%% Output
Username
koehorst
%% Cell type:code id: tags:
``` python
# Read the password through getpass instead of input(); `password` is later
# passed to iRODSSession(password=...) and to HTTPBasicAuth in curl().
print("Password")
password = getpass.getpass()
```
%% Output
Password
··············
%% Cell type:markdown id: tags:
### SPARQL database functions
%% Cell type:code id: tags:
``` python
def create_sparql_config(repo):
    """Write a GraphDB Free repository configuration file for ``repo``.

    The Turtle template below is filled in with ``repo`` as the
    ``rep:repositoryID`` and written to ``<repo>_config.ttl`` in the current
    working directory. An existing file with that name is overwritten.

    Args:
        repo: Repository identifier; also used as prefix of the file name.
    """
    turtle = """
# RDF4J configuration template for a GraphDB Free repository
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>.
@prefix rep: <http://www.openrdf.org/config/repository#>.
@prefix sr: <http://www.openrdf.org/config/repository/sail#>.
@prefix sail: <http://www.openrdf.org/config/sail#>.
@prefix owlim: <http://www.ontotext.com/trree/owlim#>.
[] a rep:Repository ;
rep:repositoryID \""""+repo+"""\" ;
rdfs:label "GraphDB Free repository" ;
rep:repositoryImpl [
rep:repositoryType "graphdb:FreeSailRepository" ;
sr:sailImpl [
sail:sailType "graphdb:FreeSail" ;
owlim:base-URL "http://gbol.life/0.1/" ;
owlim:defaultNS "" ;
owlim:entity-index-size "10000000" ;
owlim:entity-id-size "32" ;
owlim:imports "" ;
owlim:repository-type "file-repository" ;
owlim:ruleset "rdfsplus-optimized" ;
owlim:storage-folder "storage" ;
owlim:enable-context-index "false" ;
owlim:enablePredicateList "true" ;
owlim:in-memory-literal-properties "true" ;
owlim:enable-literal-index "true" ;
owlim:check-for-inconsistencies "false" ;
owlim:disable-sameAs "false" ;
owlim:query-timeout "0" ;
owlim:query-limit-results "0" ;
owlim:throw-QueryEvaluationException-on-timeout "false" ;
owlim:read-only "false" ;
owlim:nonInterpretablePredicates "http://www.w3.org/2000/01/rdf-schema#label;http://www.w3.org/1999/02/22-rdf-syntax-ns#type;http://www.ontotext.com/owlim/ces#gazetteerConfig;http://www.ontotext.com/owlim/ces#metadataConfig" ;
]
].
"""
    # The context manager closes (and therefore flushes) the file before the
    # caller hands it to another process for upload.
    with open(repo + "_config.ttl", "w", encoding="utf-8") as config_file:
        config_file.write(turtle)
    logger.info("Config file created")
def curl(repo, filename):
    """POST a Turtle file to the statements endpoint of a GraphDB repository.

    The request is authenticated with the notebook-level ``username`` and
    ``password`` and is attempted at most three times. A 502 response is not
    retried.

    Args:
        repo: Identifier of the target repository.
        filename: Path of the Turtle file whose content is uploaded.

    Returns:
        ``True`` when the server answers with status 200 or 204; otherwise
        the ``requests.Response`` of the last attempt.
    """
    url = 'http://graphdb:7200/repositories/' + repo + "/statements"
    # Read the payload inside a context manager so the handle is released
    # even when the upload is retried or fails.
    with open(filename, 'rb') as turtle_file:
        payload = turtle_file.read()
    headers = {'content-type': 'text/turtle'}
    max_attempts = 3
    for index in range(1, max_attempts + 1):
        r = requests.post(url, data=payload, headers=headers, auth=HTTPBasicAuth(username, password), timeout=1000)
        if r.status_code in (200, 204):
            return True
        # Give up after the last attempt, and immediately on a 502 response.
        if index == max_attempts or r.status_code == 502:
            return r
        print("Request post failed... exit code ",r.status_code," retrying ", index)
```
%% Cell type:markdown id: tags:
### iRODS dependencies
These dependencies need to be installed manually if they are not already supplied by the Dockerfile.
%% Cell type:code id: tags:
``` python
import pathlib
# RDF Insertion
from SPARQLWrapper import SPARQLWrapper, JSON
# iRODS
from irods.session import iRODSSession
from irods.column import Criterion
from irods.models import DataObject, DataObjectMeta, Collection, CollectionMeta
from irods.session import iRODSSession
# Python
import os
import shutil
# Logging
import logging
# Authentication curl
import requests
from requests.auth import HTTPBasicAuth
LOGGING_FORMAT = '[%(asctime)-15s][%(levelname)-7s] %(message)s'


def get_logger(name):
    """Return the logger registered under ``name`` with its level set to DEBUG.

    ``logging.basicConfig`` is called with ``LOGGING_FORMAT`` on every
    invocation before the logger is looked up.
    """
    logging.basicConfig(format=LOGGING_FORMAT)
    named_logger = logging.getLogger(name)
    named_logger.setLevel(logging.DEBUG)
    return named_logger


# Logger shared by the notebook cells
logger = get_logger("jupyter")
# Configurations
# IPython %config magic: sets the IPCompleter.greedy option to True.
%config IPCompleter.greedy=True
```
%% Cell type:markdown id: tags:
### iRODS connection
%% Cell type:code id: tags:
``` python
from irods.session import iRODSSession
import ssl
# Connection parameters of the iRODS server
host = "unlock-icat.irods.surfsara.nl"
port = "1247"
zone = "unlock"

# Default SSL context for authenticating the server
context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)

# Client/server negotiation and encryption options forwarded to the session
ssl_settings = {
    'irods_client_server_negotiation': 'request_server_negotiation',
    'irods_client_server_policy': 'CS_NEG_REQUIRE',
    'irods_encryption_algorithm': 'AES-256-CBC',
    'irods_encryption_key_size': 32,
    'irods_encryption_num_hash_rounds': 16,
    'irods_encryption_salt_size': 8,
    'ssl_context': context,
}

# Open the session with the credentials entered earlier in the notebook
session = iRODSSession(
    host=host,
    port=port,
    user=username,
    password=password,
    zone=zone,
    **ssl_settings,
)

# Fetch the zone root collection and log it
coll = session.collections.get('/unlock')
logger.info(coll)
```
%% Output
[2021-03-29 15:25:11,346][INFO ] <iRODSCollection 10005 b'unlock'>
%% Cell type:markdown id: tags:
### List projects with access
This section lists all projects the current user has access to. This can be used to set the `IDENTIFIER` to the right project.
%% Cell type:code id: tags:
``` python
# Log every project collection followed by its investigations (names starting with "I_")
projects = session.collections.get("/unlock/projects").subcollections
for project in projects:
    logger.info(project.name)
    for investigation in project.subcollections:
        if not investigation.name.startswith("I_"):
            continue
        logger.info(f"{project.name}/{investigation.name}")
```
%% Output
[2021-03-29 15:25:11,386][INFO ] P_Deltares
[2021-03-29 15:25:11,407][INFO ] P_Deltares/I_vitens
[2021-03-29 15:25:11,408][INFO ] P_E2BN
[2021-03-29 15:25:11,424][INFO ] P_E2BN/I_PRED
[2021-03-29 15:25:11,425][INFO ] P_EXPLODIV
[2021-03-29 15:25:11,447][INFO ] P_EXPLODIV/I_BIOGAS
[2021-03-29 15:25:11,448][INFO ] P_EXPLODIV/I_UNCOUPLED
[2021-03-29 15:25:11,448][INFO ] P_FIRM-Project
[2021-03-29 15:25:11,467][INFO ] P_FIRM-Project/I_FIRM-Broilers
[2021-03-29 15:25:11,468][INFO ] P_MIB-Amplicon
[2021-03-29 15:25:11,484][INFO ] P_MIB-Amplicon/I_Mocks
[2021-03-29 15:25:11,485][INFO ] P_MIB-Amplicon/I_Poultry_16S_MIB
[2021-03-29 15:25:11,485][INFO ] P_SIAM
[2021-03-29 15:25:11,505][INFO ] P_SIAM/I_DbMM
[2021-03-29 15:25:11,506][INFO ] P_UNLOCK
[2021-03-29 15:25:11,528][INFO ] P_UNLOCK/I_CAMI
[2021-03-29 15:25:11,529][INFO ] P_UNLOCK/I_INVESTIGATION_TEST
[2021-03-29 15:25:11,530][INFO ] P_UNLOCK/I_SRA_Amplicon
%% Cell type:markdown id: tags:
### Downloading metadata RDF files
%% Cell type:code id: tags:
``` python
import subprocess

# `kw` was referenced without ever being imported in this notebook.
import irods.keywords as kw

# For every investigation (I_*): download each .ttl data object, create a
# GraphDB repository named <project>_<investigation> and load the file into it.
projects = session.collections.get("/unlock/projects").subcollections
for project in projects:
    logger.info(project.name)
    for investigation in project.subcollections:
        if not investigation.name.startswith("I_"):
            continue
        for obj in investigation.data_objects:
            # Only .ttl objects are processed; names starting with "." are skipped.
            if not obj.name.endswith(".ttl") or obj.name.startswith("."):
                continue
            local_path = "./data/" + investigation.path
            destination = local_path + "/" + obj.name
            os.makedirs(local_path, exist_ok=True)
            # Force flag: presumably lets the download overwrite an existing
            # local copy -- see the python-irodsclient documentation.
            options = {kw.FORCE_FLAG_KW: ""}
            session.data_objects.get(obj.path, destination, **options)
            endpoint_name = project.name + "_" + investigation.name
            create_sparql_config(endpoint_name)
            # Execute config file to create repository. An argument list is
            # used instead of a shell string so that collection names are
            # never interpreted by a shell.
            completed = subprocess.run([
                "curl", "-X", "POST", "graphdb:7200/rest/repositories",
                "-H", "Accept: application/json",
                "-H", "Content-Type: multipart/form-data",
                "-F", "config=@" + endpoint_name + "_config.ttl",
            ])
            if completed.returncode != 0:
                logger.warning("Repository creation for %s exited with code %s",
                               endpoint_name, completed.returncode)
            # Load files; curl() returns True on success and the failed
            # response object otherwise.
            outcome = curl(endpoint_name, destination)
            if outcome is not True:
                logger.warning("Loading %s into %s failed with HTTP status %s",
                               destination, endpoint_name, outcome.status_code)
```
%% Output
[2021-03-29 15:25:11,666][INFO ] P_Deltares
[2021-03-29 15:25:12,294][INFO ] Config file created
[2021-03-29 15:25:13,300][INFO ] P_E2BN
[2021-03-29 15:25:15,387][INFO ] Config file created
[2021-03-29 15:25:49,299][INFO ] P_EXPLODIV
[2021-03-29 15:25:49,341][INFO ] P_FIRM-Project
[2021-03-29 15:25:49,372][INFO ] P_MIB-Amplicon
[2021-03-29 15:25:50,038][INFO ] Config file created
[2021-03-29 15:26:02,140][INFO ] Config file created
[2021-03-29 15:26:56,839][INFO ] Config file created
[2021-03-29 15:26:58,794][INFO ] P_SIAM
[2021-03-29 15:26:58,832][INFO ] P_UNLOCK
[2021-03-29 15:26:59,083][INFO ] Config file created
[2021-03-29 15:27:02,962][INFO ] Config file created
...@@ -52,7 +52,7 @@ WHERE { ...@@ -52,7 +52,7 @@ WHERE {
?ISAsample unlock:assay ?assay . ?ISAsample unlock:assay ?assay .
?assay schema:identifier ?id . ?assay schema:identifier ?id .
}" }"
results = SPARQL(url = "http://172.18.0.2:7200/repositories/P_E2BN_I_PRED", format = "csv", query = query)$results results = SPARQL(url = "http://graphdb:7200/repositories/P_E2BN_I_PRED", format = "csv", query = query)$results
results$predicate = gsub(x = results$predicate, pattern = ".*/", replacement = "") results$predicate = gsub(x = results$predicate, pattern = ".*/", replacement = "")
metadata <- dcast(results, id ~ predicate, value.var = "object") metadata <- dcast(results, id ~ predicate, value.var = "object")
rownames(metadata) = metadata$id rownames(metadata) = metadata$id
......
...@@ -21,7 +21,7 @@ The preview shows you a rendered HTML copy of the contents of the editor. Conseq ...@@ -21,7 +21,7 @@ The preview shows you a rendered HTML copy of the contents of the editor. Conseq
```{r} ```{r}
# Load Kraken json object generated from jupyter hub # Load Kraken json object generated from jupyter hub
physeq = import_biom("data/P_Deltares-I_vitens_kraken.json") physeq = import_biom("data/unlock/projects/P_Deltares/I_vitens/kraken.json")
# Fix rank naming issue from kraken data # Fix rank naming issue from kraken data
colnames(tax_table(physeq)) = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species") colnames(tax_table(physeq)) = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
# Correct input names # Correct input names
......
%% Cell type:markdown id:typical-perfume tags:
# Generate kraken biom files
%% Cell type:code id:bridal-niger tags:
``` python
# Install kraken-biom package
!pip install kraken-biom
```
%% Cell type:code id:pharmaceutical-pattern tags:
``` python
# Dependencies
import os
import logging
from irods.models import DataObject, DataObjectMeta, Collection, CollectionMeta
from irods.column import Criterion
import getpass
from shutil import copyfileobj
import pathlib
```
%% Cell type:code id:right-allocation tags:
``` python
# Generic functions
LOGGING_FORMAT = '[%(asctime)-15s][%(levelname)-7s] %(message)s'


def get_logger(name):
    """Return the logger registered under ``name`` with its level set to DEBUG.

    ``logging.basicConfig`` is called with ``LOGGING_FORMAT`` on every
    invocation before the logger is looked up.
    """
    logging.basicConfig(format=LOGGING_FORMAT)
    named_logger = logging.getLogger(name)
    named_logger.setLevel(logging.DEBUG)
    return named_logger


# Logger shared by the notebook cells
logger = get_logger("jupyter")
```
%% Cell type:code id:minor-square tags:
``` python
## iRODS authentication
```
%% Cell type:code id:recognized-camcorder tags:
``` python
# Ask for the account name on standard input; `username` is later passed to
# iRODSSession(user=...).
print("Username")
username=input()
```
%% Cell type:code id:designed-surveillance tags:
``` python
# Read the password through getpass instead of input(); `password` is later
# passed to iRODSSession(password=...).
print("Password")
password = getpass.getpass()
```
%% Cell type:code id:flexible-sunglasses tags:
``` python
from irods.session import iRODSSession
import ssl
# Connection parameters of the iRODS server
host = "unlock-icat.irods.surfsara.nl"
port = "1247"
zone = "unlock"

# Default SSL context for authenticating the server
context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)

# Client/server negotiation and encryption options forwarded to the session
ssl_settings = {
    'irods_client_server_negotiation': 'request_server_negotiation',
    'irods_client_server_policy': 'CS_NEG_REQUIRE',
    'irods_encryption_algorithm': 'AES-256-CBC',
    'irods_encryption_key_size': 32,
    'irods_encryption_num_hash_rounds': 16,
    'irods_encryption_salt_size': 8,
    'ssl_context': context,
}

# Open the session with the credentials entered earlier in the notebook
session = iRODSSession(
    host=host,
    port=port,
    user=username,
    password=password,
    zone=zone,
    **ssl_settings,
)

# Log every investigation (names starting with "I_") found below the projects collection
projects = session.collections.get("/unlock/projects").subcollections
for project in projects:
    for investigation in project.subcollections:
        if not investigation.name.startswith("I_"):
            continue
        logger.info(f"Access to project: {project.name} and investigation {investigation.name}")
```
%% Cell type:code id:surrounded-assurance tags:
``` python
import subprocess

# For every investigation (I_*) of every project: download all
# *_kraken2_report.txt files below it into ./data and combine them into a
# single biom JSON file with kraken-biom.
for project in projects:
    for investigation in project.subcollections:
        if not investigation.name.startswith("I_"):
            continue
        logger.info("Access to project: " + project.name + " and investigation " + investigation.name)
        results = session.query(Collection, DataObject).filter(
            Criterion('like', DataObject.path, "%_kraken2_report.txt")).filter(
            Criterion('like', Collection.name, investigation.path + "/%"))
        file_paths = set()
        # Obtaining all files
        logger.info("Collecting all paths")
        for r in results:
            collection_name = r.get(Collection.name)
            file_path = collection_name + "/" + r.get(DataObject.name)
            output_file = './data' + file_path
            file_paths.add(output_file)
            # Already downloaded: skip before asking the server for the object.
            if os.path.isfile(output_file):
                continue
            pathlib.Path('./data' + collection_name).mkdir(parents=True, exist_ok=True)
            obj = session.data_objects.get(file_path)
            # The object is only read, so it is opened read-only.
            with open(output_file, 'wb') as output, obj.open('r') as source:
                copyfileobj(source, output)
        # Skip investigations without kraken files. The number of collected
        # paths is used because an enumerate() index is 0 both for "no
        # result" and for "exactly one result".
        if not file_paths:
            continue
        logger.info("Obtained " + str(len(file_paths)) + " kraken files")
        # Creating biom file from kraken reports. Paths are sorted so the
        # command is reproducible, and passed as an argument list so they are
        # never interpreted by a shell.
        biom_file = "./data/" + investigation.path + "/kraken.json"
        command = ["kraken-biom", *sorted(file_paths), "--fmt", "json", "-o", biom_file]
        logger.info("Command: " + " ".join(command))
        completed = subprocess.run(command)
        if completed.returncode != 0:
            logger.warning("kraken-biom exited with code %s", completed.returncode)
```
%% Cell type:code id:comic-parker tags:
``` python
```
%% Cell type:code id:corporate-knowing tags:
``` python
```
%% Cell type:code id:matched-parallel tags:
``` python
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment