kraken prepare and local endpoint loader added. biom rmd updates

7490d6f6 · Koehorst, Jasper · a0ad0ae9 · 7490d6f6 · 7490d6f6 · 7490d6f6
Commit 7490d6f6 authored 4 years ago by Koehorst, Jasper
--- a/notebooks/SPARQLEndpointLoader.ipynb
+++ b/notebooks/SPARQLEndpointLoader.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# SPARQL Endpoint loader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Username\n"
+     ]
+    },
+    {
+     "name": "stdin",
+     "output_type": "stream",
+     "text": [
+      " koehorst\n"
+     ]
+    }
+   ],
+   "source": [
+    "import getpass\n",
+    "print(\"Username\")\n",
+    "username=input()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Password\n"
+     ]
+    },
+    {
+     "name": "stdin",
+     "output_type": "stream",
+     "text": [
+      " ··············\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Password\")\n",
+    "password = getpass.getpass()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### SPARQL database functions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 88,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_sparql_config(repo):\n",
+    "    turtle = \"\"\"\n",
+    "    # RDF4J configuration template for a GraphDB Free repository\n",
+    "\n",
+    "    @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>.\n",
+    "    @prefix rep: <http://www.openrdf.org/config/repository#>.\n",
+    "    @prefix sr: <http://www.openrdf.org/config/repository/sail#>.\n",
+    "    @prefix sail: <http://www.openrdf.org/config/sail#>.\n",
+    "    @prefix owlim: <http://www.ontotext.com/trree/owlim#>.\n",
+    "\n",
+    "    [] a rep:Repository ;\n",
+    "        rep:repositoryID \\\"\"\"\"+repo+\"\"\"\\\" ;\n",
+    "        rdfs:label \"GraphDB Free repository\" ;\n",
+    "        rep:repositoryImpl [\n",
+    "            rep:repositoryType \"graphdb:FreeSailRepository\" ;\n",
+    "            sr:sailImpl [\n",
+    "                sail:sailType \"graphdb:FreeSail\" ;\n",
+    "\n",
+    "                owlim:base-URL \"http://gbol.life/0.1/\" ;\n",
+    "                owlim:defaultNS \"\" ;\n",
+    "                owlim:entity-index-size \"10000000\" ;\n",
+    "                owlim:entity-id-size  \"32\" ;\n",
+    "                owlim:imports \"\" ;\n",
+    "                owlim:repository-type \"file-repository\" ;\n",
+    "                owlim:ruleset \"rdfsplus-optimized\" ;\n",
+    "                owlim:storage-folder \"storage\" ;\n",
+    "\n",
+    "                owlim:enable-context-index \"false\" ;\n",
+    "\n",
+    "                owlim:enablePredicateList \"true\" ;\n",
+    "\n",
+    "                owlim:in-memory-literal-properties \"true\" ;\n",
+    "                owlim:enable-literal-index \"true\" ;\n",
+    "\n",
+    "                owlim:check-for-inconsistencies \"false\" ;\n",
+    "                owlim:disable-sameAs  \"false\" ;\n",
+    "                owlim:query-timeout  \"0\" ;\n",
+    "                owlim:query-limit-results  \"0\" ;\n",
+    "                owlim:throw-QueryEvaluationException-on-timeout \"false\" ;\n",
+    "                owlim:read-only \"false\" ;\n",
+    "                owlim:nonInterpretablePredicates \"http://www.w3.org/2000/01/rdf-schema#label;http://www.w3.org/1999/02/22-rdf-syntax-ns#type;http://www.ontotext.com/owlim/ces#gazetteerConfig;http://www.ontotext.com/owlim/ces#metadataConfig\" ;\n",
+    "            ]\n",
+    "        ].\n",
+    "        \"\"\"\n",
+    "\n",
+    "    open(repo + \"_config.ttl\", \"w\").write(turtle)\n",
+    "    logger.info(\"Config file created\")\n",
+    "\n",
+    "\n",
+    "def curl(repo, filename):\n",
+    "    url = 'http://graphdb:7200/repositories/' + repo + \"/statements\"\n",
+    "    payload = open(filename,'rb').read()\n",
+    "    headers = {'content-type': 'text/turtle'}\n",
+    "    index = 0\n",
+    "    while True:\n",
+    "        index = index + 1\n",
+    "        r = requests.post(url, data=payload, headers=headers, auth=HTTPBasicAuth(username, password), timeout=1000)\n",
+    "        if int(r.status_code) == 200:\n",
+    "            return True\n",
+    "        if int(r.status_code) == 204:\n",
+    "            # TODO\n",
+    "            return True\n",
+    "        if index > 2:\n",
+    "            return r\n",
+    "        if int(r.status_code) == 502:\n",
+    "            return r\n",
+    "#         if int(r.status_code) == 404:\n",
+    "#             return r\n",
+    "        print(\"Request post failed... exit code \",r.status_code,\" retrying \", index)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### iRODS dependencies \n",
+    "These dependencies are needed if not supplied from the docker file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 89,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pathlib\n",
+    "# RDF Insertion\n",
+    "from SPARQLWrapper import SPARQLWrapper, JSON\n",
+    "# iRODS\n",
+    "from irods.session import iRODSSession\n",
+    "from irods.column import Criterion\n",
+    "from irods.models import DataObject, DataObjectMeta, Collection, CollectionMeta\n",
+    "from irods.session import iRODSSession\n",
+    "# Python\n",
+    "import os\n",
+    "import shutil\n",
+    "# Logging\n",
+    "import logging\n",
+    "# Authentication curl\n",
+    "import requests\n",
+    "from requests.auth import HTTPBasicAuth\n",
+    "\n",
+    "LOGGING_FORMAT = '[%(asctime)-15s][%(levelname)-7s] %(message)s'\n",
+    "\n",
+    "def get_logger(name):\n",
+    "    logging.basicConfig(format=LOGGING_FORMAT)\n",
+    "    logger = logging.getLogger(name)\n",
+    "    logger.setLevel('DEBUG')\n",
+    "    return logger\n",
+    "\n",
+    "logger = get_logger(\"jupyter\")\n",
+    "# Configurations\n",
+    "%config IPCompleter.greedy=True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### iRODS connection"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2021-03-29 15:25:11,346][INFO   ] <iRODSCollection 10005 b'unlock'>\n"
+     ]
+    }
+   ],
+   "source": [
+    "from irods.session import iRODSSession\n",
+    "import ssl\n",
+    "\n",
+    "# iRODS authentication information\n",
+    "host = \"unlock-icat.irods.surfsara.nl\"\n",
+    "port = \"1247\"\n",
+    "zone = \"unlock\"\n",
+    "\n",
+    "context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, cafile=None, capath=None, cadata=None)\n",
+    "\n",
+    "ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',\n",
+    "                'irods_client_server_policy': 'CS_NEG_REQUIRE',\n",
+    "                'irods_encryption_algorithm': 'AES-256-CBC',\n",
+    "                'irods_encryption_key_size': 32,\n",
+    "                'irods_encryption_num_hash_rounds': 16,\n",
+    "                'irods_encryption_salt_size': 8,\n",
+    "                'ssl_context': context}\n",
+    "\n",
+    "\n",
+    "session = iRODSSession(host = host,\n",
+    "                  port = port,\n",
+    "                  user = username,\n",
+    "                  password = password,\n",
+    "                  zone = zone,\n",
+    "                  **ssl_settings)\n",
+    "\n",
+    "coll = session.collections.get('/unlock')\n",
+    "logger.info(coll)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### List projects with access\n",
+    "This section lists all projects the current user has access to. This can be used to set the `IDENTIFIER` to the right project."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 91,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2021-03-29 15:25:11,386][INFO   ] P_Deltares\n",
+      "[2021-03-29 15:25:11,407][INFO   ] P_Deltares/I_vitens\n",
+      "[2021-03-29 15:25:11,408][INFO   ] P_E2BN\n",
+      "[2021-03-29 15:25:11,424][INFO   ] P_E2BN/I_PRED\n",
+      "[2021-03-29 15:25:11,425][INFO   ] P_EXPLODIV\n",
+      "[2021-03-29 15:25:11,447][INFO   ] P_EXPLODIV/I_BIOGAS\n",
+      "[2021-03-29 15:25:11,448][INFO   ] P_EXPLODIV/I_UNCOUPLED\n",
+      "[2021-03-29 15:25:11,448][INFO   ] P_FIRM-Project\n",
+      "[2021-03-29 15:25:11,467][INFO   ] P_FIRM-Project/I_FIRM-Broilers\n",
+      "[2021-03-29 15:25:11,468][INFO   ] P_MIB-Amplicon\n",
+      "[2021-03-29 15:25:11,484][INFO   ] P_MIB-Amplicon/I_Mocks\n",
+      "[2021-03-29 15:25:11,485][INFO   ] P_MIB-Amplicon/I_Poultry_16S_MIB\n",
+      "[2021-03-29 15:25:11,485][INFO   ] P_SIAM\n",
+      "[2021-03-29 15:25:11,505][INFO   ] P_SIAM/I_DbMM\n",
+      "[2021-03-29 15:25:11,506][INFO   ] P_UNLOCK\n",
+      "[2021-03-29 15:25:11,528][INFO   ] P_UNLOCK/I_CAMI\n",
+      "[2021-03-29 15:25:11,529][INFO   ] P_UNLOCK/I_INVESTIGATION_TEST\n",
+      "[2021-03-29 15:25:11,530][INFO   ] P_UNLOCK/I_SRA_Amplicon\n"
+     ]
+    }
+   ],
+   "source": [
+    "projects = session.collections.get(\"/unlock/projects\").subcollections\n",
+    "for project in projects:\n",
+    "    logger.info(project.name)\n",
+    "    for investigation in project.subcollections:\n",
+    "        if investigation.name.startswith(\"I_\"):\n",
+    "            logger.info(project.name + \"/\" + investigation.name)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Downloading metadata rdf files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 92,
+   "metadata": {
+    "scrolled": true,
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2021-03-29 15:25:11,666][INFO   ] P_Deltares\n",
+      "[2021-03-29 15:25:12,294][INFO   ] Config file created\n",
+      "[2021-03-29 15:25:13,300][INFO   ] P_E2BN\n",
+      "[2021-03-29 15:25:15,387][INFO   ] Config file created\n",
+      "[2021-03-29 15:25:49,299][INFO   ] P_EXPLODIV\n",
+      "[2021-03-29 15:25:49,341][INFO   ] P_FIRM-Project\n",
+      "[2021-03-29 15:25:49,372][INFO   ] P_MIB-Amplicon\n",
+      "[2021-03-29 15:25:50,038][INFO   ] Config file created\n",
+      "[2021-03-29 15:26:02,140][INFO   ] Config file created\n",
+      "[2021-03-29 15:26:56,839][INFO   ] Config file created\n",
+      "[2021-03-29 15:26:58,794][INFO   ] P_SIAM\n",
+      "[2021-03-29 15:26:58,832][INFO   ] P_UNLOCK\n",
+      "[2021-03-29 15:26:59,083][INFO   ] Config file created\n",
+      "[2021-03-29 15:27:02,962][INFO   ] Config file created\n"
+     ]
+    }
+   ],
+   "source": [
+    "projects = session.collections.get(\"/unlock/projects\").subcollections\n",
+    "for project in projects:\n",
+    "    logger.info(project.name)\n",
+    "    for investigation in project.subcollections:\n",
+    "        if investigation.name.startswith(\"I_\"):\n",
+    "            for obj in investigation.data_objects:\n",
+    "                if obj.name.endswith(\".ttl\"):\n",
+    "                    if obj.name.startswith(\".\"): continue\n",
+    "                    local_path = \"./data/\" + investigation.path\n",
+    "                    destination = local_path + \"/\" + obj.name\n",
+    "                    os.makedirs(local_path, exist_ok=True)\n",
+    "                    options = {kw.FORCE_FLAG_KW:\"\"}\n",
+    "                    session.data_objects.get(obj.path, destination, **options)\n",
+    "                    endpoint_name = project.name + \"_\" + investigation.name\n",
+    "                    create_sparql_config(endpoint_name)\n",
+    "                    # Execute config file to create repository\n",
+    "                    command = \"curl -X POST graphdb:7200/rest/repositories -H 'Accept: application/json' -H 'Content-Type: multipart/form-data' -F config=@\"+endpoint_name+\"_config.ttl\"\n",
+    "                    os.system(command)\n",
+    "                    # Load files\n",
+    "                    curl(endpoint_name, destination)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.8"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": false,
+   "sideBar": false,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": false,
+   "toc_window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
+%% Cell type:markdown id: tags:
+# SPARQL Endpoint loader
+%% Cell type:code id: tags:
+``` python
+import getpass
+# Ask for the account name on stdin; the value is kept in the global
+# `username` for the cells further down.
+print("Username")
+username=input()
+```
+%% Output
+    Username
+     koehorst
+%% Cell type:code id: tags:
+``` python
+# getpass reads the password without echoing the typed characters, so it
+# does not end up in the stored cell output.
+print("Password")
+password = getpass.getpass()
+```
+%% Output
+    Password
+     ··············
+%% Cell type:markdown id: tags:
+### SPARQL database functions
+%% Cell type:code id: tags:
+``` python
+def create_sparql_config(repo):
+    turtle = """
+    # RDF4J configuration template for a GraphDB Free repository
+    @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>.
+    @prefix rep: <http://www.openrdf.org/config/repository#>.
+    @prefix sr: <http://www.openrdf.org/config/repository/sail#>.
+    @prefix sail: <http://www.openrdf.org/config/sail#>.
+    @prefix owlim: <http://www.ontotext.com/trree/owlim#>.
+    [] a rep:Repository ;
+        rep:repositoryID \""""+repo+"""\" ;
+        rdfs:label "GraphDB Free repository" ;
+        rep:repositoryImpl [
+            rep:repositoryType "graphdb:FreeSailRepository" ;
+            sr:sailImpl [
+                sail:sailType "graphdb:FreeSail" ;
+                owlim:base-URL "http://gbol.life/0.1/" ;
+                owlim:defaultNS "" ;
+                owlim:entity-index-size "10000000" ;
+                owlim:entity-id-size  "32" ;
+                owlim:imports "" ;
+                owlim:repository-type "file-repository" ;
+                owlim:ruleset "rdfsplus-optimized" ;
+                owlim:storage-folder "storage" ;
+                owlim:enable-context-index "false" ;
+                owlim:enablePredicateList "true" ;
+                owlim:in-memory-literal-properties "true" ;
+                owlim:enable-literal-index "true" ;
+                owlim:check-for-inconsistencies "false" ;
+                owlim:disable-sameAs  "false" ;
+                owlim:query-timeout  "0" ;
+                owlim:query-limit-results  "0" ;
+                owlim:throw-QueryEvaluationException-on-timeout "false" ;
+                owlim:read-only "false" ;
+                owlim:nonInterpretablePredicates "http://www.w3.org/2000/01/rdf-schema#label;http://www.w3.org/1999/02/22-rdf-syntax-ns#type;http://www.ontotext.com/owlim/ces#gazetteerConfig;http://www.ontotext.com/owlim/ces#metadataConfig" ;
+            ]
+        ].
+        """
+    open(repo + "_config.ttl", "w").write(turtle)
+    logger.info("Config file created")
+def curl(repo, filename):
+    url = 'http://graphdb:7200/repositories/' + repo + "/statements"
+    payload = open(filename,'rb').read()
+    headers = {'content-type': 'text/turtle'}
+    index = 0
+    while True:
+        index = index + 1
+        r = requests.post(url, data=payload, headers=headers, auth=HTTPBasicAuth(username, password), timeout=1000)
+        if int(r.status_code) == 200:
+            return True
+        if int(r.status_code) == 204:
+            # TODO
+            return True
+        if index > 2:
+            return r
+        if int(r.status_code) == 502:
+            return r
+#         if int(r.status_code) == 404:
+#             return r
+        print("Request post failed... exit code ",r.status_code," retrying ", index)
+```
+%% Cell type:markdown id: tags:
+### iRODS dependencies
+The packages imported below must be installed manually if the Docker image does not already provide them.
+%% Cell type:code id: tags:
+``` python
+import pathlib
+# RDF Insertion
+from SPARQLWrapper import SPARQLWrapper, JSON
+# iRODS
+from irods.session import iRODSSession
+from irods.column import Criterion
+from irods.models import DataObject, DataObjectMeta, Collection, CollectionMeta
+from irods.session import iRODSSession
+# Python
+import os
+import shutil
+# Logging
+import logging
+# Authentication curl
+import requests
+from requests.auth import HTTPBasicAuth
+LOGGING_FORMAT = '[%(asctime)-15s][%(levelname)-7s] %(message)s'
+def get_logger(name):
+    logging.basicConfig(format=LOGGING_FORMAT)
+    logger = logging.getLogger(name)
+    logger.setLevel('DEBUG')
+    return logger
+logger = get_logger("jupyter")
+# Configurations
+%config IPCompleter.greedy=True
+```
+%% Cell type:markdown id: tags:
+### iRODS connection
+%% Cell type:code id: tags:
+``` python
+from irods.session import iRODSSession
+import ssl
+# iRODS authentication information
+host = "unlock-icat.irods.surfsara.nl"
+port = "1247"
+zone = "unlock"
+context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, cafile=None, capath=None, cadata=None)
+ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',
+                'irods_client_server_policy': 'CS_NEG_REQUIRE',
+                'irods_encryption_algorithm': 'AES-256-CBC',
+                'irods_encryption_key_size': 32,
+                'irods_encryption_num_hash_rounds': 16,
+                'irods_encryption_salt_size': 8,
+                'ssl_context': context}
+session = iRODSSession(host = host,
+                  port = port,
+                  user = username,
+                  password = password,
+                  zone = zone,
+                  **ssl_settings)
+coll = session.collections.get('/unlock')
+logger.info(coll)
+```
+%% Output
+    [2021-03-29 15:25:11,346][INFO   ] <iRODSCollection 10005 b'unlock'>
+%% Cell type:markdown id: tags:
+### List projects with access
+This section lists all projects the current user has access to. This can be used to set the `IDENTIFIER` to the right project.
+%% Cell type:code id: tags:
+``` python
+projects = session.collections.get("/unlock/projects").subcollections
+for project in projects:
+    logger.info(project.name)
+    for investigation in project.subcollections:
+        if investigation.name.startswith("I_"):
+            logger.info(project.name + "/" + investigation.name)
+```
+%% Output
+    [2021-03-29 15:25:11,386][INFO   ] P_Deltares
+    [2021-03-29 15:25:11,407][INFO   ] P_Deltares/I_vitens
+    [2021-03-29 15:25:11,408][INFO   ] P_E2BN
+    [2021-03-29 15:25:11,424][INFO   ] P_E2BN/I_PRED
+    [2021-03-29 15:25:11,425][INFO   ] P_EXPLODIV
+    [2021-03-29 15:25:11,447][INFO   ] P_EXPLODIV/I_BIOGAS
+    [2021-03-29 15:25:11,448][INFO   ] P_EXPLODIV/I_UNCOUPLED
+    [2021-03-29 15:25:11,448][INFO   ] P_FIRM-Project
+    [2021-03-29 15:25:11,467][INFO   ] P_FIRM-Project/I_FIRM-Broilers
+    [2021-03-29 15:25:11,468][INFO   ] P_MIB-Amplicon
+    [2021-03-29 15:25:11,484][INFO   ] P_MIB-Amplicon/I_Mocks
+    [2021-03-29 15:25:11,485][INFO   ] P_MIB-Amplicon/I_Poultry_16S_MIB
+    [2021-03-29 15:25:11,485][INFO   ] P_SIAM
+    [2021-03-29 15:25:11,505][INFO   ] P_SIAM/I_DbMM
+    [2021-03-29 15:25:11,506][INFO   ] P_UNLOCK
+    [2021-03-29 15:25:11,528][INFO   ] P_UNLOCK/I_CAMI
+    [2021-03-29 15:25:11,529][INFO   ] P_UNLOCK/I_INVESTIGATION_TEST
+    [2021-03-29 15:25:11,530][INFO   ] P_UNLOCK/I_SRA_Amplicon
+%% Cell type:markdown id: tags:
+### Downloading metadata RDF files and loading them into GraphDB
+%% Cell type:code id: tags:
+``` python
+projects = session.collections.get("/unlock/projects").subcollections
+for project in projects:
+    logger.info(project.name)
+    for investigation in project.subcollections:
+        if investigation.name.startswith("I_"):
+            for obj in investigation.data_objects:
+                if obj.name.endswith(".ttl"):
+                    if obj.name.startswith("."): continue
+                    local_path = "./data/" + investigation.path
+                    destination = local_path + "/" + obj.name
+                    os.makedirs(local_path, exist_ok=True)
+                    options = {kw.FORCE_FLAG_KW:""}
+                    session.data_objects.get(obj.path, destination, **options)
+                    endpoint_name = project.name + "_" + investigation.name
+                    create_sparql_config(endpoint_name)
+                    # Execute config file to create repository
+                    command = "curl -X POST graphdb:7200/rest/repositories -H 'Accept: application/json' -H 'Content-Type: multipart/form-data' -F config=@"+endpoint_name+"_config.ttl"
+                    os.system(command)
+                    # Load files
+                    curl(endpoint_name, destination)
+```
+%% Output
+    [2021-03-29 15:25:11,666][INFO   ] P_Deltares
+    [2021-03-29 15:25:12,294][INFO   ] Config file created
+    [2021-03-29 15:25:13,300][INFO   ] P_E2BN
+    [2021-03-29 15:25:15,387][INFO   ] Config file created
+    [2021-03-29 15:25:49,299][INFO   ] P_EXPLODIV
+    [2021-03-29 15:25:49,341][INFO   ] P_FIRM-Project
+    [2021-03-29 15:25:49,372][INFO   ] P_MIB-Amplicon
+    [2021-03-29 15:25:50,038][INFO   ] Config file created
+    [2021-03-29 15:26:02,140][INFO   ] Config file created
+    [2021-03-29 15:26:56,839][INFO   ] Config file created
+    [2021-03-29 15:26:58,794][INFO   ] P_SIAM
+    [2021-03-29 15:26:58,832][INFO   ] P_UNLOCK
+    [2021-03-29 15:26:59,083][INFO   ] Config file created
+    [2021-03-29 15:27:02,962][INFO   ] Config file created
--- a/notebooks/amplicon-biom.Rmd
+++ b/notebooks/amplicon-biom.Rmd
@@ -52,7 +52,7 @@ WHERE {
    ?ISAsample unlock:assay ?assay .
    ?assay schema:identifier ?id .
 }"
-results = SPARQL(url = "http://172.18.0.2:7200/repositories/P_E2BN_I_PRED", format = "csv", query = query)$results
+results = SPARQL(url = "http://graphdb:7200/repositories/P_E2BN_I_PRED", format = "csv", query = query)$results
 results$predicate = gsub(x = results$predicate, pattern = ".*/", replacement = "")
 metadata <- dcast(results, id ~ predicate, value.var = "object")
 rownames(metadata) = metadata$id

--- a/notebooks/kraken-biom.Rmd
+++ b/notebooks/kraken-biom.Rmd
@@ -21,7 +21,7 @@ The preview shows you a rendered HTML copy of the contents of the editor. Conseq
 ```{r}
 # Load Kraken json object generated from jupyter hub
-physeq = import_biom("data/P_Deltares-I_vitens_kraken.json")
+physeq = import_biom("data/unlock/projects/P_Deltares/I_vitens/kraken.json")
 # Fix rank naming issue from kraken data
 colnames(tax_table(physeq)) = c("Kingdom", "Phylum",  "Class",   "Order",   "Family",  "Genus",   "Species")
 # Correct input names

--- a/notebooks/kraken-prepare.ipynb
+++ b/notebooks/kraken-prepare.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "typical-perfume",
+   "metadata": {},
+   "source": [
+    "# Generate kraken biom files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bridal-niger",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Install kraken-biom package\n",
+    "!pip install kraken-biom"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "pharmaceutical-pattern",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Dependencies\n",
+    "import os\n",
+    "import logging\n",
+    "from irods.models import DataObject, DataObjectMeta, Collection, CollectionMeta\n",
+    "from irods.column import Criterion\n",
+    "import getpass\n",
+    "from shutil import copyfileobj\n",
+    "import pathlib\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "right-allocation",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Generic functions\n",
+    "LOGGING_FORMAT = '[%(asctime)-15s][%(levelname)-7s] %(message)s'\n",
+    "\n",
+    "def get_logger(name):\n",
+    "    logging.basicConfig(format=LOGGING_FORMAT)\n",
+    "    logger = logging.getLogger(name)\n",
+    "    logger.setLevel('DEBUG')\n",
+    "    return logger\n",
+    "\n",
+    "logger = get_logger(\"jupyter\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "minor-square",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## iRODS authentication"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "recognized-camcorder",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Username\")\n",
+    "username=input()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "designed-surveillance",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Password\")\n",
+    "password = getpass.getpass()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "flexible-sunglasses",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from irods.session import iRODSSession\n",
+    "import ssl\n",
+    "\n",
+    "# iRODS authentication information\n",
+    "host = \"unlock-icat.irods.surfsara.nl\"\n",
+    "port = \"1247\"\n",
+    "zone = \"unlock\"\n",
+    "\n",
+    "context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, cafile=None, capath=None, cadata=None)\n",
+    "\n",
+    "ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',\n",
+    "                'irods_client_server_policy': 'CS_NEG_REQUIRE',\n",
+    "                'irods_encryption_algorithm': 'AES-256-CBC',\n",
+    "                'irods_encryption_key_size': 32,\n",
+    "                'irods_encryption_num_hash_rounds': 16,\n",
+    "                'irods_encryption_salt_size': 8,\n",
+    "                'ssl_context': context}\n",
+    "\n",
+    "\n",
+    "session = iRODSSession(host = host,\n",
+    "                  port = port,\n",
+    "                  user = username,\n",
+    "                  password = password,\n",
+    "                  zone = zone,\n",
+    "                  **ssl_settings)\n",
+    "\n",
+    "projects = session.collections.get(\"/unlock/projects\").subcollections\n",
+    "for project in projects:\n",
+    "    for investigation in project.subcollections:\n",
+    "        if investigation.name.startswith(\"I_\"):\n",
+    "            logger.info(\"Access to project: \" + project.name + \" and investigation \" + investigation.name)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "surrounded-assurance",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Obtain all kraken files for a given project and investigation\n",
+    "# project = \"P_Deltares\"\n",
+    "# investigation = \"I_vitens\"\n",
+    "for project in projects:\n",
+    "    for investigation in project.subcollections:\n",
+    "        if investigation.name.startswith(\"I_\"):\n",
+    "            logger.info(\"Access to project: \" + project.name + \" and investigation \" + investigation.name)\n",
+    "            \n",
+    "            results = session.query(Collection, DataObject).filter( \\\n",
+    "                Criterion('like', DataObject.path,\"%_kraken2_report.txt\")).filter( \\\n",
+    "                Criterion('like', Collection.name, investigation.path + \"/%\"))\n",
+    "\n",
+    "            file_paths = set()\n",
+    "\n",
+    "            # Obtaining all files\n",
+    "            logger.info(\"Collecting all paths\")\n",
+    "            index = 0\n",
+    "            for index, r in enumerate(results):\n",
+    "                file_path = r.get(Collection.name) + \"/\" + r.get(DataObject.name)\n",
+    "                obj = session.data_objects.get(file_path)\n",
+    "                output_dir = './data' + r.get(Collection.name)\n",
+    "                pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)\n",
+    "                output_file = './data' + file_path\n",
+    "                file_paths.add(output_file)\n",
+    "                if os.path.isfile(output_file): continue\n",
+    "                with open(output_file, 'wb') as output, obj.open('r+') as input:\n",
+    "                    copyfileobj(input, output)\n",
+    "\n",
+    "            # Skipp projects with no kraken files\n",
+    "            if index == 0: continue\n",
+    "                \n",
+    "            logger.info(\"Obtained \" + str(index) + \" kraken files\")\n",
+    "            \n",
+    "            # Creating biom file from kraken reports\n",
+    "            command = \"kraken-biom \" + ' '.join(file_paths) + \" --fmt json -o ./data/\" + investigation.path + \"/kraken.json\"\n",
+    "            logger.info(\"Command: \" + command)\n",
+    "            os.system(command)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "comic-parker",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "corporate-knowing",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "matched-parallel",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
+%% Cell type:markdown id:typical-perfume tags:
+# Generate kraken biom files
+%% Cell type:code id:bridal-niger tags:
+``` python
+# Install kraken-biom package
+!pip install kraken-biom
+```
+%% Cell type:code id:pharmaceutical-pattern tags:
+``` python
+# Dependencies
+import os
+import logging
+from irods.models import DataObject, DataObjectMeta, Collection, CollectionMeta
+from irods.column import Criterion
+import getpass
+from shutil import copyfileobj
+import pathlib
+```
+%% Cell type:code id:right-allocation tags:
+``` python
+# Generic functions
+LOGGING_FORMAT = '[%(asctime)-15s][%(levelname)-7s] %(message)s'
+def get_logger(name):
+    logging.basicConfig(format=LOGGING_FORMAT)
+    logger = logging.getLogger(name)
+    logger.setLevel('DEBUG')
+    return logger
+logger = get_logger("jupyter")
+```
+%% Cell type:code id:minor-square tags:
+``` python
+## iRODS authentication
+```
+%% Cell type:code id:recognized-camcorder tags:
+``` python
+# Ask for the iRODS account name on stdin; kept in the global `username`.
+print("Username")
+username=input()
+```
+%% Cell type:code id:designed-surveillance tags:
+``` python
+# getpass reads the password without echoing the typed characters, so it
+# does not end up in the stored cell output.
+print("Password")
+password = getpass.getpass()
+```
+%% Cell type:code id:flexible-sunglasses tags:
+``` python
+from irods.session import iRODSSession
+import ssl
+# iRODS authentication information
+host = "unlock-icat.irods.surfsara.nl"
+port = "1247"
+zone = "unlock"
+context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, cafile=None, capath=None, cadata=None)
+ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',
+                'irods_client_server_policy': 'CS_NEG_REQUIRE',
+                'irods_encryption_algorithm': 'AES-256-CBC',
+                'irods_encryption_key_size': 32,
+                'irods_encryption_num_hash_rounds': 16,
+                'irods_encryption_salt_size': 8,
+                'ssl_context': context}
+session = iRODSSession(host = host,
+                  port = port,
+                  user = username,
+                  password = password,
+                  zone = zone,
+                  **ssl_settings)
+projects = session.collections.get("/unlock/projects").subcollections
+for project in projects:
+    for investigation in project.subcollections:
+        if investigation.name.startswith("I_"):
+            logger.info("Access to project: " + project.name + " and investigation " + investigation.name)
+```
+%% Cell type:code id:surrounded-assurance tags:
+``` python
+# Obtain all kraken files for a given project and investigation
+# project = "P_Deltares"
+# investigation = "I_vitens"
+for project in projects:
+    for investigation in project.subcollections:
+        if investigation.name.startswith("I_"):
+            logger.info("Access to project: " + project.name + " and investigation " + investigation.name)
+            results = session.query(Collection, DataObject).filter( \
+                Criterion('like', DataObject.path,"%_kraken2_report.txt")).filter( \
+                Criterion('like', Collection.name, investigation.path + "/%"))
+            file_paths = set()
+            # Obtaining all files
+            logger.info("Collecting all paths")
+            index = 0
+            for index, r in enumerate(results):
+                file_path = r.get(Collection.name) + "/" + r.get(DataObject.name)
+                obj = session.data_objects.get(file_path)
+                output_dir = './data' + r.get(Collection.name)
+                pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
+                output_file = './data' + file_path
+                file_paths.add(output_file)
+                if os.path.isfile(output_file): continue
+                with open(output_file, 'wb') as output, obj.open('r+') as input:
+                    copyfileobj(input, output)
+            # Skipp projects with no kraken files
+            if index == 0: continue
+            logger.info("Obtained " + str(index) + " kraken files")
+            # Creating biom file from kraken reports
+            command = "kraken-biom " + ' '.join(file_paths) + " --fmt json -o ./data/" + investigation.path + "/kraken.json"
+            logger.info("Command: " + command)
+            os.system(command)
+```
+%% Cell type:code id:comic-parker tags:
+``` python
+```
+%% Cell type:code id:corporate-knowing tags:
+``` python
+```
+%% Cell type:code id:matched-parallel tags:
+``` python
+```