From 7490d6f648e011fd22715004169f4cf48a55bfbf Mon Sep 17 00:00:00 2001 From: Jasper Koehorst <jasper.koehorst@wur.nl> Date: Tue, 30 Mar 2021 18:07:05 +0200 Subject: [PATCH] kraken prepare and local endpoint loader added. biom rmd updates --- notebooks/SPARQLEndpointLoader.ipynb | 381 +++++++++++++++++++++++++++ notebooks/amplicon-biom.Rmd | 2 +- notebooks/kraken-biom.Rmd | 2 +- notebooks/kraken-prepare.ipynb | 222 ++++++++++++++++ 4 files changed, 605 insertions(+), 2 deletions(-) create mode 100644 notebooks/SPARQLEndpointLoader.ipynb create mode 100644 notebooks/kraken-prepare.ipynb diff --git a/notebooks/SPARQLEndpointLoader.ipynb b/notebooks/SPARQLEndpointLoader.ipynb new file mode 100644 index 0000000..8201e9a --- /dev/null +++ b/notebooks/SPARQLEndpointLoader.ipynb @@ -0,0 +1,381 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SPARQL Endpoint loader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Username\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + " koehorst\n" + ] + } + ], + "source": [ + "import getpass\n", + "print(\"Username\")\n", + "username=input()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Password\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ··············\n" + ] + } + ], + "source": [ + "print(\"Password\")\n", + "password = getpass.getpass()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SPARQL database functions" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [], + "source": [ + "def create_sparql_config(repo):\n", + " turtle = \"\"\"\n", + " # RDF4J configuration template for a GraphDB Free repository\n", + "\n", + " @prefix rdfs: 
<http://www.w3.org/2000/01/rdf-schema#>.\n", + " @prefix rep: <http://www.openrdf.org/config/repository#>.\n", + " @prefix sr: <http://www.openrdf.org/config/repository/sail#>.\n", + " @prefix sail: <http://www.openrdf.org/config/sail#>.\n", + " @prefix owlim: <http://www.ontotext.com/trree/owlim#>.\n", + "\n", + " [] a rep:Repository ;\n", + " rep:repositoryID \\\"\"\"\"+repo+\"\"\"\\\" ;\n", + " rdfs:label \"GraphDB Free repository\" ;\n", + " rep:repositoryImpl [\n", + " rep:repositoryType \"graphdb:FreeSailRepository\" ;\n", + " sr:sailImpl [\n", + " sail:sailType \"graphdb:FreeSail\" ;\n", + "\n", + " owlim:base-URL \"http://gbol.life/0.1/\" ;\n", + " owlim:defaultNS \"\" ;\n", + " owlim:entity-index-size \"10000000\" ;\n", + " owlim:entity-id-size \"32\" ;\n", + " owlim:imports \"\" ;\n", + " owlim:repository-type \"file-repository\" ;\n", + " owlim:ruleset \"rdfsplus-optimized\" ;\n", + " owlim:storage-folder \"storage\" ;\n", + "\n", + " owlim:enable-context-index \"false\" ;\n", + "\n", + " owlim:enablePredicateList \"true\" ;\n", + "\n", + " owlim:in-memory-literal-properties \"true\" ;\n", + " owlim:enable-literal-index \"true\" ;\n", + "\n", + " owlim:check-for-inconsistencies \"false\" ;\n", + " owlim:disable-sameAs \"false\" ;\n", + " owlim:query-timeout \"0\" ;\n", + " owlim:query-limit-results \"0\" ;\n", + " owlim:throw-QueryEvaluationException-on-timeout \"false\" ;\n", + " owlim:read-only \"false\" ;\n", + " owlim:nonInterpretablePredicates \"http://www.w3.org/2000/01/rdf-schema#label;http://www.w3.org/1999/02/22-rdf-syntax-ns#type;http://www.ontotext.com/owlim/ces#gazetteerConfig;http://www.ontotext.com/owlim/ces#metadataConfig\" ;\n", + " ]\n", + " ].\n", + " \"\"\"\n", + "\n", + " open(repo + \"_config.ttl\", \"w\").write(turtle)\n", + " logger.info(\"Config file created\")\n", + "\n", + "\n", + "def curl(repo, filename):\n", + " url = 'http://graphdb:7200/repositories/' + repo + \"/statements\"\n", + " payload = open(filename,'rb').read()\n", 
+ " headers = {'content-type': 'text/turtle'}\n", + " index = 0\n", + " while True:\n", + " index = index + 1\n", + " r = requests.post(url, data=payload, headers=headers, auth=HTTPBasicAuth(username, password), timeout=1000)\n", + " if int(r.status_code) == 200:\n", + " return True\n", + " if int(r.status_code) == 204:\n", + " # TODO\n", + " return True\n", + " if index > 2:\n", + " return r\n", + " if int(r.status_code) == 502:\n", + " return r\n", + "# if int(r.status_code) == 404:\n", + "# return r\n", + " print(\"Request post failed... exit code \",r.status_code,\" retrying \", index)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### iRODS dependencies \n", + "These dependencies are needed if not supplied from the docker file" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "import pathlib\n", + "# RDF Insertion\n", + "from SPARQLWrapper import SPARQLWrapper, JSON\n", + "# iRODS\n", + "from irods.session import iRODSSession\n", + "from irods.column import Criterion\n", + "from irods.models import DataObject, DataObjectMeta, Collection, CollectionMeta\n", + "from irods.session import iRODSSession\n", + "# Python\n", + "import os\n", + "import shutil\n", + "# Logging\n", + "import logging\n", + "# Authentication curl\n", + "import requests\n", + "from requests.auth import HTTPBasicAuth\n", + "\n", + "LOGGING_FORMAT = '[%(asctime)-15s][%(levelname)-7s] %(message)s'\n", + "\n", + "def get_logger(name):\n", + " logging.basicConfig(format=LOGGING_FORMAT)\n", + " logger = logging.getLogger(name)\n", + " logger.setLevel('DEBUG')\n", + " return logger\n", + "\n", + "logger = get_logger(\"jupyter\")\n", + "# Configurations\n", + "%config IPCompleter.greedy=True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### iRODS connection" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "[2021-03-29 15:25:11,346][INFO ] <iRODSCollection 10005 b'unlock'>\n" + ] + } + ], + "source": [ + "from irods.session import iRODSSession\n", + "import ssl\n", + "\n", + "# iRODS authentication information\n", + "host = \"unlock-icat.irods.surfsara.nl\"\n", + "port = \"1247\"\n", + "zone = \"unlock\"\n", + "\n", + "context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, cafile=None, capath=None, cadata=None)\n", + "\n", + "ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',\n", + " 'irods_client_server_policy': 'CS_NEG_REQUIRE',\n", + " 'irods_encryption_algorithm': 'AES-256-CBC',\n", + " 'irods_encryption_key_size': 32,\n", + " 'irods_encryption_num_hash_rounds': 16,\n", + " 'irods_encryption_salt_size': 8,\n", + " 'ssl_context': context}\n", + "\n", + "\n", + "session = iRODSSession(host = host,\n", + " port = port,\n", + " user = username,\n", + " password = password,\n", + " zone = zone,\n", + " **ssl_settings)\n", + "\n", + "coll = session.collections.get('/unlock')\n", + "logger.info(coll)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### List projects with access\n", + "This section lists all projects the current user has access to. This can be used to set the `IDENTIFIER` to the right project." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2021-03-29 15:25:11,386][INFO ] P_Deltares\n", + "[2021-03-29 15:25:11,407][INFO ] P_Deltares/I_vitens\n", + "[2021-03-29 15:25:11,408][INFO ] P_E2BN\n", + "[2021-03-29 15:25:11,424][INFO ] P_E2BN/I_PRED\n", + "[2021-03-29 15:25:11,425][INFO ] P_EXPLODIV\n", + "[2021-03-29 15:25:11,447][INFO ] P_EXPLODIV/I_BIOGAS\n", + "[2021-03-29 15:25:11,448][INFO ] P_EXPLODIV/I_UNCOUPLED\n", + "[2021-03-29 15:25:11,448][INFO ] P_FIRM-Project\n", + "[2021-03-29 15:25:11,467][INFO ] P_FIRM-Project/I_FIRM-Broilers\n", + "[2021-03-29 15:25:11,468][INFO ] P_MIB-Amplicon\n", + "[2021-03-29 15:25:11,484][INFO ] P_MIB-Amplicon/I_Mocks\n", + "[2021-03-29 15:25:11,485][INFO ] P_MIB-Amplicon/I_Poultry_16S_MIB\n", + "[2021-03-29 15:25:11,485][INFO ] P_SIAM\n", + "[2021-03-29 15:25:11,505][INFO ] P_SIAM/I_DbMM\n", + "[2021-03-29 15:25:11,506][INFO ] P_UNLOCK\n", + "[2021-03-29 15:25:11,528][INFO ] P_UNLOCK/I_CAMI\n", + "[2021-03-29 15:25:11,529][INFO ] P_UNLOCK/I_INVESTIGATION_TEST\n", + "[2021-03-29 15:25:11,530][INFO ] P_UNLOCK/I_SRA_Amplicon\n" + ] + } + ], + "source": [ + "projects = session.collections.get(\"/unlock/projects\").subcollections\n", + "for project in projects:\n", + " logger.info(project.name)\n", + " for investigation in project.subcollections:\n", + " if investigation.name.startswith(\"I_\"):\n", + " logger.info(project.name + \"/\" + investigation.name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Downloading metadata rdf files" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2021-03-29 15:25:11,666][INFO ] P_Deltares\n", + "[2021-03-29 15:25:12,294][INFO ] Config file created\n", + "[2021-03-29 15:25:13,300][INFO ] P_E2BN\n", + 
"[2021-03-29 15:25:15,387][INFO ] Config file created\n", + "[2021-03-29 15:25:49,299][INFO ] P_EXPLODIV\n", + "[2021-03-29 15:25:49,341][INFO ] P_FIRM-Project\n", + "[2021-03-29 15:25:49,372][INFO ] P_MIB-Amplicon\n", + "[2021-03-29 15:25:50,038][INFO ] Config file created\n", + "[2021-03-29 15:26:02,140][INFO ] Config file created\n", + "[2021-03-29 15:26:56,839][INFO ] Config file created\n", + "[2021-03-29 15:26:58,794][INFO ] P_SIAM\n", + "[2021-03-29 15:26:58,832][INFO ] P_UNLOCK\n", + "[2021-03-29 15:26:59,083][INFO ] Config file created\n", + "[2021-03-29 15:27:02,962][INFO ] Config file created\n" + ] + } + ], + "source": [ + "import irods.keywords as kw\nprojects = session.collections.get(\"/unlock/projects\").subcollections\n", + "for project in projects:\n", + " logger.info(project.name)\n", + " for investigation in project.subcollections:\n", + " if investigation.name.startswith(\"I_\"):\n", + " for obj in investigation.data_objects:\n", + " if obj.name.endswith(\".ttl\"):\n", + " if obj.name.startswith(\".\"): continue\n", + " local_path = \"./data/\" + investigation.path\n", + " destination = local_path + \"/\" + obj.name\n", + " os.makedirs(local_path, exist_ok=True)\n", + " options = {kw.FORCE_FLAG_KW:\"\"}\n", + " session.data_objects.get(obj.path, destination, **options)\n", + " endpoint_name = project.name + \"_\" + investigation.name\n", + " create_sparql_config(endpoint_name)\n", + " # Create the GraphDB repository from the generated config file\n", + " command = \"curl -X POST graphdb:7200/rest/repositories -H 'Accept: application/json' -H 'Content-Type: multipart/form-data' -F config=@\"+endpoint_name+\"_config.ttl\"\n", + " os.system(command)\n", + " # Load the downloaded turtle file into that repository\n", + " curl(endpoint_name, destination)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": 
"python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": false, + "sideBar": false, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": false, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/amplicon-biom.Rmd b/notebooks/amplicon-biom.Rmd index 3790f41..47defc3 100644 --- a/notebooks/amplicon-biom.Rmd +++ b/notebooks/amplicon-biom.Rmd @@ -52,7 +52,7 @@ WHERE { ?ISAsample unlock:assay ?assay . ?assay schema:identifier ?id . }" -results = SPARQL(url = "http://172.18.0.2:7200/repositories/P_E2BN_I_PRED", format = "csv", query = query)$results +results = SPARQL(url = "http://graphdb:7200/repositories/P_E2BN_I_PRED", format = "csv", query = query)$results results$predicate = gsub(x = results$predicate, pattern = ".*/", replacement = "") metadata <- dcast(results, id ~ predicate, value.var = "object") rownames(metadata) = metadata$id diff --git a/notebooks/kraken-biom.Rmd b/notebooks/kraken-biom.Rmd index 8c7a422..fc2d0bd 100644 --- a/notebooks/kraken-biom.Rmd +++ b/notebooks/kraken-biom.Rmd @@ -21,7 +21,7 @@ The preview shows you a rendered HTML copy of the contents of the editor. 
Conseq ```{r} # Load Kraken json object generated from jupyter hub -physeq = import_biom("data/P_Deltares-I_vitens_kraken.json") +physeq = import_biom("data/unlock/projects/P_Deltares/I_vitens/kraken.json") # Fix rank naming issue from kraken data colnames(tax_table(physeq)) = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species") # Correct input names diff --git a/notebooks/kraken-prepare.ipynb b/notebooks/kraken-prepare.ipynb new file mode 100644 index 0000000..177bd3a --- /dev/null +++ b/notebooks/kraken-prepare.ipynb @@ -0,0 +1,222 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "typical-perfume", + "metadata": {}, + "source": [ + "# Generate kraken biom files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bridal-niger", + "metadata": {}, + "outputs": [], + "source": [ + "# Install kraken-biom package\n", + "!pip install kraken-biom" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "pharmaceutical-pattern", + "metadata": {}, + "outputs": [], + "source": [ + "# Dependencies\n", + "import os\n", + "import logging\n", + "from irods.models import DataObject, DataObjectMeta, Collection, CollectionMeta\n", + "from irods.column import Criterion\n", + "import getpass\n", + "from shutil import copyfileobj\n", + "import pathlib\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "right-allocation", + "metadata": {}, + "outputs": [], + "source": [ + "# Generic functions\n", + "LOGGING_FORMAT = '[%(asctime)-15s][%(levelname)-7s] %(message)s'\n", + "\n", + "def get_logger(name):\n", + " logging.basicConfig(format=LOGGING_FORMAT)\n", + " logger = logging.getLogger(name)\n", + " logger.setLevel('DEBUG')\n", + " return logger\n", + "\n", + "logger = get_logger(\"jupyter\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "minor-square", + "metadata": {}, + "outputs": [], + "source": [ + "## iRODS authentication" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "recognized-camcorder", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Username\")\n", + "username=input()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "designed-surveillance", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Password\")\n", + "password = getpass.getpass()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "flexible-sunglasses", + "metadata": {}, + "outputs": [], + "source": [ + "from irods.session import iRODSSession\n", + "import ssl\n", + "\n", + "# iRODS authentication information\n", + "host = \"unlock-icat.irods.surfsara.nl\"\n", + "port = \"1247\"\n", + "zone = \"unlock\"\n", + "\n", + "context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, cafile=None, capath=None, cadata=None)\n", + "\n", + "ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',\n", + " 'irods_client_server_policy': 'CS_NEG_REQUIRE',\n", + " 'irods_encryption_algorithm': 'AES-256-CBC',\n", + " 'irods_encryption_key_size': 32,\n", + " 'irods_encryption_num_hash_rounds': 16,\n", + " 'irods_encryption_salt_size': 8,\n", + " 'ssl_context': context}\n", + "\n", + "\n", + "session = iRODSSession(host = host,\n", + " port = port,\n", + " user = username,\n", + " password = password,\n", + " zone = zone,\n", + " **ssl_settings)\n", + "\n", + "projects = session.collections.get(\"/unlock/projects\").subcollections\n", + "for project in projects:\n", + " for investigation in project.subcollections:\n", + " if investigation.name.startswith(\"I_\"):\n", + " logger.info(\"Access to project: \" + project.name + \" and investigation \" + investigation.name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "surrounded-assurance", + "metadata": {}, + "outputs": [], + "source": [ + "# Obtain all kraken files for a given project and investigation\n", + "# project = \"P_Deltares\"\n", + "# investigation = \"I_vitens\"\n", + 
"for project in projects:\n", + " for investigation in project.subcollections:\n", + " if investigation.name.startswith(\"I_\"):\n", + " logger.info(\"Access to project: \" + project.name + \" and investigation \" + investigation.name)\n", + " \n", + " results = session.query(Collection, DataObject).filter( \\\n", + " Criterion('like', DataObject.path,\"%_kraken2_report.txt\")).filter( \\\n", + " Criterion('like', Collection.name, investigation.path + \"/%\"))\n", + "\n", + " file_paths = set()\n", + "\n", + " # Download every report that is not already present locally\n", + " logger.info(\"Collecting all paths\")\n", + " index = 0\n", + " for index, r in enumerate(results):\n", + " file_path = r.get(Collection.name) + \"/\" + r.get(DataObject.name)\n", + " obj = session.data_objects.get(file_path)\n", + " output_dir = './data' + r.get(Collection.name)\n", + " pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)\n", + " output_file = './data' + file_path\n", + " file_paths.add(output_file)\n", + " if os.path.isfile(output_file): continue\n", + " with open(output_file, 'wb') as output, obj.open('r+') as input:\n", + " copyfileobj(input, output)\n", + "\n", + " # Skip investigations without kraken reports (enumerate's index is 0 for a single hit too)\n", + " if not file_paths: continue\n", + " \n", + " logger.info(\"Obtained \" + str(len(file_paths)) + \" kraken files\")\n", + " \n", + " # Creating biom file from kraken reports\n", + " command = \"kraken-biom \" + ' '.join(file_paths) + \" --fmt json -o ./data/\" + investigation.path + \"/kraken.json\"\n", + " logger.info(\"Command: \" + command)\n", + " os.system(command)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "comic-parker", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "corporate-knowing", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "matched-parallel", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + 
"kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} -- GitLab