Commit 6aead70b authored by Koehorst, Jasper

load into sparql endpoint function added

parent 51b719f0
%% Cell type:markdown id: tags:
# Downloading files from iRODS
%% Cell type:code id: tags:
```
import getpass
print("Username")
username = input()
```
%% Cell type:code id: tags:
```
print("Password")
password = getpass.getpass()
```
%% Cell type:markdown id: tags:
### iRODS dependencies
These dependencies are needed if they are not supplied by the docker file.
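If the Docker image does not already provide them, the client libraries can be installed from PyPI first. This is only a minimal sketch; the package names below are the assumed PyPI names for the imports used in the next cell (`python-irodsclient` for the `irods` module, plus `SPARQLWrapper` and `requests`).
%% Cell type:code id: tags:
```
# Install the iRODS and SPARQL client libraries (assumed PyPI package names)
!pip install python-irodsclient SPARQLWrapper requests
```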
%% Cell type:code id: tags:
```
# RDF Insertion
from SPARQLWrapper import SPARQLWrapper, JSON
# iRODS
from irods.session import iRODSSession
from irods.column import Criterion
from irods.models import DataObject, DataObjectMeta, Collection, CollectionMeta
# Python
import os
import shutil
# Logging
import logging
# Authentication curl
import requests
from requests.auth import HTTPBasicAuth

LOGGING_FORMAT = '[%(asctime)-15s][%(levelname)-7s] %(message)s'

def get_logger(name):
    logging.basicConfig(format=LOGGING_FORMAT)
    logger = logging.getLogger(name)
    logger.setLevel('DEBUG')
    return logger

logger = get_logger("jupyter")

# Configurations
%config IPCompleter.greedy=True
```
%% Cell type:markdown id: tags:
### iRODS connection
%% Cell type:code id: tags:
```
from irods.session import iRODSSession
import ssl

# iRODS authentication information
# username = ""
# password = ""
host = "unlock-icat.irods.surfsara.nl"
port = "1247"
zone = "unlock"

context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, cafile=None, capath=None, cadata=None)
ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',
                'irods_client_server_policy': 'CS_NEG_REQUIRE',
                'irods_encryption_algorithm': 'AES-256-CBC',
                'irods_encryption_key_size': 32,
                'irods_encryption_num_hash_rounds': 16,
                'irods_encryption_salt_size': 8,
                'ssl_context': context}

session = iRODSSession(host=host,
                       port=port,
                       user=username,
                       password=password,
                       zone=zone,
                       **ssl_settings)

coll = session.collections.get('/unlock')
print(coll)
logger.info(coll)
```
%% Cell type:markdown id: tags:
### List projects with access
This section lists all projects the current user has access to. This can be used to set the `IDENTIFIER` to the right project.
%% Cell type:code id: tags:
```
projects = session.collections.get("/unlock/projects").subcollections
for project in projects:
print(project.name)
logger.info(project.name)
```
%% Cell type:markdown id: tags:
### Project identifier selection
Write down the full code of the project you plan to analyse below.
%% Cell type:code id: tags:
```
# The variable is typically an investigation but can be a project / study / regex / etc...
# IDENTIFIER = "P_EXPLODIV"
IDENTIFIER = "P_FIRM-Project"
```
%% Cell type:markdown id: tags:
### Downloading amplicon RDF files
### Downloading metadata RDF files
%% Cell type:code id: tags:
```
import irods.keywords as kw

# Playground to get all 16S analysis files...
logger.info("Querying iRODS")
results = session.query(Collection, DataObject).filter( \
    Criterion('like', DataObject.path, "%" + IDENTIFIER + "%.ttl")).filter( \
    Criterion('like', DataObject.name, "%.ttl"))

# Collect the paths (and sizes) of all turtle files, skipping directory objects and the trash
file_paths = {}
logger.info("Collecting all paths")
for index, r in enumerate(results):
    file_path = r.get(Collection.name) + "/" + r.get(DataObject.name)
    if str(r.get(DataObject.name)).startswith("directory-"): continue
    if "/unlock/trash/" in file_path: continue
    file_paths[file_path] = {"size": r.get(DataObject.size)}
    repo = file_path.split("/")[file_path.split("/").index("projects") + 1]

# Download each file unless an up-to-date local copy already exists
logger.info("Retrieving " + str(len(file_paths)) + " files")
for index, file_path in enumerate(file_paths):
    obj = session.data_objects.get(file_path)
    path = obj.collection.path.replace("/unlock/projects/", "")
    os.makedirs(path, exist_ok=True)
    destination = path + "/" + obj.name
    if os.path.isfile(destination):
        size = os.path.getsize(destination)
    if not os.path.isfile(destination) or size != file_paths[file_path]["size"]:
        logger.info("Retrieving " + obj.name + " " + str(index) + " of " + str(len(file_paths)))
        options = {kw.FORCE_FLAG_KW: ""}
        session.data_objects.get(obj.path, destination, **options)

# Also retrieve the project level turtle files into the project folder
collection = session.collections.get('/'.join(obj.path.split("/")[:4]))
for file in collection.data_objects:
    if file.name.endswith(".ttl"):
        if os.path.isfile(repo + "/" + file.name):
            os.remove(repo + "/" + file.name)
        session.data_objects.get(file.path, repo)
        logger.info("Project file retrieved " + repo + "/" + file.name)

print("File downloading finished")
logger.info("File downloading finished")
```
%% Cell type:markdown id: tags:
## Load files into a remote SPARQL endpoint
%% Cell type:code id: tags:
```
def curl(filename):
    # Post a turtle file to the statements endpoint of the GraphDB repository
    url = 'http://nvme1.wurnet.nl:7200/repositories/FIRM-Project_FIRM-Broilers/statements'
    with open(filename, 'rb') as turtle_file:
        payload = turtle_file.read()
    headers = {'content-type': 'text/turtle'}
    r = requests.post(url, data=payload, headers=headers, auth=HTTPBasicAuth(username, password), timeout=100)
    return r

# We shall store all the file names in this list
filelist = []
for root, dirs, files in os.walk(IDENTIFIER):
    for file in files:
        # Append the file name to the list
        filelist.append(os.path.join(root, file))

# Keep track of the files that have already been loaded
loaded = set()
for index, name in enumerate(filelist):
    if not name.endswith(".ttl"):
        continue
    if name in loaded:
        continue
    if index % 10 == 0:
        print(index, name.split("/")[-1], "of", len(filelist))
    curl(name)
    loaded.add(name)
```
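%% Cell type:markdown id: tags:
To check that the upload worked, the triples in the remote repository can be counted with the `SPARQLWrapper` imported earlier. This is only a sketch: it assumes the query endpoint is the same URL as in the `curl` function above, without the `/statements` suffix, and reuses the same credentials.
%% Cell type:code id: tags:
```
# Count the triples in the remote repository (endpoint assumed from the curl function above)
sparql = SPARQLWrapper("http://nvme1.wurnet.nl:7200/repositories/FIRM-Project_FIRM-Broilers")
sparql.setCredentials(username, password)
sparql.setReturnFormat(JSON)
sparql.setQuery("SELECT (COUNT(*) AS ?triples) WHERE { ?s ?p ?o }")
result = sparql.query().convert()
print(result["results"]["bindings"][0]["triples"]["value"], "triples in the repository")
```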
%% Cell type:markdown id: tags:
## Creating a local repository in GraphDB
### RDF Repository
%% Cell type:code id: tags:
```
turtle = """
# RDF4J configuration template for a GraphDB Free repository
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>.
@prefix rep: <http://www.openrdf.org/config/repository#>.
@prefix sr: <http://www.openrdf.org/config/repository/sail#>.
@prefix sail: <http://www.openrdf.org/config/sail#>.
@prefix owlim: <http://www.ontotext.com/trree/owlim#>.
[] a rep:Repository ;
rep:repositoryID \"""" + repo + """\" ;
rdfs:label "GraphDB Free repository" ;
rep:repositoryImpl [
rep:repositoryType "graphdb:FreeSailRepository" ;
sr:sailImpl [
sail:sailType "graphdb:FreeSail" ;
owlim:base-URL "http://gbol.life/0.1/" ;
owlim:defaultNS "" ;
owlim:entity-index-size "10000000" ;
owlim:entity-id-size "32" ;
owlim:imports "" ;
owlim:repository-type "file-repository" ;
owlim:ruleset "rdfsplus-optimized" ;
owlim:storage-folder "storage" ;
owlim:enable-context-index "false" ;
owlim:enablePredicateList "true" ;
owlim:in-memory-literal-properties "true" ;
owlim:enable-literal-index "true" ;
owlim:check-for-inconsistencies "false" ;
owlim:disable-sameAs "false" ;
owlim:query-timeout "0" ;
owlim:query-limit-results "0" ;
owlim:throw-QueryEvaluationException-on-timeout "false" ;
owlim:read-only "false" ;
owlim:nonInterpretablePredicates "http://www.w3.org/2000/01/rdf-schema#label;http://www.w3.org/1999/02/22-rdf-syntax-ns#type;http://www.ontotext.com/owlim/ces#gazetteerConfig;http://www.ontotext.com/owlim/ces#metadataConfig" ;
]
].
"""
open("config.ttl", "w").write(turtle)
print("Config file created")
logger.info("Config file created")
```
%% Cell type:markdown id: tags:
### Loading RDF function (very fast)
%% Cell type:code id: tags:
```
# USE THE PRELOAD FUNCTION... Very fast :) but requires more memory (check the docker memory settings when the heap is a problem)
# The GraphDB java process has to be killed first; unfortunately the process name does not mention graphdb...
!pkill java
!./graphdb-free/bin/preload -x -s --force -c ./config.ttl -a 1 -b 1k -r ./$IDENTIFIER
# Run the "Starting RDF triple store" section even when this has not finished, so graphdb will start automagically
```
%% Cell type:markdown id: tags:
## Starting RDF triple store
%% Cell type:code id: tags:
```
import socket
import time

def start_graphdb():
    # Starting graphdb in daemon mode
    # This needs to be done using the system command as ! does not seem to work for daemon processes
    os.system("./graphdb-free/bin/graphdb -d")
    logger.info("Starting graphdb can take a few minutes depending on the size of the database")
    result = -1
    # Poll port 7200 until GraphDB accepts connections
    while result != 0:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        result = sock.connect_ex(('127.0.0.1', 7200))
        if result == 0:
            logger.info("Graphdb started")
        else:
            pass  # logger.info("Port is not open")
        sock.close()
        time.sleep(1)

start_graphdb()
```
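%% Cell type:markdown id: tags:
Once the port is open, a quick sanity check can confirm that the repository is visible. A minimal sketch, assuming the GraphDB Workbench REST API is served on the same port; the `/rest/repositories` path is an assumption, not part of the original notebook.
%% Cell type:code id: tags:
```
# List the repositories known to the local GraphDB instance (REST path assumed)
r = requests.get("http://localhost:7200/rest/repositories")
print(r.status_code)
print(r.text)
```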
%% Cell type:markdown id: tags:
### Less efficient but uses less memory
If loading the RDF files with the very fast method did not work due to memory constraints, the following code can be enabled to use a slower approach that needs less memory.
%% Cell type:code id: tags:
```
# Use the loadrdf function when not enough memory is available for the preload method
# The GraphDB java process has to be killed first; unfortunately the process name does not mention graphdb...
# Enable this
!pkill java
# Enable this
!./graphdb-free/bin/loadrdf --force -c ./config.ttl -m serial ./$IDENTIFIER
# Run the "Starting RDF triple store" section even when this has not finished, so graphdb will start automagically
start_graphdb()
```
%% Cell type:markdown id: tags:
### Way less efficient but uses less memory
%% Cell type:code id: tags:
```
# endpoint = "http://localhost:7200/repositories/"+repo+"/statements"
# sparql = SPARQLWrapper(endpoint)
# command = """curl -X POST --header "Content-Type:multipart/form-data" -F "config=@./config.ttl" "http://localhost:7200/rest/repositories\""""
# print(command)
# import subprocess
# os.system(command)
# command = "ls"
# subprocess.call(command, shell=True)
# for file_index, rdf in enumerate(rdf_loader):
# print("File",file_index + 1, "loaded", end="\r")
# command = """curl -X POST -H "Content-Type:application/x-turtle" -T """+rdf+" "+endpoint
# os.system(command)
```
%% Cell type:markdown id: tags:
### Use R scripts for analysis
See Amplicon.ipynb
%% Cell type:code id: tags:
```
```