Commit 6aead70b authored by Koehorst, Jasper

load into sparql endpoint function added

parent 51b719f0
%% Cell type:markdown id: tags:
# Downloading files from iRODS
%% Cell type:code id: tags:
```
import getpass
print("Username")
username = input()
```
%% Cell type:code id: tags:
```
print("Password")
password = getpass.getpass()
```
%% Cell type:markdown id: tags:
### iRODS dependencies
These dependencies are needed if they are not supplied by the docker file.
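If the Docker image does not already provide them, the client libraries can be installed from PyPI first. This is only a minimal sketch; the package names below are the assumed PyPI names for the imports used in the next cell (`python-irodsclient` for the `irods` module, plus `SPARQLWrapper` and `requests`).
%% Cell type:code id: tags:
```
# Install the iRODS and SPARQL client libraries (assumed PyPI package names)
!pip install python-irodsclient SPARQLWrapper requests
```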
%% Cell type:code id: tags:
```
# RDF Insertion
from SPARQLWrapper import SPARQLWrapper, JSON
# iRODS
from irods.session import iRODSSession
from irods.column import Criterion
from irods.models import DataObject, DataObjectMeta, Collection, CollectionMeta
# Python
import os
import shutil
# Logging
import logging
# Authentication curl
import requests
from requests.auth import HTTPBasicAuth

LOGGING_FORMAT = '[%(asctime)-15s][%(levelname)-7s] %(message)s'

def get_logger(name):
    logging.basicConfig(format=LOGGING_FORMAT)
    logger = logging.getLogger(name)
    logger.setLevel('DEBUG')
    return logger

logger = get_logger("jupyter")

# Configurations
%config IPCompleter.greedy=True
```
%% Cell type:markdown id: tags:
### iRODS connection
%% Cell type:code id: tags:
```
from irods.session import iRODSSession
import ssl

# iRODS authentication information
# username = ""
# password = ""
host = "unlock-icat.irods.surfsara.nl"
port = "1247"
zone = "unlock"

context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, cafile=None, capath=None, cadata=None)
ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',
                'irods_client_server_policy': 'CS_NEG_REQUIRE',
                'irods_encryption_algorithm': 'AES-256-CBC',
                'irods_encryption_key_size': 32,
                'irods_encryption_num_hash_rounds': 16,
                'irods_encryption_salt_size': 8,
                'ssl_context': context}

session = iRODSSession(host=host,
                       port=port,
                       user=username,
                       password=password,
                       zone=zone,
                       **ssl_settings)

coll = session.collections.get('/unlock')
print(coll)
logger.info(coll)
```
%% Cell type:markdown id: tags:
### List projects with access
This section lists all projects the current user has access to. This can be used to set the `IDENTIFIER` to the right project.
%% Cell type:code id: tags:
```
projects = session.collections.get("/unlock/projects").subcollections
for project in projects:
print(project.name)
logger.info(project.name)
```
%% Cell type:markdown id: tags:
### Project identifier selection
Write down the full code of the project you plan to analyse below.
%% Cell type:code id: tags:
```
# The variable is typically an investigation but can be a project / study / regex / etc...
# IDENTIFIER = "P_EXPLODIV"
IDENTIFIER = "P_FIRM-Project"
```
%% Cell type:markdown id: tags:
### Downloading amplicon RDF files
### Downloading metadata RDF files
%% Cell type:code id: tags:
```
import irods.keywords as kw

# Playground to get all 16S analysis files...
logger.info("Querying iRODS")
results = session.query(Collection, DataObject).filter( \
    Criterion('like', DataObject.path, "%" + IDENTIFIER + "%.ttl")).filter( \
    Criterion('like', DataObject.name, "%.ttl"))

# Collect the paths (and sizes) of all turtle files, skipping directory objects and the trash
file_paths = {}
logger.info("Collecting all paths")
for index, r in enumerate(results):
    file_path = r.get(Collection.name) + "/" + r.get(DataObject.name)
    if str(r.get(DataObject.name)).startswith("directory-"): continue
    if "/unlock/trash/" in file_path: continue
    file_paths[file_path] = {"size": r.get(DataObject.size)}
    repo = file_path.split("/")[file_path.split("/").index("projects") + 1]

# Download each file unless an up-to-date local copy already exists
logger.info("Retrieving " + str(len(file_paths)) + " files")
for index, file_path in enumerate(file_paths):
    obj = session.data_objects.get(file_path)
    path = obj.collection.path.replace("/unlock/projects/", "")
    os.makedirs(path, exist_ok=True)
    destination = path + "/" + obj.name
    if os.path.isfile(destination):
        size = os.path.getsize(destination)
    if not os.path.isfile(destination) or size != file_paths[file_path]["size"]:
        logger.info("Retrieving " + obj.name + " " + str(index) + " of " + str(len(file_paths)))
        options = {kw.FORCE_FLAG_KW: ""}
        session.data_objects.get(obj.path, destination, **options)

# Also retrieve the project level turtle files into the project folder
collection = session.collections.get('/'.join(obj.path.split("/")[:4]))
for file in collection.data_objects:
    if file.name.endswith(".ttl"):
        if os.path.isfile(repo + "/" + file.name):
            os.remove(repo + "/" + file.name)
        session.data_objects.get(file.path, repo)
        logger.info("Project file retrieved " + repo + "/" + file.name)

print("File downloading finished")
logger.info("File downloading finished")
```
%% Cell type:markdown id: tags:
## Load files into a remote SPARQL endpoint
%% Cell type:code id: tags:
```
def curl(filename):
    # Post a turtle file to the statements endpoint of the GraphDB repository
    url = 'http://nvme1.wurnet.nl:7200/repositories/FIRM-Project_FIRM-Broilers/statements'
    with open(filename, 'rb') as turtle_file:
        payload = turtle_file.read()
    headers = {'content-type': 'text/turtle'}
    r = requests.post(url, data=payload, headers=headers, auth=HTTPBasicAuth(username, password), timeout=100)
    return r

# We shall store all the file names in this list
filelist = []
for root, dirs, files in os.walk(IDENTIFIER):
    for file in files:
        # Append the file name to the list
        filelist.append(os.path.join(root, file))

# Keep track of the files that have already been loaded
loaded = set()
for index, name in enumerate(filelist):
    if not name.endswith(".ttl"):
        continue
    if name in loaded:
        continue
    if index % 10 == 0:
        print(index, name.split("/")[-1], "of", len(filelist))
    curl(name)
    loaded.add(name)
```
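%% Cell type:markdown id: tags:
To check that the upload worked, the triples in the remote repository can be counted with the `SPARQLWrapper` imported earlier. This is only a sketch: it assumes the query endpoint is the same URL as in the `curl` function above, without the `/statements` suffix, and reuses the same credentials.
%% Cell type:code id: tags:
```
# Count the triples in the remote repository (endpoint assumed from the curl function above)
sparql = SPARQLWrapper("http://nvme1.wurnet.nl:7200/repositories/FIRM-Project_FIRM-Broilers")
sparql.setCredentials(username, password)
sparql.setReturnFormat(JSON)
sparql.setQuery("SELECT (COUNT(*) AS ?triples) WHERE { ?s ?p ?o }")
result = sparql.query().convert()
print(result["results"]["bindings"][0]["triples"]["value"], "triples in the repository")
```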
%% Cell type:markdown id: tags:
## Creating a local repository in GraphDB
### RDF Repository
%% Cell type:code id: tags:
```
turtle = """
# RDF4J configuration template for a GraphDB Free repository
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>.
@prefix rep: <http://www.openrdf.org/config/repository#>.
@prefix sr: <http://www.openrdf.org/config/repository/sail#>.
@prefix sail: <http://www.openrdf.org/config/sail#>.
@prefix owlim: <http://www.ontotext.com/trree/owlim#>.
[] a rep:Repository ;
rep:repositoryID \"""" + repo + """\" ;
rdfs:label "GraphDB Free repository" ;
rep:repositoryImpl [
rep:repositoryType "graphdb:FreeSailRepository" ;
sr:sailImpl [
sail:sailType "graphdb:FreeSail" ;
owlim:base-URL "http://gbol.life/0.1/" ;
owlim:defaultNS "" ;
owlim:entity-index-size "10000000" ;
owlim:entity-id-size "32" ;
owlim:imports "" ;
owlim:repository-type "file-repository" ;
owlim:ruleset "rdfsplus-optimized" ;
owlim:storage-folder "storage" ;
owlim:enable-context-index "false" ;
owlim:enablePredicateList "true" ;
owlim:in-memory-literal-properties "true" ;
owlim:enable-literal-index "true" ;
owlim:check-for-inconsistencies "false" ;
owlim:disable-sameAs "false" ;
owlim:query-timeout "0" ;
owlim:query-limit-results "0" ;
owlim:throw-QueryEvaluationException-on-timeout "false" ;
owlim:read-only "false" ;
owlim:nonInterpretablePredicates "http://www.w3.org/2000/01/rdf-schema#label;http://www.w3.org/1999/02/22-rdf-syntax-ns#type;http://www.ontotext.com/owlim/ces#gazetteerConfig;http://www.ontotext.com/owlim/ces#metadataConfig" ;
]
].
"""
open("config.ttl", "w").write(turtle)
print("Config file created")
logger.info("Config file created")
```
%% Cell type:markdown id: tags:
### Loading RDF function (very fast)
%% Cell type:code id: tags:
```
# USE THE PRELOAD FUNCTION... Very fast :) but requires more memory (check the docker memory settings when the heap is a problem)
# The GraphDB java process has to be killed first; unfortunately the process name does not mention graphdb...
!pkill java
!./graphdb-free/bin/preload -x -s --force -c ./config.ttl -a 1 -b 1k -r ./$IDENTIFIER
# Run the "Starting RDF triple store" section even when this has not finished, so graphdb will start automagically
```
%% Cell type:markdown id: tags:
## Starting RDF triple store
%% Cell type:code id: tags:
```
import socket
import time

def start_graphdb():
    # Starting graphdb in daemon mode
    # This needs to be done using the system command as ! does not seem to work for daemon processes
    os.system("./graphdb-free/bin/graphdb -d")
    logger.info("Starting graphdb can take a few minutes depending on the size of the database")
    result = -1
    # Poll port 7200 until GraphDB accepts connections
    while result != 0:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        result = sock.connect_ex(('127.0.0.1', 7200))
        if result == 0:
            logger.info("Graphdb started")
        else:
            pass  # logger.info("Port is not open")
        sock.close()
        time.sleep(1)

start_graphdb()
```
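%% Cell type:markdown id: tags:
Once the port is open, a quick sanity check can confirm that the repository is visible. A minimal sketch, assuming the GraphDB Workbench REST API is served on the same port; the `/rest/repositories` path is an assumption, not part of the original notebook.
%% Cell type:code id: tags:
```
# List the repositories known to the local GraphDB instance (REST path assumed)
r = requests.get("http://localhost:7200/rest/repositories")
print(r.status_code)
print(r.text)
```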
%% Cell type:markdown id: tags:
### Less efficient but uses less memory
If loading the RDF files with the very fast method did not work due to memory constraints, the following code can be enabled to use a slower approach that needs less memory.
%% Cell type:code id: tags:
```
# Use the loadrdf function when not enough memory is available for the preload method
# The GraphDB java process has to be killed first; unfortunately the process name does not mention graphdb...
# Enable this
!pkill java
# Enable this
!./graphdb-free/bin/loadrdf --force -c ./config.ttl -m serial ./$IDENTIFIER
# Run the "Starting RDF triple store" section even when this has not finished, so graphdb will start automagically
start_graphdb()
```
%% Cell type:markdown id: tags:
### Way less efficient but uses less memory
%% Cell type:code id: tags:
```
# endpoint = "http://localhost:7200/repositories/"+repo+"/statements"
# sparql = SPARQLWrapper(endpoint)
# command = """curl -X POST --header "Content-Type:multipart/form-data" -F "config=@./config.ttl" "http://localhost:7200/rest/repositories\""""
# print(command)
# import subprocess
# os.system(command)
# command = "ls"
# subprocess.call(command, shell=True)
# for file_index, rdf in enumerate(rdf_loader):
# print("File",file_index + 1, "loaded", end="\r")
# command = """curl -X POST -H "Content-Type:application/x-turtle" -T """+rdf+" "+endpoint
# os.system(command)
```
%% Cell type:markdown id: tags:
### Use R scripts for analysis
See Amplicon.ipynb
%% Cell type:code id: tags:
```
```