Commit 63a099d9 authored by Hans van den Heuvel

Created dataconversion; made the API simpler

parent 5da6afe7
@@ -3,6 +3,4 @@
# User-specific files
# Dirs
Input/
Output/
__pycache__/
\ No newline at end of file
@@ -3,7 +3,7 @@
# Phase 0. Initialization
# Parsing arguments and reading the files.
#
import mcra
import dataconversion
import pandas as pd
from datetime import datetime
import textwrap
@@ -17,7 +17,7 @@ def print_as_link(text):
# These are the files we work with
# Create list
dataset = mcra.DataSet(
dataset = dataconversion.DataSet(
opening='(c) ' + datetime.now().strftime('%Y')
+ ' Biometris, Wageningen University and Research.',
description='Converts the EFSA CAPEG database Excel sheet into MCRA '
@@ -60,12 +60,11 @@ dataset.add(
default_dir='Output')
#
dataset.add(
name='compounds',
name='substances',
short_argument='-s',
help='The (output) substances file - '
+ 'format: csv (Comma Separated).',
default_name='Compounds.zip',
# default_name='Compounds.csv',
default_name='Substances.csv',
default_dir='Output')
#
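For orientation, a rough standalone sketch of what such an add() call boils down to in plain argparse; the long option follows the '--<name>_file' pattern used further down in dataconversion.py, and the default path is assembled from default_dir and default_name (an illustration, not the actual DataSet code):

import os
from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument(
    '-s', '--substances_file',
    default=os.path.join('Output', 'Substances.csv'),
    help='The (output) substances file (default: %(default)s).')
args = parser.parse_args(['-s', 'MySubstances.csv'])
print(args.substances_file)  # MySubstances.csv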
@@ -74,27 +73,35 @@ dataset.add(
dataset.init()
# To abbreviate
capeg = dataset.capeg.sheet
# We need to clean up the table first
# Remove all rows where casNumber is 'na'
capeg.drop(capeg.loc[capeg['casNumber'] == 'na'].index, inplace=True)
# FIRST The effects table
# Add the fields for the effects table
capeg.mcra.addcolumn(
{'idEffect', 'Name', 'Description', 'Reference'})
{'idEffect', 'Name', 'Description', 'Reference', 'targetL1'})
# Create extra column for proper CAG1 target names
capeg['targetL1'] = capeg['target_CAG1'].str.split().str[0].str.strip()
capeg['targetL1'].replace('Bones', 'Skeleton', inplace=True)
capeg['targetL1'].replace('Bone', 'Bone marrow', inplace=True)
# Create tempcopy
capeg2 = capeg.copy(deep=True)
# Fill the idEffect for acute (capeg) and chronic (capeg2), e.g. L1-Liver-Acute
capeg['idEffect'] = 'L1-' + \
capeg['target_CAG1'].str.split().str[0].str.strip() + '-Acute'
capeg['targetL1'].str.split().str[0].str.strip() + '-Acute'
capeg2['idEffect'] = 'L1-' + \
capeg2['target_CAG1'].str.split().str[0].str.strip() + '-Chronic'
capeg2['targetL1'].str.split().str[0].str.strip() + '-Chronic'
# Description
capeg['Description'] = 'Acute adverse effects on ' + \
capeg['target_CAG1'].str.lower() + '.'
capeg['targetL1'].str.lower() + '.'
capeg2['Description'] = 'Chronic adverse effects on ' + \
capeg2['target_CAG1'].str.lower() + '.'
capeg2['targetL1'].str.lower() + '.'
# Combine the sheets, append the second after the first
capeg = capeg.append(capeg2, ignore_index=True)
# Set the name
capeg['Name'] = capeg['target_CAG1']
capeg['Name'] = capeg['targetL1']
# Set the reference
capeg['Reference'] = ''
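A minimal standalone sketch of the acute/chronic duplication above; it uses pd.concat rather than the DataFrame.append call in the script, since append is deprecated in newer pandas (an assumption about the pandas version, not part of this commit):

import pandas as pd

# Tiny stand-in for the CAPEG sheet; column names follow the script.
capeg = pd.DataFrame({'target_CAG1': ['Liver toxicity', 'Bones malformation']})
# First word of the level-1 target, with the same renames as above.
capeg['targetL1'] = capeg['target_CAG1'].str.split().str[0].str.strip()
capeg['targetL1'] = capeg['targetL1'].replace({'Bones': 'Skeleton', 'Bone': 'Bone marrow'})
capeg2 = capeg.copy(deep=True)
capeg['idEffect'] = 'L1-' + capeg['targetL1'] + '-Acute'
capeg2['idEffect'] = 'L1-' + capeg2['targetL1'] + '-Chronic'
capeg = pd.concat([capeg, capeg2], ignore_index=True)
print(capeg[['idEffect', 'targetL1']])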
@@ -105,10 +112,10 @@ effects_header = ['idEffect', 'CodeSystem', 'Name', 'Description',
'KeyEventCell', 'AOPwikiKE', 'Reference']
dataset.effects.sheet = capeg.drop_duplicates(
subset = ['idEffect'], ignore_index=True)[
['idEffect','Name','Description','Reference']]
subset=['idEffect'], ignore_index=True)[
['idEffect', 'Name', 'Description', 'Reference']]
dataset.effects.close(header=effects_header)
dataset.effects.sheet.mcra.keepcolumn(effects_header)
# SECOND The Assessment group membership models table
# Remove and add used columns to clear them
@@ -119,11 +126,11 @@ capeg.mcra.addcolumn(['id', 'Name', 'Description', 'Reference'])
capeg['id'] = 'AG1-' + \
capeg['idEffect'].str.split('-').str[1:].str.join('-')
# Name
capeg['Name'] = 'CAG ' + capeg['target_CAG1'].str.lower()
capeg['Name'] = 'CAG ' + capeg['targetL1'].str.lower()
# Description
capeg['Description'] = \
'Cumulative assessment group for adverse effects on ' + \
capeg['target_CAG1'].str.lower() + '.'
capeg['targetL1'].str.lower() + '.'
# Reference
capeg['Reference'] = 'https://doi.org/10.2903/sp.efsa.2012.EN-269'
@@ -132,10 +139,10 @@ agmm_header = ['id', 'Name', 'Description', 'idEffect', 'Accuracy',
'Sensitivity', 'Specificity', 'Reference']
dataset.agmm.sheet = capeg.drop_duplicates(
subset = ['id'], ignore_index=True)[
subset=['id'], ignore_index=True)[
['id', 'idEffect', 'Name', 'Description', 'Reference']]
dataset.agmm.close(header=agmm_header)
dataset.agmm.sheet.mcra.keepcolumn(agmm_header)
# THIRD The Substances table
# Remove and add used columns to clear them
@@ -148,13 +155,14 @@ capeg['idSubstance'] = capeg['casNumber']
capeg['Name'] = capeg['chemicalName']
# Done, now wrap this table up
compounds_header = ['idSubstance', 'Name', 'Description',
'ConcentrationUnit', 'CramerClass', 'MolecularMass']
substances_header = ['idSubstance', 'Name', 'Description',
'ConcentrationUnit', 'CramerClass', 'MolecularMass']
dataset.compounds.sheet = capeg.drop_duplicates(
subset = ['idSubstance'], ignore_index=True)[
dataset.substances.sheet = capeg.drop_duplicates(
subset=['idSubstance'], ignore_index=True)[
['idSubstance', 'Name']]
dataset.compounds.close(header=compounds_header)
dataset.substances.sheet.mcra.keepcolumn(substances_header)
# FOURTH The Assessment group memberships table
# Remove and add used columns to clear them
@@ -170,7 +178,7 @@ agm_header = ['idGroupMembershipModel', 'idSubstance', 'GroupMembership']
dataset.agm.sheet = capeg[agm_header].drop_duplicates()
dataset.agm.close(header=agm_header)
dataset.agm.sheet.mcra.keepcolumn(agm_header)
# DONE
dataset.close()
dataset.close(file_report=True)
# Ignore everything in this directory
*
# Except this file
!.gitignore
\ No newline at end of file
# Ignore everything in this directory
*
# Except this file
!.gitignore
\ No newline at end of file
@@ -14,10 +14,13 @@ import numpy as np
import math
import sys
import textwrap
import getpass
# For debugging purposes
# from objbrowser import browse
PY_INDENT = ' '
thisyear = datetime.now().strftime('%Y')
@pd.api.extensions.register_dataframe_accessor('mcra')
@@ -50,9 +53,20 @@ class McraAccessor:
To easily add a bunch of empty columns
'''
for col in columnnames:
if not col in self._obj.columns:
if col not in self._obj.columns:
self._obj[col] = ''
def keepcolumn(self, columnnames):
'''
To easily format to a fixed set of columns
'''
# Add missing ones, making them empty
for col in columnnames:
if col not in self._obj.columns:
self._obj[col] = ''
# Only retain the ones given.
self._obj = self._obj[columnnames]
def splitjoin(self, name, split, join,
split_sep='-', right_split=True, join_sep='-'):
'''
@@ -67,7 +81,7 @@ class McraAccessor:
df[split] = self._obj[split].str.rsplit(split_sep, n=1).str[0]
df[name] = df.loc[:, (join, split)].apply(
lambda x: '-'.join(x.dropna()), axis=1)
lambda x: join_sep.join(x.dropna()), axis=1)
df = df.drop([join, split], axis=1)
# Not ideal yet, but slightly better than it used to be....
self._obj = self._obj.merge(df, left_index=True, right_index=True)
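A standalone sketch of what the new keepcolumn accessor is meant to do, written as a plain function for illustration (the join_sep change above simply makes splitjoin honour a non-default separator):

import pandas as pd

def keepcolumn(df, columnnames):
    # Pad missing columns with '' and restrict to the given order,
    # mirroring McraAccessor.keepcolumn above.
    for col in columnnames:
        if col not in df.columns:
            df[col] = ''
    return df[columnnames]

df = pd.DataFrame({'idEffect': ['L1-Liver-Acute'], 'Extra': ['x']})
print(keepcolumn(df, ['idEffect', 'Name', 'Description', 'Reference']))
# 'Name', 'Description' and 'Reference' come out as empty columns; 'Extra' is dropped.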
@@ -228,43 +242,31 @@ class DataSheet:
self.sheet = None
self.checksum = checksum
self.properties = ''
self._report = ''
self.report = ''
self.direction = direction
self.autoload = autoload
self.closed = False
@property
def report(self):
return self._report
@report.setter
def report(self, report):
'''
Setting the report property adds text
'''
self._report = self._report+report
def make_report(self):
temp = self._report
def get_report(self):
report = ''
report += '* {dir} file: {file}\n'.format(
dir=self.direction,
file=os.path.split(self.file.path)[1])
report += textwrap.indent(
'* [{path}]({path})\n'.format(path=self.file.path), PY_INDENT)
report += textwrap.indent(
'* {props}\n'.format(props=self.properties), PY_INDENT)
report += textwrap.indent(
'* Modified: {mod}\n'.format(mod=self.file.modified), PY_INDENT)
report += textwrap.indent(
'* File size: {size_str} ({size} B)\n'.format(
size_str=self.file.size_string, size=self.file.size),
PY_INDENT)
report += textwrap.indent(
'* Hash: {hash}\n\n'.format(hash=self.file.hash), PY_INDENT)
self._report = report+temp
def clear_report(self):
self._report = ''
if self.direction == 'Input' or \
(self.direction == 'Output' and self.closed):
report += '* {dir} file: {file}\n'.format(
dir=self.direction, file=os.path.split(self.file.path)[1])
report += textwrap.indent(
'* [{path}]({path})\n'.format(path=self.file.path), PY_INDENT)
report += textwrap.indent(
'* {props}\n'.format(props=self.properties), PY_INDENT)
report += textwrap.indent(
'* Modified: {mod}\n'.format(mod=self.file.modified),
PY_INDENT)
report += textwrap.indent(
'* File size: {size_str} ({size} B)\n'.format(
size_str=self.file.size_string,
size=self.file.size), PY_INDENT)
report += textwrap.indent(
'* Hash: {hash}\n'.format(hash=self.file.hash), PY_INDENT)
return report
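# Schematically, the returned fragment is a small Markdown bullet list
# (placeholders only, sub-items indented with PY_INDENT), e.g.:
#   * Input file: <file name>
#       * [<path>](<path>)
#       * <properties>
#       * Modified: <modification time>
#       * File size: <readable size> (<size> B)
#       * Hash: <file hash>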
def update_properties(self):
if self.sheet is not None:
@@ -301,7 +303,7 @@ class DataSheet:
elif self.file.extension == '.xls':
# Suppress warnings
wb = xlrd.open_workbook(self.file.path,
logfile=open(os.devnull, 'w'))
logfile=open(os.devnull, 'w'))
self.sheet = pd.read_excel(wb, engine='xlrd')
elif self.file.extension == '.md':
f = open(self.file.path, 'r')
@@ -309,9 +311,9 @@ class DataSheet:
f.close()
else:
# Error here
print(' COULD NOT READ {file}'.format(file=self.file.path))
print(' COULD NOT READ {file} - unknown extension.'.format(
file=self.file.path))
self.update_properties()
self.make_report()
def save(self, **kwargs):
if self.file.extension == '.csv':
@@ -333,13 +335,14 @@ class DataSheet:
self.file.path,
sheet_name=self.file.default_base,
**kwargs)
else:
print(' COULD NOT WRITE {file} - unknown extension.'.format(
file=self.file.path))
self.update_properties()
def close(self, header=False, auto_report=True, also_save=True):
def close(self, header=False, auto_report=False, also_save=True):
'''
If auto_report is False, no report on the object will be made.
If the report contains no content, it will not be created as a file.
If however, you added something to the report, it WILL be created.
If auto_report is False, no automatic report will be made.
'''
if header:
# Make a sheet with the specified header in that order
@@ -347,14 +350,16 @@ class DataSheet:
self.sheet = self.sheet[header]
if also_save:
self.save()
self.closed = True
self.file.update()
self.update_properties()
if auto_report:
self.make_report()
if len(self.report) > 0:
# Save report
with open(self.file.reportpath, 'w+') as f:
f.write(self.report)
self.report += self.get_report()
# We are no longer creating an md-report per file
# if len(self.report) > 0:
# # Save report
# with open(self.file.reportpath, 'w+') as f:
# f.write(self.report)
print('Output file: {file}; {props}'.format(
file=self.file.path,
props=self.properties))
@@ -365,6 +370,10 @@ class DataSet:
self.args = None
self.parser = ArgumentParser(
description=description, epilog=epilog)
self.parser.add_argument(
'-r', '--report', nargs='?',
const='Output\\Report.md',
help='Creates a report file (default: %(const)s).')
# The verbosity argument will accept: -v, or -vv, -vvv etc.
# Set default to 1, so that basic output will always appear.
self.parser.add_argument(
@@ -373,11 +382,22 @@ class DataSet:
self.parser.add_argument(
'-x', '--example', action='store_const', const='Example',
help='Uses input files from the %(const)s subdir.')
self.parser.add_argument(
'-z', '--zip', nargs='?', const='Output\\Output.zip',
help='Creates a zip file containing all output' +
' (default: %(const)s).')
if '-v' in sys.argv or '--verbosity' in sys.argv:
print(opening)
self.list = []
# Whether or not to create a zip file
self.zip = None
# The report for the entire dataset
self.report = ''
self.runtime = datetime.now().strftime('%H:%M:%S, %d %b %Y')
self.runcommand = ' '.join(sys.argv)
self.runarguments = ' '.join(sys.argv[1:])
self.scriptname = os.path.split(sys.argv[0])[1]
self.runuser = getpass.getuser()
# It is useful to be able to iterate over all the datasheets.
# Basically, avoid using .list. in all DataSet references.
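The -r/--report and -z/--zip flags added above rely on argparse's nargs='?' together with const, so the flag can be given bare or with an explicit path; a minimal sketch of that pattern in isolation:

from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument('-r', '--report', nargs='?', const='Output\\Report.md',
                    help='Creates a report file (default: %(const)s).')
print(parser.parse_args([]).report)               # None: no report requested
print(parser.parse_args(['-r']).report)           # Output\Report.md (the const)
print(parser.parse_args(['-r', 'My.md']).report)  # My.md (explicit value)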
@@ -412,7 +432,7 @@ class DataSet:
# But we must do some bookkeeping
self.list.append(name)
long_argument = '--'+name+'_file'
long_argument = '--' + name + '_file'
if type(help) == str and help is not SUPPRESS:
help = help + ' (default: {default})'.format(
default=default_name)
@@ -500,34 +520,75 @@ class DataSet:
else:
# It is an Output file
base, ext = os.path.splitext(datasetfilename)
if ext == '.zip':
# In case of zip file, we will make a csv
datasetfilename = base + '.csv'
# and also put everything into a zip file
dataset.file.suggest(datasetfilename)
dataset.update_properties()
basezip, extzip = os.path.splitext(dataset.file.path)
dataset.file.suggest(datasetfilename)
dataset.update_properties()
if self.args.zip:
# Create a zip file containing everything
basezip, extzip = os.path.splitext(self.args.zip)
# Setting self.zip indicates creating a zip file
self.zip = basezip+'.zip'
else:
dataset.file.suggest(datasetfilename)
dataset.update_properties()
def close(self):
def save(self):
for data in self:
if data.direction == 'Output':
if not data.closed:
data.close(auto_report=False, also_save=True)
def close(self, file_report=False, save=True):
'''
Method to close the dataset.
Most importantly, save the output files.
'''
report_content = ''
if file_report:
report_content += f'* Script: {self.scriptname}\n'
report_content += textwrap.indent(
f'* Command line: {self.runcommand}\n', PY_INDENT)
report_content += textwrap.indent(
f'* Arguments: {self.runarguments}\n', PY_INDENT)
report_content += textwrap.indent(
f'* Executed at: {self.runtime}\n', PY_INDENT)
report_content += textwrap.indent(
f'* Executed by: {self.runuser}\n', PY_INDENT)
for data in self:
if data.direction == 'Input':
report_content += data.get_report()
# Closing every sheet first
for data in self:
if data.direction == 'Output':
if not data.closed:
data.close(auto_report=file_report, also_save=save)
if self.args.report:
# Collect reports per sheet.
for data in self:
report_content += data.report
if len(report_content) > 0:
# Report contains information.
if len(self.report) > 0:
self.report += '\n'
self.report += report_content
if len(self.report) > 0:
# Save report
with open(self.args.report, 'w+') as f:
f.write(self.report)
self.verbose(
1,
'Output file: {file}, containing a report on the output.'.format(
file=self.args.report))
if self.zip:
# All output files will be added to the zip file
self.verbose(
1,
'Output file: {file}, containing all output.'.format(
file=self.zip))
zip = zipfile.ZipFile(self.zip, 'w')
for data in self:
if data.direction == 'Output':
filename = os.path.split(data.file.reportpath)[1]
zip.write(data.file.reportpath, filename)
filename = os.path.split(data.file.path)[1]
zip.write(data.file.path, filename)
if self.args.report:
filename = os.path.split(self.args.report)[1]
zip.write(self.args.report, filename)
zip.close()
self.verbose(
1,
'Output file: {file}, containing all output.'.format(
file=self.zip))
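In isolation, the zip step at the end of close() amounts to standard zipfile usage; a small sketch with hypothetical file names:

import os
import zipfile

output_files = ['Output/Substances.csv', 'Output/Effects.csv']  # hypothetical
with zipfile.ZipFile('Output/Output.zip', 'w') as zf:
    for path in output_files:
        # Store each file under its bare name, as DataSet.close does.
        zf.write(path, os.path.split(path)[1])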
@@ -436,5 +436,4 @@ with pd.ExcelWriter(dataset.report.file.path, mode='a') as writer:
sheet_name='Substances')
dataset.report.close(auto_report=False, also_save=False)
dataset.close()
@@ -132,7 +132,7 @@ header = [
'idFoodUnProcessed', 'FoodUnprocessedName',
'idProcessingType', 'ProcessingName',
'idSubstance', 'SubstanceName',
'FoodProcessedName',
'FoodProcessedName',
'Nominal', 'Upper',
'NominalUncertaintyUpper', 'UpperUncertaintyUpper',
'Study Reference'
......
#!/usr/bin/python
#############################################################################
# Phase 0. Initialization
# Parsing arguments and reading the files.
#
from dataconversion import DataSet, PY_INDENT, thisyear
import pandas as pd
import textwrap
# These are the files we work with
# Create list
dataset = DataSet(
opening='(c) ' + thisyear +
' Biometris, Wageningen University and Research.',
description='Creates an MCRA dataset from the ' +
'Processing Factors database on EFSA Zenodo.',
epilog='For example: use %(prog)s -v -x for a verbose example.')
#
# URL source file
efsa_url = 'https://zenodo.org/record/1488653/files/' \
+ 'EU_Processing_Factors_db_P.xlsx.xlsx?download=1'
#
# The input files
dataset.add(
name='efsa',
short_argument='-e',
help='The EFSA Zenodo Excel sheet (.xlsx); either file or URL. ',
checksum='f816bf3928431d54f9d15fb134cc9106',
default_name=efsa_url,
default_dir='Input',
direction='Input',
autoload=False) # No autoload, because sheet is complex
#
# The output files
dataset.add(
name='processing_factor',
short_argument='-p',
help='The (output) processing factor file - '
+ 'format: csv (Comma Separated).',
default_name='ProcessingFactors.csv',
default_dir='Output')
#
dataset.add(
name='references',
short_argument='-f',
help='The (output) references file - '
+ 'format: csv (Comma Separated).',
default_name='References.csv',
default_dir='Output')
#
#############################################################################
# Phase 1. Load data
dataset.init()
# Load the data
# Manually load the EFSA sheet, because the data is in a non-trivial place
efsa_sheet = 2
efsa_version = pd.read_excel(
dataset.efsa.file.path, sheet_name=efsa_sheet,
nrows=1, header=None).iloc[0, 0]
dataset.efsa.load(sheet_name=efsa_sheet, header=4)
dataset.verbose(1, 'Input file : {file}; {version}; {props}'.format(
file=dataset.efsa.file.path,
props=dataset.efsa.properties,
version=efsa_version))
#
# Also reading the ProcStudies Evaluation; using pandas directly.
# OK here, because it comes from the same file, although not preferred.
efsa_procstudies = pd.read_excel(dataset.efsa.file.path, sheet_name=1)
# ... and the References, go directly into the sheet.
dataset.references.sheet = pd.read_excel(dataset.efsa.file.path, sheet_name=3)
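The manual load above is plain pandas; as a standalone sketch (sheet indices as in the script, local file name assumed from the download URL):

import pandas as pd

path = 'Input/EU_Processing_Factors_db_P.xlsx.xlsx'  # assumed local copy
# The version string sits in the first cell of the third sheet (index 2).
version = pd.read_excel(path, sheet_name=2, nrows=1, header=None).iloc[0, 0]
# The actual table starts a few rows further down, hence header=4.
efsa = pd.read_excel(path, sheet_name=2, header=4)
procstudies = pd.read_excel(path, sheet_name=1)
references = pd.read_excel(path, sheet_name=3)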
#############################################################################
# Phase 2. Processing the data.
# Let's first attack the efsa sheet, abbreviate to make life easier
efsa = dataset.efsa.sheet
# First let's copy the columns which we want in the output unaltered so far
efsa.mcra.copycolumn({
'Matrix Code': 'idFoodUnProcessed',
'Raw Primary Commodity': 'FoodUnprocessedName',
'KeyFacets Code': 'idProcessingType',
'KeyFacets Interpreted': 'ProcessingName',
'Matrix FoodEx2 Code': 'Matrix FoodEx2 Code',
'Matrix Code Interpreted': 'FoodProcessedName',
'ParamCode Active Substance': 'idSubstance',
'ParamName Active Substance': 'SubstanceName',
'Median PF': 'Nominal'
})
#
# Then let's add columns which will be empty,
# so that we can create a proper output file
efsa.mcra.addcolumn({'Upper',
'NominalUncertaintyUpper',
'UpperUncertaintyUpper'})
# Combine with references
efsa_procstudies = efsa_procstudies.astype('str')
refs = efsa_procstudies.groupby(
['Matrix FoodEx2 Code', 'Study Reference']
).size().reset_index().sort_values(by=['Study Reference'])
refs = refs[['Matrix FoodEx2 Code', 'Study Reference']]
refs = refs.groupby(['Matrix FoodEx2 Code']).agg(
lambda column: ", ".join(column))
efsa = efsa.merge(
# Left join with the study references,
refs,
left_on='Matrix FoodEx2 Code', right_on='Matrix FoodEx2 Code',
how='left').assign()
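The reference lookup built in this last step can be tried on a toy frame; a minimal sketch of the groupby/agg/merge chain (illustrative values only):

import pandas as pd

efsa_procstudies = pd.DataFrame({
    'Matrix FoodEx2 Code': ['A01', 'A01', 'B02'],
    'Study Reference': ['Smith 2001', 'Jones 2005', 'Smith 2001']})
refs = efsa_procstudies.astype('str').groupby(
    ['Matrix FoodEx2 Code', 'Study Reference']
    ).size().reset_index().sort_values(by=['Study Reference'])
refs = refs[['Matrix FoodEx2 Code', 'Study Reference']]
# One row per FoodEx2 code, with all study references joined into one string.
refs = refs.groupby(['Matrix FoodEx2 Code']).agg(
    lambda column: ", ".join(column)).reset_index()
efsa = pd.DataFrame({'Matrix FoodEx2 Code': ['A01', 'B02', 'C03']})
efsa = efsa.merge(refs, on='Matrix FoodEx2 Code', how='left')
print(efsa)
# A01 -> "Jones 2005, Smith 2001", B02 -> "Smith 2001", C03 -> NaN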