Commit 301b41bb authored by Kruisselbrink, Johannes's avatar Kruisselbrink, Johannes
Browse files

Update simple compilation script EU processing factors db; include substance...

Update simple compilation script EU processing factors db; include substance conversion and cleanup code
parent 71bcebff
#!/usr/bin/python
__version_info__ = ('1', '0', '0')
__version_info__ = ('1', '0', '1')
__version__ = '.'.join(__version_info__)
#############################################################################
......@@ -38,8 +38,22 @@ dataset.add(
direction='Input',
autoload=False) # No autoload, because sheet is complex
#
# Optional substances translation file
dataset.add(
name='substance_translation',
short_argument='-s',
help='The (input) substance translation file - '
+ 'format: csv (Comma Seperated), file not required.',
default_name='SubstanceTranslations.csv',
necessary=False,
default_dir='Input',
direction='Input')
#
# The output files
# The (main) processing factors table
dataset.add(
name='processing_factor',
short_argument='-p',
......@@ -47,7 +61,8 @@ dataset.add(
+ 'format: csv (Comma Seperated).',
default_name='ProcessingFactors.csv',
default_dir='Output')
#
# References
dataset.add(
name='references',
short_argument='-f',
......@@ -55,6 +70,7 @@ dataset.add(
+ 'format: csv (Comma Seperated).',
default_name='References.csv',
default_dir='Output')
#
#############################################################################
# Phase 1. Load data
......@@ -91,12 +107,30 @@ efsa.mcra.copycolumn({
'Raw Primary Commodity': 'FoodUnprocessedName',
'KeyFacets Code': 'idProcessingType',
'KeyFacets Interpreted': 'ProcessingName',
'Matrix FoodEx2 Code': 'Matrix FoodEx2 Code',
'Matrix FoodEx2 Code': 'idFoodProcessed',
'Matrix Code Interpreted': 'FoodProcessedName',
'ParamCode Active Substance': 'idSubstance',
'ParamName Active Substance': 'SubstanceName',
'Median PF': 'Nominal'
})
# See whether we can do something with Substance translation
if dataset.substance_translation.sheet is None:
# No substance translation? Just copy column
efsa.mcra.copycolumn(
{'ParamCode Active Substance': 'idSubstance'})
else:
# Get the from-code
dataset.substance_translation.sheet['FromCode']
# Do a left join
efsa = efsa.merge(
# Left join with processing type sheet,
dataset.substance_translation.sheet,
left_on='idSubstance', right_on='FromCode',
how='left').assign()
# Copy CASNumber to idSubstance column
efsa.mcra.copycolumn({'ToCode': 'idSubstance'})
#
# Then let's add columns which will be empty
# so to be able to create a proper output file
......@@ -120,41 +154,21 @@ efsa = efsa.merge(
how='left').assign()
# Done; dumping into proper sheet
dataset.processing_factor.sheet = efsa[efsa["idProcessingType"] != "-"]
# Before we can use data from the output files (e.g. hash), we first save it
# dataset.save()
header = [
'idFoodProcessed', 'Matrix Code Interpreted',
'idFoodUnProcessed', 'Matrix Code', 'RPC Code', 'FoodUnprocessedName',
'idProcessingType', 'KeyFacets Code', 'KeyFacets Interpreted',
'idSubstance', 'ParamCode Active Substance', 'SubstanceName',
'Nominal', 'Upper', 'NominalUncertaintyUpper',
'UpperUncertaintyUpper', 'KeyFacets Interpreted',
'Study Reference']
dataset.processing_factor.sheet = efsa[
(efsa["idProcessingType"] != "-") &
efsa['idSubstance'].notna()
][header]
#############################################################################
# Phase 3. Report about the data.
report = r'''CONVERSION REPORT FOR EFSA system FoodEx 2 EXCEL SHEET
------------------------------------------------------
* Script: '''+dataset.scriptname+r'''
* Arguments: '''+dataset.runarguments+r'''
* Date: '''+dataset.runtime+r'''
* User: '''+dataset.runuser+r'''
* Files:
'''
for data in dataset:
if data.direction == 'Output':
report += textwrap.indent(data.get_report(), PY_INDENT)
report += r'''
EFSA Excel input details
========================
'''
for data in dataset:
if data.direction == 'Input':
report += data.get_report()
#
# Here's a self generated report
# dataset.report = report
# dataset.close()
# Here's an auto generated report
# Uncomment lines 121 and 149, 150 and comment line below
# to get original report back.
dataset.close(file_report=True)
#!/usr/bin/python
#############################################################################
# Phase 0. Initialization
# Doing stuff like parsing arguments, and reading the files.
#
import sys
import mcra
import pandas as pd
from datetime import datetime
import textwrap
import os
# Tiny helper: render a path/URL as a Markdown link that points at itself
def print_as_link(text):
    """Return *text* wrapped as a self-referencing Markdown hyperlink."""
    return '[{0}]({0})'.format(text)
# These are the files we work with.
# Build the dataset container and register each input/output file;
# mcra.DataSet also derives the command-line interface from these entries.
dataset = mcra.DataSet(
    opening='(c) ' + datetime.now().strftime('%Y')
    + ' Biometris, Wageningen University and Research.',
    # Typo fixes in user-facing text: "Zendono" -> "Zenodo"
    # (the source URL below is zenodo.org).
    description='Creates an MCRA dataset from the Processing Factors '
    + 'database on EFSA Zenodo.',
    epilog='For example: use %(prog)s -v -x for a verbose example.')
#
# URL of the source Excel file on Zenodo
efsa_url = 'https://zenodo.org/record/1488653/files/' \
    + 'EU_Processing_Factors_db_P.xlsx.xlsx?download=1'
#
# The input files
dataset.add(
    name='efsa',
    short_argument='-e',
    help='The EFSA Zenodo Excel sheet (.xlsx); either file or URL. ',
    checksum='f816bf3928431d54f9d15fb134cc9106',
    default_name=efsa_url,
    default_dir='Input',
    direction='Input',
    # No autoload: the sheet is loaded manually later because the data
    # sits in a non-trivial place inside the workbook.
    autoload=False)
#
# The output files
dataset.add(
    name='processing_factor',
    short_argument='-o',
    help='The (output) processing factor file - '
    + 'format: csv (Comma Separated).',
    default_name='ProcessingFactors.zip',
    # default_name='ProcessingFactors.csv',
    default_dir='Output')
#
dataset.add(
    name='references',
    default_name='References.csv',
    default_dir='Output')
#
#############################################################################
# Phase 1. Load data.
dataset.init()
# Manually load the EFSA sheet, because the data is in a non-trivial place:
# it lives on the third worksheet (index 2), with the column headers on
# the fifth row (header=4).
efsa_sheet = 2
# Cell A1 of that worksheet carries a free-text version string for the
# database; read just that one cell.
efsa_version = pd.read_excel(
    dataset.efsa.file.path, sheet_name=efsa_sheet,
    nrows=1, header=None).iloc[0, 0]
dataset.efsa.load(sheet_name=efsa_sheet, header=4)
dataset.verbose(1, 'Input file : {file}; {version}; {props}'.format(
    file=dataset.efsa.file.path,
    props=dataset.efsa.properties,
    version=efsa_version))
#
# Also reading the ProcStudies Evaluation (second worksheet, index 1)
efsa_procstudies = pd.read_excel(dataset.efsa.file.path, sheet_name=1)
# ... and the References (fourth worksheet, index 3), loaded straight
# into the references output dataset.
dataset.references.sheet = pd.read_excel(dataset.efsa.file.path, sheet_name=3)
#############################################################################
# Phase 2. Processing the data.
# Try to think SQL-wise or vector-wise about the data,
# but object-wise about the code.
# We use the input Excel sheet as an SQL table, and supplement it
# until we have additional columns with all necessary data.
#
#
# Here we'll left join with both tables to supplement the original sheet.
# Then we have all data in one single dataframe (table).
efsa_combined = dataset.efsa.sheet
# First let's copy the columns which we want in the output unaltered so far.
# Note: 'Matrix FoodEx2 Code' is an identity copy, so the column survives
# under its original name for the reference merge further below.
efsa_combined.mcra.copycolumn({
    'Matrix Code': 'idFoodUnProcessed',
    'Raw Primary Commodity': 'FoodUnprocessedName',
    'KeyFacets Code': 'idProcessingType',
    'KeyFacets Interpreted': 'ProcessingName',
    'Matrix FoodEx2 Code': 'Matrix FoodEx2 Code',
    'Matrix Code Interpreted': 'FoodProcessedName',
    'ParamCode Active Substance': 'idSubstance',
    'ParamName Active Substance': 'SubstanceName',
    'Median PF': 'Nominal'
})
#
# Then let's add columns which will be empty,
# so as to be able to create a proper output file.
efsa_combined.mcra.addcolumn({'Upper',
                              'NominalUncertaintyUpper',
                              'UpperUncertaintyUpper'})
# We also have to add the references to the file.
# Cast everything to str so the string join below cannot fail on
# mixed-type cells.
efsa_procstudies = efsa_procstudies.astype('str')
# Collect the unique (food code, study reference) pairs, ordered by
# study reference...
refs = efsa_procstudies.groupby(
    ['Matrix FoodEx2 Code', 'Study Reference']
).size().reset_index().sort_values(by=['Study Reference'])
refs = refs[['Matrix FoodEx2 Code', 'Study Reference']]
# ...then aggregate to one comma-separated 'Study Reference' string
# per 'Matrix FoodEx2 Code'.
refs = refs.groupby(['Matrix FoodEx2 Code']).agg(
    lambda column: ", ".join(column))
efsa_combined = efsa_combined.merge(
    # Left join with the per-food references built above
    refs,
    left_on='Matrix FoodEx2 Code', right_on='Matrix FoodEx2 Code',
    how='left').assign()
#############################################################################
# Phase 3. Exporting the data.
# Seems obvious what to do here.
#
# Columns (and their order) for the processing factors output file.
header = [
    'idFoodUnProcessed', 'FoodUnprocessedName',
    'idProcessingType', 'ProcessingName',
    'idSubstance', 'SubstanceName',
    'FoodProcessedName',
    'Nominal', 'Upper',
    'NominalUncertaintyUpper', 'UpperUncertaintyUpper',
    'Study Reference'
]
# Keep only rows that have a real processing type, and restrict the output
# to the declared header columns. (Previously `header` was built but never
# applied, so all intermediate working columns leaked into the output.)
dataset.processing_factor.sheet = efsa_combined[
    efsa_combined["idProcessingType"] != "-"][header]
#
# Writing references file
dataset.references.close()
#############################################################################
# Phase 4. Report about the data.
# We now have a table with all relevant data.
# Here we'll gather some statistics about the table.
# Let's create the report as we go along, to avoid unnecessary variables.
#
# Rows whose processing type could not be resolved (not used further below).
mismatch_table = efsa_combined.loc[(efsa_combined['idProcessingType'].isna())]
# NOTE(review): the report below is built through repeated plain
# assignments to `.report`. If `report` were an ordinary attribute each
# assignment would discard the previous content; presumably the mcra
# property setter appends -- TODO confirm, otherwise these should be `+=`.
dataset.processing_factor.report = r'''
CONVERSION REPORT FOR EFSA system FoodEx 2 EXCEL SHEET
------------------------------------------------------
* Script: Convert-Simple.py
* Arguments: '''
for arg in sys.argv:
    dataset.processing_factor.report = textwrap.indent(arg, mcra.PY_INDENT)
dataset.processing_factor.report=r'''
* Date: '''+datetime.now().strftime('%H:%M:%S, %d %b %Y')+r'''
* Files:
'''
# Per-file report sections: first the inputs...
for data in dataset:
    if data.direction == 'Input':
        dataset.processing_factor.report = textwrap.indent(data.report, mcra.PY_INDENT)
# ...then the outputs, addressed by name.
for datasetname in dataset.list:
    # Bit of a hack, figure out later how this can be properly done.
    if getattr(dataset, datasetname).direction == 'Output' \
            and datasetname != 'report':
        dataset.processing_factor.report = textwrap.indent(
            getattr(dataset, datasetname).report, mcra.PY_INDENT)
dataset.processing_factor.report = r'''
EFSA Excel input details
========================
* Excel input: '''+print_as_link(dataset.efsa.file.path)+r'''
* '''+efsa_version+r'''
* '''+dataset.efsa.properties+r'''
* Modified: '''+dataset.efsa.file.modified+r'''
'''
#
# Writing output file
dataset.processing_factor.close()
dataset.close()
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment