Commit 5da6afe7 authored by Kruisselbrink, Johannes

Add alternative simple script to generate an MCRA processing factors table using the EFSA codes (without any translations)
parent 85b930a8
#!/usr/bin/python
#############################################################################
# Phase 0. Initialization
# Parse the arguments and read the input files.
#
import sys
import mcra
import pandas as pd
from datetime import datetime
import textwrap
import os
# Small utility to turn a piece of text into a Markdown hyperlink :-)
def print_as_link(text):
    return '[{text}]({text})'.format(text=text)
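# For example, print_as_link('Input/efsa.xlsx') returns
# '[Input/efsa.xlsx](Input/efsa.xlsx)' (illustrative path).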
# These are the files we work with
# Create list
dataset = mcra.DataSet(
    opening='(c) ' + datetime.now().strftime('%Y')
            + ' Biometris, Wageningen University and Research.',
    description='Creates an MCRA dataset from the Processing Factors database on EFSA Zenodo.',
    epilog='For example: use %(prog)s -v -x for a verbose example.')
#
# URL source file
efsa_url = 'https://zenodo.org/record/1488653/files/' \
    + 'EU_Processing_Factors_db_P.xlsx.xlsx?download=1'
#
# The input files
dataset.add(
    name='efsa',
    short_argument='-e',
    help='The EFSA Zenodo Excel sheet (.xlsx); either file or URL.',
    checksum='f816bf3928431d54f9d15fb134cc9106',
    default_name=efsa_url,
    default_dir='Input',
    direction='Input',
    autoload=False)
#
# The output files
dataset.add(
    name='processing_factor',
    short_argument='-o',
    help='The (output) processing factor file - '
         + 'format: csv (Comma Separated).',
    default_name='ProcessingFactors.zip',
    # default_name='ProcessingFactors.csv',
    default_dir='Output')
#
dataset.add(
    name='references',
    default_name='References.csv',
    default_dir='Output')
#
#############################################################################
dataset.init()
# Manually load the EFSA sheet, because the data is in a non-trivial place
efsa_sheet = 2
# The version string is stored in the very first cell of that sheet
efsa_version = pd.read_excel(
    dataset.efsa.file.path, sheet_name=efsa_sheet,
    nrows=1, header=None).iloc[0, 0]
dataset.efsa.load(sheet_name=efsa_sheet, header=4)
dataset.verbose(1, 'Input file : {file}; {version}; {props}'.format(
    file=dataset.efsa.file.path,
    props=dataset.efsa.properties,
    version=efsa_version))
#
# Also reading the ProcStudies Evaluation
efsa_procstudies = pd.read_excel(dataset.efsa.file.path, sheet_name=1)
# ... and the References
dataset.references.sheet = pd.read_excel(dataset.efsa.file.path, sheet_name=3)
#############################################################################
# Phase 2. Processing the data.
# Try to think SQL-wise or vector-wise about the data,
# but object-wise about the code.
# We use the input Excel sheet as an SQL table, and supplement it
# with additional columns until we have all the necessary data.
#
#
# Here we'll left join the study references onto the original sheet,
# so that we have all data in one single dataframe (table).
efsa_combined = dataset.efsa.sheet
# First let's copy the columns which we want in the output unaltered so far
efsa_combined.mcra.copycolumn({
    'Matrix Code': 'idFoodUnProcessed',
    'Raw Primary Commodity': 'FoodUnprocessedName',
    'KeyFacets Code': 'idProcessingType',
    'KeyFacets Interpreted': 'ProcessingName',
    'Matrix FoodEx2 Code': 'Matrix FoodEx2 Code',
    'Matrix Code Interpreted': 'FoodProcessedName',
    'ParamCode Active Substance': 'idSubstance',
    'ParamName Active Substance': 'SubstanceName',
    'Median PF': 'Nominal'
})
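# copycolumn comes from the project's custom mcra pandas accessor; assuming it
# simply copies each source column to the new name, a plain-pandas equivalent
# would be roughly:
#   efsa_combined['idFoodUnProcessed'] = efsa_combined['Matrix Code']
#   efsa_combined['FoodUnprocessedName'] = efsa_combined['Raw Primary Commodity']
#   ... and so on for the other pairs (sketch only; the real implementation may differ).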
#
# Then let's add columns which will be empty,
# so that we can create a proper output file
efsa_combined.mcra.addcolumn({'Upper',
                              'NominalUncertaintyUpper',
                              'UpperUncertaintyUpper'})
# We also have to add the references to the file.
efsa_procstudies = efsa_procstudies.astype('str')
refs = efsa_procstudies.groupby(
    ['Matrix FoodEx2 Code', 'Study Reference']
).size().reset_index().sort_values(by=['Study Reference'])
refs = refs[['Matrix FoodEx2 Code', 'Study Reference']]
refs = refs.groupby(['Matrix FoodEx2 Code']).agg(
    lambda column: ", ".join(column))
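# After these two groupby's, refs holds one 'Study Reference' string per
# 'Matrix FoodEx2 Code', joining the distinct study references for that code
# with ", " (e.g. something like 'Ref A, Ref B'; values are illustrative).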
efsa_combined = efsa_combined.merge(
    # Left join with the study references,
    refs,
    left_on='Matrix FoodEx2 Code', right_on='Matrix FoodEx2 Code',
    how='left')
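# The left join keeps every row of the EFSA sheet and adds the joined
# 'Study Reference' column; matrix codes without processing studies get NaN.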
#############################################################################
# Phase 3. Exporting the data.
# Seems obvious what to do here.
#
header = [
'idFoodUnProcessed', 'FoodUnprocessedName',
'idProcessingType', 'ProcessingName',
'idSubstance', 'SubstanceName',
'FoodProcessedName',
'Nominal', 'Upper',
'NominalUncertaintyUpper', 'UpperUncertaintyUpper',
'Study Reference'
]
# Keep only rows that actually have a processing type ('-' means unprocessed)
# and restrict the output to the columns listed above.
dataset.processing_factor.sheet = efsa_combined.loc[
    efsa_combined['idProcessingType'] != '-', header]
#
# Writing references file
dataset.references.close()
#############################################################################
# Phase 4. Report about the data.
# We now have a table with all relevant data.
# Here we'll gather some statistics about the table.
# Let's create the report as we go along, to avoid unnecessary variables.
# Rows for which no processing type (KeyFacets Code) could be determined:
mismatch_table = efsa_combined.loc[efsa_combined['idProcessingType'].isna()]
dataset.processing_factor.report = r'''
CONVERSION REPORT FOR EFSA system FoodEx 2 EXCEL SHEET
------------------------------------------------------
* Script: Convert-Simple.py
* Arguments: '''
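# Each command-line argument is added to the report one indentation level deep;
# textwrap.indent prefixes every non-empty line of the text with mcra.PY_INDENT.
# (Presumably the mcra report attribute accumulates the assigned fragments.)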
for arg in sys.argv:
    dataset.processing_factor.report = textwrap.indent(arg, mcra.PY_INDENT)
dataset.processing_factor.report = r'''
* Date: '''+datetime.now().strftime('%H:%M:%S, %d %b %Y')+r'''
* Files:
'''
for data in dataset:
    if data.direction == 'Input':
        dataset.processing_factor.report = textwrap.indent(
            data.report, mcra.PY_INDENT)
for datasetname in dataset.list:
    # Bit of a hack, figure out later how this can be properly done.
    if getattr(dataset, datasetname).direction == 'Output' \
            and datasetname != 'report':
        dataset.processing_factor.report = textwrap.indent(
            getattr(dataset, datasetname).report, mcra.PY_INDENT)
dataset.processing_factor.report = r'''
EFSA Excel input details
========================
* Excel input: '''+print_as_link(dataset.efsa.file.path)+r'''
* '''+efsa_version+r'''
* '''+dataset.efsa.properties+r'''
* Modified: '''+dataset.efsa.file.modified+r'''
'''
#
# Writing output file
dataset.processing_factor.close()
dataset.close()