#!/usr/bin/env python3
"""Create an MCRA processing-factors dataset from the EFSA Zenodo database.

Reads the "EU Processing Factors" Excel workbook published on EFSA's
Zenodo record, copies/renames the columns MCRA expects, attaches the
study references, and writes a processing-factors table plus a
references table.  Uses the EFSA codes directly, without any translations.

Provenance: reconstructed from patch 5da6afe7 (Johannes Kruisselbrink,
2021-07-07, "Add alternative simple script to generate an MCRA processing
factors using the EFSA codes (without any translations)").
"""
#############################################################################
# Phase 0. Initialization
# Doing stuff like parsing arguments, and reading the files.
#
import sys
import mcra
import pandas as pd
from datetime import datetime
import textwrap
import os


def print_as_link(text):
    """Return *text* formatted as a markdown hyperlink pointing at itself."""
    return f'[{text}]({text})'


# These are the files we work with.
dataset = mcra.DataSet(
    opening='(c) ' + datetime.now().strftime('%Y')
    + ' Biometris, Wageningen University and Research.',
    description='Creates an MCRA dataset from the Processing '
                'Factors database on EFSA Zenodo.',
    epilog='For example: use %(prog)s -v -x for a verbose example.')

#
# URL of the source workbook on the EFSA Zenodo record.
efsa_url = 'https://zenodo.org/record/1488653/files/' \
    + 'EU_Processing_Factors_db_P.xlsx.xlsx?download=1'

#
# The input files
dataset.add(
    name='efsa',
    short_argument='-e',
    help='The EFSA Zenodo Excel sheet (.xlsx); either file or URL. ',
    checksum='f816bf3928431d54f9d15fb134cc9106',
    default_name=efsa_url,
    default_dir='Input',
    direction='Input',
    # Loaded manually below: the data sits in a non-default sheet with a
    # multi-row header, which a plain autoload cannot handle.
    autoload=False)

#
# The output files
dataset.add(
    name='processing_factor',
    short_argument='-o',
    help='The (output) processing factor file - '
         + 'format: csv (Comma Separated).',
    default_name='ProcessingFactors.zip',
    # default_name='ProcessingFactors.csv',
    default_dir='Output')
#
dataset.add(
    name='references',
    default_name='References.csv',
    default_dir='Output')
#

#############################################################################

dataset.init()

# Manually load the EFSA sheet, because the data is in a non-trivial place:
# worksheet index 2, with a version banner in the very first cell and the
# real column header on the fifth row (header=4).
efsa_sheet = 2
efsa_version = pd.read_excel(
    dataset.efsa.file.path, sheet_name=efsa_sheet,
    nrows=1, header=None).iloc[0, 0]

dataset.efsa.load(sheet_name=efsa_sheet, header=4)
dataset.verbose(1, 'Input file : {file}; {version}; {props}'.format(
    file=dataset.efsa.file.path,
    props=dataset.efsa.properties,
    version=efsa_version))
#
# Also read the ProcStudies Evaluation (worksheet 1) ...
efsa_procstudies = pd.read_excel(dataset.efsa.file.path, sheet_name=1)
# ... and the References (worksheet 3).
dataset.references.sheet = pd.read_excel(dataset.efsa.file.path, sheet_name=3)

#############################################################################
# Phase 2. Processing the data.
# Try to think SQL-wise or vector-wise about the data,
# but object-wise about the code.
# We use the input Excel sheet as an SQL table, and supplement it
# until we have additional columns with all necessary data.
#
# Here we'll left join with both tables to supplement the original sheet.
# Then we have all data in one single dataframe (table).

efsa_combined = dataset.efsa.sheet

# First copy the columns which we want in the output unaltered so far
# (EFSA column name -> MCRA column name).
efsa_combined.mcra.copycolumn({
    'Matrix Code': 'idFoodUnProcessed',
    'Raw Primary Commodity': 'FoodUnprocessedName',
    'KeyFacets Code': 'idProcessingType',
    'KeyFacets Interpreted': 'ProcessingName',
    'Matrix FoodEx2 Code': 'Matrix FoodEx2 Code',
    'Matrix Code Interpreted': 'FoodProcessedName',
    'ParamCode Active Substance': 'idSubstance',
    'ParamName Active Substance': 'SubstanceName',
    'Median PF': 'Nominal'
})
#
# Then add columns which will be empty,
# so as to be able to create a proper output file.
efsa_combined.mcra.addcolumn({'Upper',
                              'NominalUncertaintyUpper',
                              'UpperUncertaintyUpper'})

# We also have to add the references to the file: collect, per FoodEx2
# matrix code, the sorted, comma-separated list of study references.
efsa_procstudies = efsa_procstudies.astype('str')
refs = efsa_procstudies.groupby(
    ['Matrix FoodEx2 Code', 'Study Reference']
    ).size().reset_index().sort_values(by=['Study Reference'])
refs = refs[['Matrix FoodEx2 Code', 'Study Reference']]
refs = refs.groupby(['Matrix FoodEx2 Code']).agg(
    lambda column: ", ".join(column))

# Left join the references onto the combined sheet.
# NOTE(review): the trailing no-argument .assign() only produces a copy of
# the merge result; presumably a leftover — confirm it can be dropped.
efsa_combined = efsa_combined.merge(
    refs,
    left_on='Matrix FoodEx2 Code', right_on='Matrix FoodEx2 Code',
    how='left').assign()

#############################################################################
# Phase 3. Exporting the data.
# Seems obvious what to do here.
#
# NOTE(review): `header` is defined but never applied below; it looks like
# it was meant to select/order the output columns (e.g.
# efsa_combined[header]) — confirm against mcra's sheet writer.
header = [
    'idFoodUnProcessed', 'FoodUnprocessedName',
    'idProcessingType', 'ProcessingName',
    'idSubstance', 'SubstanceName',
    'FoodProcessedName',
    'Nominal', 'Upper',
    'NominalUncertaintyUpper', 'UpperUncertaintyUpper',
    'Study Reference'
]

# Keep only rows that actually have a processing type ("-" means none).
dataset.processing_factor.sheet = \
    efsa_combined[efsa_combined["idProcessingType"] != "-"]

#
# Writing references file
dataset.references.close()

#############################################################################
# Phase 4. Report about the data.
# We now have a table with all relevant data.
# Here we'll gather some statistics about the table.
# Let's create the report as we go along, to avoid unnecessary variables.
#
# NOTE(review): `mismatch_table` is never used afterwards; presumably meant
# to be reported — confirm or remove.
mismatch_table = efsa_combined.loc[(efsa_combined['idProcessingType'].isna())]

# NOTE(review): `report` is assigned repeatedly with plain `=` below; this
# only makes sense if mcra exposes `report` as an accumulating property,
# otherwise every assignment overwrites the previous text — confirm in mcra.
dataset.processing_factor.report = r'''
CONVERSION REPORT FOR EFSA system FoodEx 2 EXCEL SHEET
------------------------------------------------------

* Script: Convert-Simple.py
* Arguments: '''
for arg in sys.argv:
    dataset.processing_factor.report = textwrap.indent(arg, mcra.PY_INDENT)
dataset.processing_factor.report = r'''
* Date: ''' + datetime.now().strftime('%H:%M:%S, %d %b %Y') + r'''
* Files:
'''
for data in dataset:
    if data.direction == 'Input':
        dataset.processing_factor.report = textwrap.indent(
            data.report, mcra.PY_INDENT)

for datasetname in dataset.list:
    # Bit of a hack, figure out later how this can be properly done.
    if getattr(dataset, datasetname).direction == 'Output' \
            and datasetname != 'report':
        dataset.processing_factor.report = textwrap.indent(
            getattr(dataset, datasetname).report, mcra.PY_INDENT)

dataset.processing_factor.report = r'''

EFSA Excel input details
========================

* Excel input: ''' + print_as_link(dataset.efsa.file.path) + r'''
  * ''' + efsa_version + r'''
  * ''' + dataset.efsa.properties + r'''
  * Modified: ''' + dataset.efsa.file.modified + r'''

'''

#
# Writing output file
dataset.processing_factor.close()

dataset.close()