From 5da6afe7e6804e4abb929e15c512bd41f1050840 Mon Sep 17 00:00:00 2001
From: Johannes Kruisselbrink <johannes.kruisselbrink@wur.nl>
Date: Wed, 7 Jul 2021 10:15:43 +0200
Subject: [PATCH] Add alternative simple script to generate an MCRA processing
 factors file using the EFSA codes (without any translations)

---
 .../Convert-Simple.py                         | 193 ++++++++++++++++++
 1 file changed, 193 insertions(+)
 create mode 100644 Convert-EUProcessingFactorsDB/Convert-Simple.py

diff --git a/Convert-EUProcessingFactorsDB/Convert-Simple.py b/Convert-EUProcessingFactorsDB/Convert-Simple.py
new file mode 100644
index 0000000..0a6d3a3
--- /dev/null
+++ b/Convert-EUProcessingFactorsDB/Convert-Simple.py
@@ -0,0 +1,193 @@
+#!/usr/bin/python
+#############################################################################
+# Phase 0. Initialization
+# Parse the command line arguments and register the input and output files.
+#
+import sys
+import mcra
+import pandas as pd
+from datetime import datetime
+import textwrap
+import os
+
+# Small utility to format a file path or URL as a Markdown hyperlink
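+# e.g. print_as_link('Input/x.xlsx') -> '[Input/x.xlsx](Input/x.xlsx)'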
+def print_as_link(text):
+    return '[{text}]({text})'.format(text=text)
+
+# These are the files we work with; collect them in a single DataSet
+dataset = mcra.DataSet(
+    opening='(c) ' + datetime.now().strftime('%Y')
+            + ' Biometris, Wageningen University and Research.',
+    description='Creates an MCRA dataset from the Processing Factors '
+                'database on EFSA Zenodo.',
+    epilog='For example: use %(prog)s -v -x for a verbose run.')
+
+#
+# URL of the EFSA source file on Zenodo
+efsa_url = 'https://zenodo.org/record/1488653/files/' \
+        + 'EU_Processing_Factors_db_P.xlsx.xlsx?download=1'
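+# (the doubled '.xlsx.xlsx' extension is part of the file name as published)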
+
+#
+# The input files
+dataset.add(
+    name='efsa',
+    short_argument='-e',
+    help='The EFSA Zenodo Excel sheet (.xlsx); either a file or a URL.',
+    checksum='f816bf3928431d54f9d15fb134cc9106',
+    default_name=efsa_url,
+    default_dir='Input',
+    direction='Input',
+    autoload=False)
+
+#
+# The output files
+dataset.add(
+    name='processing_factor',
+    short_argument='-o',
+    help='The (output) processing factor file - '
+         + 'format: csv (Comma Separated).',
+    default_name='ProcessingFactors.zip',
+    # default_name='ProcessingFactors.csv',
+    default_dir='Output')
+#
+dataset.add(
+    name='references',
+    default_name='References.csv',
+    default_dir='Output')
+#
+
+#############################################################################
+# Phase 1. Reading the data.
+# Initialize the dataset (command line parsing, file retrieval) and load
+# the sheets we need from the EFSA workbook.
+#
+dataset.init()
+
+# Manually load the EFSA sheet, because the data is in a non-trivial place
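+# Zero-based sheet indices used below: sheet 1 holds the ProcStudies
+# evaluation, sheet 2 the processing factors (version string in the first
+# cell, column headers on row 4), and sheet 3 the references.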
+efsa_sheet = 2
+efsa_version = pd.read_excel(
+    dataset.efsa.file.path, sheet_name=efsa_sheet,
+    nrows=1, header=None).iloc[0, 0]
+
+dataset.efsa.load(sheet_name=efsa_sheet, header=4)
+dataset.verbose(1, 'Input file : {file}; {version}; {props}'.format(
+    file=dataset.efsa.file.path,
+    props=dataset.efsa.properties,
+    version=efsa_version))
+#
+# Also reading the ProcStudies Evaluation
+efsa_procstudies = pd.read_excel(dataset.efsa.file.path, sheet_name=1)
+# ... and the References
+dataset.references.sheet = pd.read_excel(dataset.efsa.file.path, sheet_name=3)
+
+#############################################################################
+# Phase 2. Processing the data.
+# Try to think SQL-wise or vector-wise about the data,
+# but object-wise about the code.
+# We use the input Excel sheet as an SQL table and supplement it
+# with additional columns until it holds all the necessary data.
+#
+# Here we left join the aggregated study references onto the main sheet,
+# so that all the data ends up in one single dataframe (table).
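+#
+# Roughly the SQL equivalent of the steps below (illustration only; the
+# column list is abbreviated):
+#
+#   SELECT e.*, r."Study Reference"
+#   FROM efsa_combined AS e
+#   LEFT JOIN refs AS r USING ("Matrix FoodEx2 Code");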
+
+efsa_combined = dataset.efsa.sheet
+
+# First, copy the columns that go into the output unaltered
+efsa_combined.mcra.copycolumn({
+    'Matrix Code': 'idFoodUnProcessed',
+    'Raw Primary Commodity': 'FoodUnprocessedName',
+    'KeyFacets Code': 'idProcessingType',
+    'KeyFacets Interpreted': 'ProcessingName',
+    'Matrix FoodEx2 Code': 'Matrix FoodEx2 Code',
+    'Matrix Code Interpreted': 'FoodProcessedName',
+    'ParamCode Active Substance': 'idSubstance',
+    'ParamName Active Substance': 'SubstanceName',
+    'Median PF': 'Nominal'
+})
+#
+# Then add the columns that remain empty,
+# so that we can create a properly formed output file
+efsa_combined.mcra.addcolumn({'Upper',
+                              'NominalUncertaintyUpper',
+                              'UpperUncertaintyUpper'})
+
+# We also have to add the study references to the output file.
+efsa_procstudies = efsa_procstudies.astype('str')
+refs = efsa_procstudies.groupby(
+    ['Matrix FoodEx2 Code', 'Study Reference']
+    ).size().reset_index().sort_values(by=['Study Reference'])
+refs = refs[['Matrix FoodEx2 Code', 'Study Reference']]
+refs = refs.groupby(['Matrix FoodEx2 Code']).agg(
+    lambda column: ", ".join(column))
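+# After this aggregation, refs is indexed by 'Matrix FoodEx2 Code' and holds
+# one comma separated 'Study Reference' string per food code.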
+
+efsa_combined = efsa_combined.merge(
+        # Left join with the aggregated study references
+        refs,
+        left_on='Matrix FoodEx2 Code', right_on='Matrix FoodEx2 Code',
+        how='left')
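+# Optional sanity check (assumption: the left join should neither drop nor
+# duplicate rows; it only adds the aggregated 'Study Reference' column):
+# assert len(efsa_combined) == len(dataset.efsa.sheet)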
+
+#############################################################################
+# Phase 3. Exporting the data.
+# Seems obvious what to do here.
+#
+header = [
+    'idFoodUnProcessed', 'FoodUnprocessedName',
+    'idProcessingType', 'ProcessingName',
+    'idSubstance', 'SubstanceName',
+    'FoodProcessedName', 
+    'Nominal', 'Upper',
+    'NominalUncertaintyUpper', 'UpperUncertaintyUpper',
+    'Study Reference'
+]
+
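+# Keep only the rows that have an actual processing facet (a KeyFacets code
+# of '-' means no processing) and restrict the output to the columns in the
+# header list above.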
+dataset.processing_factor.sheet = efsa_combined.loc[
+    efsa_combined['idProcessingType'] != '-', header]
+
+#
+# Writing references file
+dataset.references.close()
+
+#############################################################################
+# Phase 4. Report about the data.
+# We now have a table with all the relevant data.
+# Here we'll gather some statistics about the table.
+# Let's create the report as we go along, to avoid unnecessary variables.
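+# Rows without an interpreted processing type; currently only collected here,
+# not yet used in the report below.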
+mismatch_table = efsa_combined.loc[(efsa_combined['idProcessingType'].isna())]
+
+dataset.processing_factor.report = r'''
+CONVERSION REPORT FOR EFSA system FoodEx 2 EXCEL SHEET
+------------------------------------------------------
+
+* Script: Convert-Simple.py
+* Arguments: '''
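+# Note: the mcra report attribute is assumed to append the assigned text to
+# the report rather than to overwrite it (hence the repeated assignments).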
+for arg in sys.argv:
+    dataset.processing_factor.report = textwrap.indent(arg, mcra.PY_INDENT)
+dataset.processing_factor.report = r'''
+* Date: '''+datetime.now().strftime('%H:%M:%S, %d %b %Y')+r'''
+* Files:
+'''
+for data in dataset:
+    if data.direction == 'Input':
+        dataset.processing_factor.report = textwrap.indent(data.report, mcra.PY_INDENT)
+
+for datasetname in dataset.list:
+    # Bit of a hack, figure out later how this can be properly done.
+    if getattr(dataset, datasetname).direction == 'Output' \
+       and datasetname != 'report':
+        dataset.processing_factor.report = textwrap.indent(
+            getattr(dataset, datasetname).report, mcra.PY_INDENT)
+
+dataset.processing_factor.report = r'''
+
+EFSA Excel input details
+========================
+
+* Excel input: '''+print_as_link(dataset.efsa.file.path)+r'''
+    * '''+efsa_version+r'''
+    * '''+dataset.efsa.properties+r'''
+    * Modified: '''+dataset.efsa.file.modified+r'''
+
+'''
+
+#
+# Writing output file
+dataset.processing_factor.close()
+
+dataset.close()
-- 
GitLab