From ad32c5840dfe422d8ae9154215a22db5f5c5a8c2 Mon Sep 17 00:00:00 2001
From: Hans van den Heuvel <hans1.vandenheuvel@wur.nl>
Date: Thu, 26 Mar 2020 14:52:45 +0100
Subject: [PATCH] Added input option for FoodComposition translation.

---
 .../Convert-EUProcessingFactorsDB.py          | 60 +++++++++++++---
 Convert-EUProcessingFactorsDB/README.md       | 10 ++-
 Convert-EUProcessingFactorsDB/mcra.py         | 68 +++++++++++++------
 3 files changed, 107 insertions(+), 31 deletions(-)

diff --git a/Convert-EUProcessingFactorsDB/Convert-EUProcessingFactorsDB.py b/Convert-EUProcessingFactorsDB/Convert-EUProcessingFactorsDB.py
index e327ee6..7a45068 100644
--- a/Convert-EUProcessingFactorsDB/Convert-EUProcessingFactorsDB.py
+++ b/Convert-EUProcessingFactorsDB/Convert-EUProcessingFactorsDB.py
@@ -16,6 +16,7 @@ def print_as_link(text):
 # These are the files we work with
 # Create list
 dataset = mcra.DataSet(
+    opening='(c) 2020 Euromix, Biometris, WUR.',
     description='Converts the EFSA Zendono Excel sheet into an MCRA '
     + 'conforming format, using some external translation files.',
     epilog='For example: use %(prog)s -v -x for a verbose example.')
@@ -28,8 +29,7 @@ efsa_url = 'https://zenodo.org/record/1488653/files/' \
 dataset.add(
     name='efsa',
     short_argument='-e',
-    help='The EFSA Zendono Excel sheet (.xlsx); either file or URL. '
-         + '(default: %(default)s)',
+    help='The EFSA Zenodo Excel sheet (.xlsx); either file or URL.',
     checksum='f816bf3928431d54f9d15fb134cc9106',
     default_name=efsa_url,
     default_dir='Input',
@@ -39,8 +39,8 @@ dataset.add(
 dataset.add(
     name='processing_type',
     short_argument='-t',
-    help='The (input) processing type file - format: csv (Comma Seperated).'
-         + '(default: %(default)s)',
+    help='The (input) processing type file - '
+         + 'format: csv (Comma Separated).',
     default_name='ProcessingTypes.csv',
     default_dir='Input',
     direction='Input')
@@ -49,7 +49,7 @@ dataset.add(
     name='processing_translation',
     short_argument='-p',
     help='The (input) processing translation file - '
-         + 'format: csv (Comma Seperated). (default: %(default)s)',
+         + 'format: csv (Comma Separated).',
     default_name='ProcTypeTranslations.csv',
     default_dir='Input',
     direction='Input')
@@ -58,7 +58,7 @@ dataset.add(
     name='food_translation',
     short_argument='-f',
     help='The (input) food translation file - '
-         + 'format: csv (Comma Seperated). (default: %(default)s)',
+         + 'format: csv (Comma Separated).',
     default_name='FoodTranslations.csv',
     default_dir='Input',
     direction='Input')
@@ -67,18 +67,29 @@ dataset.add(
     name='substance_translation',
     short_argument='-s',
     help='The (input) substance translation file - '
-         + 'format: csv (Comma Seperated). (default: %(default)s)',
+         + 'format: tsv (Tab Separated), file not required.',
     default_name='SubstanceTranslations.tsv',
     necessary=False,
     default_dir='Input',
     direction='Input')
 #
+dataset.add(
+    name='food_composition',
+    short_argument='-g',
+    help='The (input) food composition file - '
+         + 'format: xlsx (Excel), file not required.',
+    default_name='FoodComposition.xlsx',
+    necessary=False,
+    default_dir='Input',
+    direction='Input',
+    autoload=False)
+#
 # The output files
 dataset.add(
     name='processing_factor',
     short_argument='-o',
     help='The (output) processing factor file - '
-         + 'format: csv (Comma Seperated). (default: %(default)s)',
+         + 'format: csv (Comma Separated).',
     default_name='ProcessingFactors.zip',
     # default_name='ProcessingFactors.csv',
     default_dir='Output')
@@ -110,6 +121,14 @@ dataset.verbose(1, 'Input file : {file}; {version}; {props}'.format(
     props=dataset.efsa.properties,
     version=efsa_version))
 #
+# Use this file only if it is called explicitly from the command line
+# and, of course, it has to exist; a bare -g is enough to trigger the default.
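+# For example: a bare "-g" loads the default FoodComposition.xlsx from the
+# input directory, "-g SomeFile.xlsx" loads that file instead, and without -g
+# this step is skipped. ("SomeFile.xlsx" is just an illustrative name.)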
+if dataset.args.food_composition_file is not None \
+        and dataset.food_composition.file.exist:
+    dataset.food_composition.load(sheet_name='FoodTranslation')
+    dataset.verbose(1, 'Input file : {file}; {props}'.format(
+        file=dataset.food_composition.file.path,
+        props=dataset.food_composition.properties))
 
 
 #############################################################################
@@ -206,6 +225,31 @@ efsa_combined.loc[
                          + '-' + efsa_combined['idProcessingType'].astype(str)
 #
 
+if dataset.food_composition.sheet is not None:
+    # We also have to do the food_composition translation.
+    # Keep only the rows whose idToFood is a P-code (processed commodity);
+    # also use a shorter name:
+    fcs = dataset.food_composition.sheet[
+        dataset.food_composition.sheet['idToFood'].str.startswith('P')]
+    # Split idFromFood on its last dash
+    fs = pd.DataFrame()
+    fs[['idFromFood-Left', 'idFromFood-Right']] = \
+        fcs['idFromFood'].str.rsplit('-', n=1, expand=True)
+    # A bit of a detour, but join the split columns back on
+    fcs = fcs.merge(fs, left_index=True, right_index=True)
+    # The new column is properly joined now
+    fcs['idToFood-PC'] = fcs.loc[:, ('idToFood', 'idFromFood-Right')].apply(
+        lambda x: '-'.join(x.dropna()), axis=1)
+    # Finally a left join to combine
+    efsa_combined = efsa_combined.merge(
+        # Left join with the food composition sheet,
+        fcs,
+        left_on='idFoodProcessed', right_on='idToFood-PC',
+        how='left')
+    efsa_combined.loc[
+        (efsa_combined['idToFood-PC'].notna()),
+        'idFoodProcessed'] = efsa_combined['idFromFood']
+    dataset.verbose(3, fcs)
 #############################################################################
 # Phase 3. Exporting the data.
 # Seems obvious what to do here.
diff --git a/Convert-EUProcessingFactorsDB/README.md b/Convert-EUProcessingFactorsDB/README.md
index 44c857a..6263274 100644
--- a/Convert-EUProcessingFactorsDB/README.md
+++ b/Convert-EUProcessingFactorsDB/README.md
@@ -39,8 +39,11 @@ These are the input and output files of the script. All names are defaults, and
   * A csv file with a summary (and counts) of *the remaining data* of the EU sheet, called [Mismatches.csv](Mismatches.csv).
 
 The following is happening in the script, essentially ([more details here](#detailed-workings))
-* The script wil try to match the first column (``FromFC``) of [ProcTypeTranslations.csv](ProcTypeTranslations.csv) to the column ``KeyFacets Code`` of the EU sheet. If a match is found, then the second column (``FCToProcType``) of [ProcTypeTranslations.csv](ProcTypeTranslations.csv) will become the ``idProcessingType`` of the output file [ProcessingFactors.csv](ProcessingFactors.csv) (contained within [ProcessingFactors.zip](ProcessingFactors.zip)).
-* Then the script will try to match the ``FromFX`` column of [FoodTranslations.csv](FoodTranslations.csv) with the column ``Matrix FoodEx2 Code`` from the EU sheet, *for all rows that didn't already match in the previous step*. If a match was found, then the value of ``FXToProcType`` will be copied to ``idProcessingType`` of the output file [ProcessingFactors.csv](ProcessingFactors.csv) (contained within [ProcessingFactors.zip](ProcessingFactors.zip)).
+* The script will try to match the first column (``FromFC``) of [ProcTypeTranslations.csv](ProcTypeTranslations.csv) to the column ``KeyFacets Code`` of the EU sheet. If a match is found, then the second column (``FCToProcType``) of [ProcTypeTranslations.csv](ProcTypeTranslations.csv) will become the field ``idProcessingType``.
+* Then the script will try to match both the ``FromFX`` and ``FXToRpc`` columns of [FoodTranslations.csv](FoodTranslations.csv) with the columns ``Matrix FoodEx2 Code`` and ``Matrix Code`` from the EU sheet, *for all rows that didn't already match in the previous step*. If a match is found, then the value of ``FXToProcType`` will be copied to ``idProcessingType``.
+* If no substance file was given, the field ``ParamCode Active Substance`` is simply copied to ``idSubstance``. But if a substance file was given, the dash is stripped from the ``CASNumber`` column of the substance file, and the column ``ParamCode Active Substance`` of the EFSA sheet is matched to ``code`` in the substances sheet. If a match is found, the modified (dashless) ``CASNumber`` is copied to ``idSubstance``.
+* If a food composition file was given, an additional translation is applied to the field ``idFoodProcessed`` of the output file [ProcessingFactors.csv](ProcessingFactors.csv) (contained within [ProcessingFactors.zip](ProcessingFactors.zip)). This table needs to have the layout of the MCRA FoodComposition table; a sketch of this translation follows below.
 * Finally the output file [ProcessingFactors.csv](ProcessingFactors.csv) (contained within [ProcessingFactors.zip](ProcessingFactors.zip)) will be written, together with some reports.
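+
+In pandas terms, the food composition translation is roughly the following (a minimal, self-contained sketch; the food codes below are made up for illustration and are not real FoodEx2 codes):
+
+```python
+import pandas as pd
+
+# Illustrative stand-ins for the FoodComposition sheet and the EFSA data;
+# the codes are invented, only the shape of the operation matters.
+fcs = pd.DataFrame({
+    'idFromFood': ['0110010-001'],      # source food code plus a suffix
+    'idToFood': ['P0110010']})          # P-code of the processed commodity
+efsa = pd.DataFrame({'idFoodProcessed': ['P0110010-001', 'X9999999-001']})
+
+# Keep only the P-coded target foods and split idFromFood on its last dash.
+fcs = fcs[fcs['idToFood'].str.startswith('P')].copy()
+fcs[['idFromFood-Left', 'idFromFood-Right']] = \
+    fcs['idFromFood'].str.rsplit('-', n=1, expand=True)
+# Rebuild the 'P-code + suffix' key that idFoodProcessed is matched against.
+fcs['idToFood-PC'] = fcs['idToFood'] + '-' + fcs['idFromFood-Right']
+
+# Left join; where a match is found, idFoodProcessed is translated back to
+# the original idFromFood.
+efsa = efsa.merge(fcs, left_on='idFoodProcessed', right_on='idToFood-PC',
+                  how='left')
+efsa.loc[efsa['idToFood-PC'].notna(), 'idFoodProcessed'] = efsa['idFromFood']
+print(efsa['idFoodProcessed'].tolist())   # ['0110010-001', 'X9999999-001']
+```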
 
 ## Prerequisites
@@ -50,10 +53,11 @@ In order to use the python script, the following libraries are necessary
   * [xlrd](https://pypi.org/project/xlrd/)
   * [tabulate](https://pypi.org/project/tabulate/)
   * [openpyxl](https://pypi.org/project/openpyxl/)
+  * [requests](https://pypi.org/project/requests/)
 
 Install all the libraries at once with
 ```
-pip install pandas xlrd tabulate openpyxl
+pip install pandas xlrd tabulate openpyxl requests
 ```
 
 ## Usage
diff --git a/Convert-EUProcessingFactorsDB/mcra.py b/Convert-EUProcessingFactorsDB/mcra.py
index 2a73b6d..f674029 100644
--- a/Convert-EUProcessingFactorsDB/mcra.py
+++ b/Convert-EUProcessingFactorsDB/mcra.py
@@ -13,6 +13,7 @@ import requests
 import hashlib
 import numpy as np
 import math
+import sys
 # For debugging purposes
 # from objbrowser import browse
 
@@ -24,6 +25,13 @@ class McraAccessor:
     def __init__(self, pandas_obj):
         self._obj = pandas_obj
 
+        # To easily join two columns
+        def here_concat(*args):
+            strs = [str(arg) for arg in args if not pd.isnull(arg)]
+            return '-'.join(strs) if strs else np.nan
+
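+        # self.concat joins values with '-' element-wise, skipping NaNs:
+        # e.g. 'P0110010' and '001' combine to 'P0110010-001' (codes made up).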
+        self.concat = np.vectorize(here_concat)
+
     # To easily copy a bunch of columns
     def copycolumn(self, columnnames):
         for fromcol, tocol in columnnames.items():
@@ -321,18 +329,23 @@ class DataSheet:
 
 
 class DataSet:
-    def __init__(self, description=None, epilog=None):
+    def __init__(self, opening=None, description=None, epilog=None):
         # The arguments object to use.
         self.args = None
+        # Delay parsing help, to peek ahead at verbosity...
         self.parser = ArgumentParser(
             description=description, epilog=epilog)
         # The verbosity argument will accept: -v, or -vv, -vvv etc.
+        # Set default to 1, so that basic output will always appear.
         self.parser.add_argument(
             '-v', '--verbosity', help="Show verbose output",
-            action="count", default=0)
+            action="count", default=1)
         self.parser.add_argument(
             '-x', '--example', action='store_const', const='Example',
             help='Uses input files from the %(const)s subdir.')
+        # Peek at sys.argv before parse_args to see whether verbosity is used.
+        if opening and ('-v' in sys.argv or '--verbosity' in sys.argv):
+            print(opening)
         # The list of sheets
         self.list = []
 
@@ -357,20 +370,23 @@ class DataSet:
             setattr(self, name, DataSheet(default_name=default_name,
                     default_dir=default_dir, direction=direction,
                     checksum=checksum, autoload=autoload,
-                    necessary=necessary,**kwargs))
+                    necessary=necessary, **kwargs))
             # Add to list
             self.list.append(name)
             # Also set-up arguments if necessary
             long_argument = '--'+name+'_file'
+            if type(help) == str and help is not SUPPRESS:
+                help = help + ' (default: {default})'.format(
+                    default=default_name)
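+            # nargs='?' with const (instead of default): the attribute stays
+            # None when the flag is absent, and falls back to the default
+            # file name when the flag is given without a value.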
             if short_argument is None:
                 self.parser.add_argument(
-                    long_argument,
-                    default=getattr(self, name).file.default_name,
+                    long_argument, nargs='?',
+                    const=getattr(self, name).file.default_name,
                     help=help)
             else:
                 self.parser.add_argument(
-                    short_argument, long_argument,
-                    default=getattr(self, name).file.default_name,
+                    short_argument, long_argument, nargs='?',
+                    const=getattr(self, name).file.default_name,
                     help=help)
             if checksum is not None:
                 self.parser.add_argument(
@@ -395,16 +411,22 @@ class DataSet:
         # Go through all files and set filenames
         for datasetname in self.list:
             dataset = getattr(self, datasetname)
+            if getattr(self.args, datasetname+'_file') is None:
+                # Argument was not used
+                datasetfilename = dataset.file.default_name
+            else:
+                datasetfilename = getattr(self.args, datasetname+'_file')
+            #
             if dataset.direction == 'Input':
-                # If example was called
                 if self.args.example:
-                    # Make default dir the Example dir
+                    # If example was called
+                    # make default dir the Example dir
                     dataset.file.suggest(
-                        getattr(self.args, datasetname+'_file'),
+                        datasetfilename,
                         force_dir=self.args.example)
                 else:
                     dataset.file.suggest(
-                        getattr(self.args, datasetname+'_file'))
+                        datasetfilename)
                 # File has proper name, load files
                 if urlparse(dataset.file.suggested).netloc:
                     if not dataset.file.exist:
@@ -433,19 +455,25 @@ class DataSet:
                     dataset.type = 'pandas'
                 #
                 if dataset.autoload:
-                    dataset.load()
-                    if dataset.file.exist:
-                        self.verbose(1, 'Input file : {file}; {props}'.format(
-                            file=dataset.file.path,
-                            props=dataset.properties))
-                        # High verbosity, dump data.
-                        self.verbose(3, dataset.sheet)
+                    if getattr(self.args, datasetname+'_file') is None \
+                            and not dataset.file.necessary:
+                        # Don't load files which are not necessary and not
+                        # explicitly requested on the command line.
+                        self.verbose(3, 'Not loading {file}.'.format(
+                            file=dataset.file.path))
+                    else:
+                        dataset.load()
+                        if dataset.file.exist:
+                            self.verbose(1, 'Input file : {file}; {props}'.format(
+                                file=dataset.file.path,
+                                props=dataset.properties))
+                            # High verbosity, dump data.
+                            self.verbose(3, dataset.sheet)
             else:
                 # How to initialize other files
-                dataset.file.suggest(getattr(self.args, datasetname+'_file'))
+                dataset.file.suggest(datasetfilename)
                 if dataset.file.extension == '.md':
                     dataset.type = 'markdown'
                 else:
                     dataset.type = 'pandas'
                 dataset.update_properties()
-
-- 
GitLab