From ad32c5840dfe422d8ae9154215a22db5f5c5a8c2 Mon Sep 17 00:00:00 2001
From: Hans van den Heuvel <hans1.vandenheuvel@wur.nl>
Date: Thu, 26 Mar 2020 14:52:45 +0100
Subject: [PATCH] Added input option for FoodComposition translation.

---
 .../Convert-EUProcessingFactorsDB.py    | 60 +++++++++++++---
 Convert-EUProcessingFactorsDB/README.md | 10 ++-
 Convert-EUProcessingFactorsDB/mcra.py   | 68 +++++++++++++------
 3 files changed, 107 insertions(+), 31 deletions(-)

diff --git a/Convert-EUProcessingFactorsDB/Convert-EUProcessingFactorsDB.py b/Convert-EUProcessingFactorsDB/Convert-EUProcessingFactorsDB.py
index e327ee6..7a45068 100644
--- a/Convert-EUProcessingFactorsDB/Convert-EUProcessingFactorsDB.py
+++ b/Convert-EUProcessingFactorsDB/Convert-EUProcessingFactorsDB.py
@@ -16,6 +16,7 @@ def print_as_link(text):
 # These are the files we work with
 # Create list
 dataset = mcra.DataSet(
+    opening='(c) 2020 Euromix, Biometris, WUR.',
     description='Converts the EFSA Zendono Excel sheet into an MCRA '
     + 'conforming format, using some external translation files.',
     epilog='For example: use %(prog)s -v -x for a verbose example.')
@@ -28,8 +29,7 @@ efsa_url = 'https://zenodo.org/record/1488653/files/' \
 dataset.add(
     name='efsa',
     short_argument='-e',
-    help='The EFSA Zendono Excel sheet (.xlsx); either file or URL. '
-    + '(default: %(default)s)',
+    help='The EFSA Zenodo Excel sheet (.xlsx); either file or URL. ',
     checksum='f816bf3928431d54f9d15fb134cc9106',
     default_name=efsa_url,
     default_dir='Input',
@@ -39,8 +39,8 @@ dataset.add(
 dataset.add(
     name='processing_type',
     short_argument='-t',
-    help='The (input) processing type file - format: csv (Comma Seperated).'
-    + '(default: %(default)s)',
+    help='The (input) processing type file - '
+    + 'format: csv (Comma Separated).',
     default_name='ProcessingTypes.csv',
     default_dir='Input',
     direction='Input')
@@ -49,7 +49,7 @@ dataset.add(
     name='processing_translation',
     short_argument='-p',
     help='The (input) processing translation file - '
-    + 'format: csv (Comma Seperated). (default: %(default)s)',
+    + 'format: csv (Comma Separated).',
     default_name='ProcTypeTranslations.csv',
     default_dir='Input',
     direction='Input')
@@ -58,7 +58,7 @@ dataset.add(
     name='food_translation',
     short_argument='-f',
     help='The (input) food translation file - '
-    + 'format: csv (Comma Seperated). (default: %(default)s)',
+    + 'format: csv (Comma Separated).',
     default_name='FoodTranslations.csv',
     default_dir='Input',
     direction='Input')
@@ -67,18 +67,29 @@ dataset.add(
     name='substance_translation',
     short_argument='-s',
     help='The (input) substance translation file - '
-    + 'format: csv (Comma Seperated). (default: %(default)s)',
+    + 'format: tsv (Tab Separated), file not required.',
     default_name='SubstanceTranslations.tsv',
     necessary=False,
     default_dir='Input',
     direction='Input')
 #
+dataset.add(
+    name='food_composition',
+    short_argument='-g',
+    help='The (input) food composition file - '
+    + 'format: xlsx (Excel), file not required.',
+    default_name='FoodComposition.xlsx',
+    necessary=False,
+    default_dir='Input',
+    direction='Input',
+    autoload=False)
+#
 # The output files
 dataset.add(
     name='processing_factor',
     short_argument='-o',
     help='The (output) processing factor file - '
-    + 'format: csv (Comma Seperated). (default: %(default)s)',
+    + 'format: csv (Comma Separated).',
     default_name='ProcessingFactors.zip',
     # default_name='ProcessingFactors.csv',
     default_dir='Output')
@@ -110,6 +121,14 @@ dataset.verbose(1, 'Input file : {file}; {version}; {props}'.format(
     props=dataset.efsa.properties,
     version=efsa_version))
 #
+# Use this file only if called explicitly from command line
+# and of course, it has to exist. The -g is enough to trigger the default
+if dataset.args.food_composition_file is not None \
+        and dataset.food_composition.file.exist:
+    dataset.food_composition.load(sheet_name='FoodTranslation')
+    dataset.verbose(1, 'Input file : {file}; {props}'.format(
+        file=dataset.food_composition.file.path,
+        props=dataset.food_composition.properties))
 
 
 #############################################################################
@@ -206,6 +225,31 @@ efsa_combined.loc[
     + '-' + efsa_combined['idProcessingType'].astype(str)
 #
+if dataset.food_composition.sheet is not None:
+    # We also have to do the food_composition translation
+    # First remove all but keep the P-code data
+    # Also use shorter name:
+    fcs = dataset.food_composition.sheet[
+        dataset.food_composition.sheet['idToFood'].str.startswith('P')]
+    # Now split the first column
+    fs = pd.DataFrame()
+    # Bit of a mess, to combine again.
+    fs[['idFromFood-Left', 'idFromFood-Right']] = \
+        fcs['idFromFood'].str.rsplit('-', n=1, expand=True)
+    fcs = fcs.merge(fs, left_index=True, right_index=True)
+    # New column is properly joined now
+    fcs['idToFood-PC'] = fcs.loc[:, ('idToFood', 'idFromFood-Right')].apply(
+        lambda x: '-'.join(x.dropna()), axis=1)
+    # Finally a left join to combine
+    efsa_combined = efsa_combined.merge(
+        # Left join with the food composition sheet,
+        fcs,
+        left_on='idFoodProcessed', right_on='idToFood-PC',
+        how='left').assign()
+    efsa_combined.loc[
+        (efsa_combined['idToFood-PC'].notna()),
+        'idFoodProcessed'] = efsa_combined['idFromFood']
+    print(fcs)
 
 #############################################################################
 # Phase 3. Exporting the data.
 # Seems obvious what to do here.
diff --git a/Convert-EUProcessingFactorsDB/README.md b/Convert-EUProcessingFactorsDB/README.md
index 44c857a..6263274 100644
--- a/Convert-EUProcessingFactorsDB/README.md
+++ b/Convert-EUProcessingFactorsDB/README.md
@@ -39,8 +39,11 @@ These are the input and output files of the script. All names are defaults, and
 * A csv file with a summary (and counts) of *the remaining data* of the EU sheet, called [Mismatches.csv](Mismatches.csv).
 
 The following is happening in the script, essentially ([more details here](#detailed-workings))
-* The script wil try to match the first column (``FromFC``) of [ProcTypeTranslations.csv](ProcTypeTranslations.csv) to the column ``KeyFacets Code`` of the EU sheet. If a match is found, then the second column (``FCToProcType``) of [ProcTypeTranslations.csv](ProcTypeTranslations.csv) will become the ``idProcessingType`` of the output file [ProcessingFactors.csv](ProcessingFactors.csv) (contained within [ProcessingFactors.zip](ProcessingFactors.zip)).
-* Then the script will try to match the ``FromFX`` column of [FoodTranslations.csv](FoodTranslations.csv) with the column ``Matrix FoodEx2 Code`` from the EU sheet, *for all rows that didn't already match in the previous step*. If a match was found, then the value of ``FXToProcType`` will be copied to ``idProcessingType`` of the output file [ProcessingFactors.csv](ProcessingFactors.csv) (contained within [ProcessingFactors.zip](ProcessingFactors.zip)).
+* The script will try to match the first column (``FromFC``) of [ProcTypeTranslations.csv](ProcTypeTranslations.csv) to the column ``KeyFacets Code`` of the EU sheet. If a match is found, then the second column (``FCToProcType``) of [ProcTypeTranslations.csv](ProcTypeTranslations.csv) will become the field ``idProcessingType``.
+* Then the script will try to match both the ``FromFX`` and ``FXToRpc`` columns of [FoodTranslations.csv](FoodTranslations.csv) with the columns ``Matrix FoodEx2 Code`` and ``Matrix Code`` from the EU sheet, *for all rows that didn't already match in the previous step*. If a match was found, then the value of ``FXToProcType`` will be copied to ``idProcessingType``.
+* If no substance file was given, then just copy the field ``ParamCode Active Substance`` to ``idSubstance``. But if a substance file was given, then strip the dash from the ``CASNumber`` column in the substance file, and match the column ``ParamCode Active Substance`` in the EFSA sheet to ``code`` in the substances sheet. If a match was found, then copy the modified (without dash) ``CASNumber`` to ``idSubstance``.
+* If a food composition file was given, then an additional translation is done. This table needs to have the layout of the MCRA FoodComposition table.
+* All of these translated fields end up in the output file [ProcessingFactors.csv](ProcessingFactors.csv) (contained within [ProcessingFactors.zip](ProcessingFactors.zip)).
 * Finally the output file [ProcessingFactors.csv](ProcessingFactors.csv) (contained within [ProcessingFactors.zip](ProcessingFactors.zip)) will be written, together with some reports.
 
 ## Prerequisites
@@ -50,10 +53,11 @@ In order to use the python script, the following libraries are necessary
  * [xlrd](https://pypi.org/project/xlrd/)
  * [tabulate](https://pypi.org/project/tabulate/)
  * [openpyxl](https://pypi.org/project/openpyxl/)
+ * [requests](https://pypi.org/project/requests/)
 
 Install all the libraries at once with
 ```
-pip install pandas xlrd tabulate openpyxl
+pip install pandas xlrd tabulate openpyxl requests
 ```
 
 ## Usage
diff --git a/Convert-EUProcessingFactorsDB/mcra.py b/Convert-EUProcessingFactorsDB/mcra.py
index 2a73b6d..f674029 100644
--- a/Convert-EUProcessingFactorsDB/mcra.py
+++ b/Convert-EUProcessingFactorsDB/mcra.py
@@ -13,6 +13,7 @@ import requests
 import hashlib
 import numpy as np
 import math
+import sys
 
 # For debugging purposes
 # from objbrowser import browse
@@ -24,6 +25,13 @@ class McraAccessor:
     def __init__(self, pandas_obj):
         self._obj = pandas_obj
 
+        # To easily join two columns
+        def here_concat(*args):
+            strs = [str(arg) for arg in args if not pd.isnull(arg)]
+            return '-'.join(strs) if strs else np.nan
+
+        self.concat = np.vectorize(here_concat)
+
     # To easily copy a bunch of columns
     def copycolumn(self, columnnames):
         for fromcol, tocol in columnnames.items():
@@ -321,18 +329,23 @@ class DataSheet:
 
 
 class DataSet:
-    def __init__(self, description=None, epilog=None):
+    def __init__(self, opening=None, description=None, epilog=None):
         # The arguments object to use.
         self.args = None
+        # Delay parsing help, to peek ahead at verbosity...
         self.parser = ArgumentParser(
             description=description, epilog=epilog)
         # The verbosity argument will accept: -v, or -vv, -vvv etc.
+        # Set default to 1, so that basic output will always appear.
         self.parser.add_argument(
             '-v', '--verbosity', help="Show verbose output",
-            action="count", default=0)
+            action="count", default=1)
         self.parser.add_argument(
             '-x', '--example', action='store_const', const='Example',
             help='Uses input files from the %(const)s subdir.')
+        # Look ahead to check whether verbosity is used.
+        if '-v' in sys.argv or '--verbosity' in sys.argv:
+            print(opening)
 
         # The list of sheets
         self.list = []
@@ -357,20 +370,23 @@ class DataSet:
         setattr(self, name, DataSheet(default_name=default_name,
                 default_dir=default_dir, direction=direction,
                 checksum=checksum, autoload=autoload,
-                necessary=necessary,**kwargs))
+                necessary=necessary, **kwargs))
         # Add to list
         self.list.append(name)
         # Also set-up arguments if necessary
         long_argument = '--'+name+'_file'
+        if type(help) == str and help is not SUPPRESS:
+            help = help + ' (default: {default})'.format(
+                default=default_name)
         if short_argument is None:
             self.parser.add_argument(
-                long_argument,
-                default=getattr(self, name).file.default_name,
+                long_argument, nargs='?',
+                const=getattr(self, name).file.default_name,
                 help=help)
         else:
             self.parser.add_argument(
-                short_argument, long_argument,
-                default=getattr(self, name).file.default_name,
+                short_argument, long_argument, nargs='?',
+                const=getattr(self, name).file.default_name,
                 help=help)
         if checksum is not None:
             self.parser.add_argument(
@@ -395,16 +411,22 @@ class DataSet:
         # Go through all files and set filenames
         for datasetname in self.list:
             dataset = getattr(self, datasetname)
+            if getattr(self.args, datasetname+'_file') is None:
+                # Argument was not used
+                datasetfilename = dataset.file.default_name
+            else:
+                datasetfilename = getattr(self.args, datasetname+'_file')
+            #
             if dataset.direction == 'Input':
-                # If example was called
                 if self.args.example:
-                    # Make default dir the Example dir
+                    # If example was called
+                    # make default dir the Example dir
                     dataset.file.suggest(
-                        getattr(self.args, datasetname+'_file'),
+                        datasetfilename,
                         force_dir=self.args.example)
                 else:
                     dataset.file.suggest(
-                        getattr(self.args, datasetname+'_file'))
+                        datasetfilename)
                 # File has proper name, load files
                 if urlparse(dataset.file.suggested).netloc:
                     if not dataset.file.exist:
@@ -433,19 +455,25 @@ class DataSet:
                         dataset.type = 'pandas'
                 #
                 if dataset.autoload:
-                    dataset.load()
-                    if dataset.file.exist:
-                        self.verbose(1, 'Input file : {file}; {props}'.format(
-                            file=dataset.file.path,
-                            props=dataset.properties))
-                        # High verbosity, dump data.
-                        self.verbose(3, dataset.sheet)
+                    if getattr(self.args, datasetname+'_file') is None \
+                            and not dataset.file.necessary:
+                        # Don't load files which are not necessary and not
+                        # explicitly called from command line.
+                        self.verbose(3, 'Not loading {file}.'.format(
+                            file=dataset.file.path))
+                    else:
+                        dataset.load()
+                        if dataset.file.exist:
+                            self.verbose(1, 'Input file : {file}; {props}'.format(
+                                file=dataset.file.path,
+                                props=dataset.properties))
+                            # High verbosity, dump data.
+                            self.verbose(3, dataset.sheet)
             else:
                 # How to initialize other files
-                dataset.file.suggest(getattr(self.args, datasetname+'_file'))
+                dataset.file.suggest(datasetfilename)
                 if dataset.file.extension == '.md':
                     dataset.type = 'markdown'
                 else:
                     dataset.type = 'pandas'
                 dataset.update_properties()
-
--
GitLab
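
For illustration, a minimal, self-contained sketch of the food composition translation that the new block in Convert-EUProcessingFactorsDB.py performs. The column names (idFromFood, idToFood, idToFood-PC, idFoodProcessed) follow the patch; the example codes below are made up and not taken from any real input file.

# Standalone sketch (not part of the patch) of the food composition
# translation step. Column names follow the patch; codes are invented.
import pandas as pd

# Toy stand-in for the 'FoodTranslation' sheet of FoodComposition.xlsx.
food_composition = pd.DataFrame({
    'idFromFood': ['RF-00001-PPP-T01', 'RF-00002-PPP'],
    'idToFood': ['P0110010A', 'X9999999A'],
})
# Toy stand-in for the combined EFSA table.
efsa_combined = pd.DataFrame({
    'idFoodProcessed': ['P0110010A-T01', 'P0250090A-T02'],
})

# Keep only the rows whose idToFood is a P-code.
fcs = food_composition[food_composition['idToFood'].str.startswith('P')].copy()
# Split idFromFood on the last dash and glue the right-hand part onto
# idToFood; the resulting 'idToFood-PC' column is the join key.
split = fcs['idFromFood'].str.rsplit('-', n=1, expand=True)
fcs['idToFood-PC'] = fcs['idToFood'] + '-' + split[1]

# Left join on idFoodProcessed; where a match exists, replace
# idFoodProcessed by the original idFromFood code.
efsa_combined = efsa_combined.merge(
    fcs[['idToFood-PC', 'idFromFood']],
    left_on='idFoodProcessed', right_on='idToFood-PC', how='left')
efsa_combined.loc[
    efsa_combined['idToFood-PC'].notna(),
    'idFoodProcessed'] = efsa_combined['idFromFood']

print(efsa_combined[['idFoodProcessed']])
# Expected: the first row becomes RF-00001-PPP-T01, the second is unchanged.

With the optional file in place, a run like 'python Convert-EUProcessingFactorsDB.py -v -g' should pick up the default Input/FoodComposition.xlsx and apply this extra translation, since -g given without a value falls back to the configured default name.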