Commit ad32c584 authored by Hans van den Heuvel

Added input option for FoodComposition translation.

parent 4fe8b8bd
......@@ -16,6 +16,7 @@ def print_as_link(text):
# These are the files we work with
# Create list
dataset = mcra.DataSet(
opening='(c) 2020 Euromix, Biometris, WUR.',
description='Converts the EFSA Zenodo Excel sheet into an MCRA '
+ 'conforming format, using some external translation files.',
epilog='For example: use %(prog)s -v -x for a verbose example.')
......@@ -28,8 +29,7 @@ efsa_url = 'https://zenodo.org/record/1488653/files/' \
dataset.add(
name='efsa',
short_argument='-e',
help='The EFSA Zenodo Excel sheet (.xlsx); either file or URL. '
+ '(default: %(default)s)',
help='The EFSA Zenodo Excel sheet (.xlsx); either file or URL. ',
checksum='f816bf3928431d54f9d15fb134cc9106',
default_name=efsa_url,
default_dir='Input',
......@@ -39,8 +39,8 @@ dataset.add(
dataset.add(
name='processing_type',
short_argument='-t',
help='The (input) processing type file - format: csv (Comma Separated).'
+ '(default: %(default)s)',
help='The (input) processing type file - '
+ 'format: csv (Comma Separated).',
default_name='ProcessingTypes.csv',
default_dir='Input',
direction='Input')
......@@ -49,7 +49,7 @@ dataset.add(
name='processing_translation',
short_argument='-p',
help='The (input) processing translation file - '
+ 'format: csv (Comma Separated). (default: %(default)s)',
+ 'format: csv (Comma Separated).',
default_name='ProcTypeTranslations.csv',
default_dir='Input',
direction='Input')
......@@ -58,7 +58,7 @@ dataset.add(
name='food_translation',
short_argument='-f',
help='The (input) food translation file - '
+ 'format: csv (Comma Separated). (default: %(default)s)',
+ 'format: csv (Comma Separated).',
default_name='FoodTranslations.csv',
default_dir='Input',
direction='Input')
......@@ -67,18 +67,29 @@ dataset.add(
name='substance_translation',
short_argument='-s',
help='The (input) substance translation file - '
+ 'format: csv (Comma Separated). (default: %(default)s)',
+ 'format: tsv (Tab Separated), file not required.',
default_name='SubstanceTranslations.tsv',
necessary=False,
default_dir='Input',
direction='Input')
#
dataset.add(
name='food_composition',
short_argument='-g',
help='The (input) food composition file - '
+ 'format: xlsx (Excel), file not required.',
default_name='FoodComposition.xlsx',
necessary=False,
default_dir='Input',
direction='Input',
autoload=False)
#
# The output files
dataset.add(
name='processing_factor',
short_argument='-o',
help='The (output) processing factor file - '
+ 'format: csv (Comma Separated). (default: %(default)s)',
+ 'format: csv (Comma Separated).',
default_name='ProcessingFactors.zip',
# default_name='ProcessingFactors.csv',
default_dir='Output')
......@@ -110,6 +121,14 @@ dataset.verbose(1, 'Input file : {file}; {version}; {props}'.format(
props=dataset.efsa.properties,
version=efsa_version))
#
# Use this file only if it is explicitly called from the command line
# and, of course, only if it exists. Passing -g alone is enough to select the default.
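# Illustrative invocations (the custom .xlsx name below is hypothetical):
# running the script with just '-g' picks up the default Input/FoodComposition.xlsx,
# '-g MyFoodComposition.xlsx' uses that file instead, and omitting '-g'
# skips the food composition translation entirely.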
if dataset.args.food_composition_file is not None \
and dataset.food_composition.file.exist:
dataset.food_composition.load(sheet_name='FoodTranslation')
dataset.verbose(1, 'Input file : {file}; {props}'.format(
file=dataset.food_composition.file.path,
props=dataset.food_composition.properties))
#############################################################################
......@@ -206,6 +225,31 @@ efsa_combined.loc[
+ '-' + efsa_combined['idProcessingType'].astype(str)
#
if dataset.food_composition.sheet is not None:
# We also have to do the food_composition translation.
# First drop everything except the P-code rows (idToFood starting with 'P').
# Also use shorter name:
fcs = dataset.food_composition.sheet[
dataset.food_composition.sheet['idToFood'].str.startswith('P')]
# Now split the idFromFood column on its last dash
fs = pd.DataFrame()
# A bit of a detour: split into a helper frame, then merge it back in.
fs[['idFromFood-Left', 'idFromFood-Right']] = \
fcs['idFromFood'].str.rsplit('-', n=1, expand=True)
fcs = fcs.merge(fs, left_index=True, right_index=True)
# The new columns are properly joined in now
fcs['idToFood-PC'] = fcs.loc[:, ['idToFood', 'idFromFood-Right']].apply(
lambda x: '-'.join(x.dropna()), axis=1)
# Finally a left join to combine
efsa_combined = efsa_combined.merge(
# Left join with the food composition sheet,
fcs,
left_on='idFoodProcessed', right_on='idToFood-PC',
how='left')
efsa_combined.loc[
(efsa_combined['idToFood-PC'].notna()),
'idFoodProcessed'] = efsa_combined['idFromFood']
# High verbosity: dump the intermediate food composition translation table.
dataset.verbose(3, fcs)
#############################################################################
# Phase 3. Exporting the data.
# Seems obvious what to do here.
......
......@@ -39,8 +39,11 @@ These are the input and output files of the script. All names are defaults, and
* A csv file with a summary (and counts) of *the remaining data* of the EU sheet, called [Mismatches.csv](Mismatches.csv).
Essentially, the following happens in the script ([more details here](#detailed-workings)):
* The script will try to match the first column (``FromFC``) of [ProcTypeTranslations.csv](ProcTypeTranslations.csv) to the column ``KeyFacets Code`` of the EU sheet. If a match is found, then the second column (``FCToProcType``) of [ProcTypeTranslations.csv](ProcTypeTranslations.csv) will become the ``idProcessingType`` of the output file [ProcessingFactors.csv](ProcessingFactors.csv) (contained within [ProcessingFactors.zip](ProcessingFactors.zip)).
* Then the script will try to match the ``FromFX`` column of [FoodTranslations.csv](FoodTranslations.csv) with the column ``Matrix FoodEx2 Code`` from the EU sheet, *for all rows that didn't already match in the previous step*. If a match was found, then the value of ``FXToProcType`` will be copied to ``idProcessingType`` of the output file [ProcessingFactors.csv](ProcessingFactors.csv) (contained within [ProcessingFactors.zip](ProcessingFactors.zip)).
* The script will try to match the first column (``FromFC``) of [ProcTypeTranslations.csv](ProcTypeTranslations.csv) to the column ``KeyFacets Code`` of the EU sheet. If a match is found, then the second column (``FCToProcType``) of [ProcTypeTranslations.csv](ProcTypeTranslations.csv) will become the field ``idProcessingType``.
* Then the script will try to match both the ``FromFX`` and ``FXToRpc`` columns of [FoodTranslations.csv](FoodTranslations.csv) with the columns ``Matrix FoodEx2 Code`` and ``Matrix Code`` from the EU sheet, *for all rows that didn't already match in the previous step*. If a match was found, then the value of ``FXToProcType`` will be copied to ``idProcessingType`` (a pandas sketch of these two matching steps is shown below this list).
* If no substance file was given, then just copy the field ``ParamCode Active Substance`` to ``idSubstance``. But if a substance file was given, then strip the dash from the ``CASNumber`` column in the substance file, and match the column ``ParamCode Active Substance`` in the EFSA sheet to ``code`` in the substances sheet. If a match was found, then copy the modified (without dash) ``CASNumber`` to ``idSubstance``.
* If a food composition file was given, an additional translation is done; this table needs to have the layout of the MCRA FoodComposition table. The translated food codes end up in the ``idFoodProcessed`` field of the output file [ProcessingFactors.csv](ProcessingFactors.csv) (contained within [ProcessingFactors.zip](ProcessingFactors.zip)).
* Finally the output file [ProcessingFactors.csv](ProcessingFactors.csv) (contained within [ProcessingFactors.zip](ProcessingFactors.zip)) will be written, together with some reports.
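For readers who prefer to see the two processing-type matching steps as pandas operations, here is a minimal sketch. The frame and variable names are illustrative, not the ones used in the script, and it assumes the EU sheet has already been read into ``efsa``; duplicate matches are ignored.

```
import pandas as pd

# efsa is assumed to be the EU sheet already loaded into a DataFrame,
# with the columns 'KeyFacets Code', 'Matrix FoodEx2 Code' and 'Matrix Code'.
proc_type = pd.read_csv('ProcTypeTranslations.csv')   # FromFC, FCToProcType
food_trans = pd.read_csv('FoodTranslations.csv')      # FromFX, FXToRpc, FXToProcType

# Step 1: match KeyFacets Code against FromFC; FCToProcType becomes idProcessingType.
step1 = efsa.merge(proc_type, left_on='KeyFacets Code',
                   right_on='FromFC', how='left')
step1['idProcessingType'] = step1['FCToProcType']

# Step 2: only for rows that did not match, try (Matrix FoodEx2 Code, Matrix Code)
# against (FromFX, FXToRpc) and take FXToProcType (assuming at most one match per row).
unmatched = step1['idProcessingType'].isna()
step2 = step1.loc[unmatched].merge(
    food_trans,
    left_on=['Matrix FoodEx2 Code', 'Matrix Code'],
    right_on=['FromFX', 'FXToRpc'],
    how='left')
step1.loc[unmatched, 'idProcessingType'] = step2['FXToProcType'].values
```

The sketch only covers the matching itself; the actual script also keeps track of the rows that never match, which are summarised in [Mismatches.csv](Mismatches.csv).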
## Prerequisites
......@@ -50,10 +53,11 @@ In order to use the python script, the following libraries are necessary
* [xlrd](https://pypi.org/project/xlrd/)
* [tabulate](https://pypi.org/project/tabulate/)
* [openpyxl](https://pypi.org/project/openpyxl/)
* [requests](https://pypi.org/project/requests/)
Install all the libraries at once with
```
pip install pandas xlrd tabulate openpyxl
pip install pandas xlrd tabulate openpyxl requests
```
## Usage
......
......@@ -13,6 +13,7 @@ import requests
import hashlib
import numpy as np
import math
import sys
# For debugging purposes
# from objbrowser import browse
......@@ -24,6 +25,13 @@ class McraAccessor:
def __init__(self, pandas_obj):
self._obj = pandas_obj
# To easily join two columns
def here_concat(*args):
strs = [str(arg) for arg in args if not pd.isnull(arg)]
return '-'.join(strs) if strs else np.nan
self.concat = np.vectorize(here_concat)
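# Being vectorized, concat also works element-wise on whole columns;
# e.g. concat('P0110000', 1) yields 'P0110000-1' (illustrative values).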
# To easily copy a bunch of columns
def copycolumn(self, columnnames):
for fromcol, tocol in columnnames.items():
......@@ -321,18 +329,23 @@ class DataSheet:
class DataSet:
def __init__(self, description=None, epilog=None):
def __init__(self, opening=None, description=None, epilog=None):
# The arguments object to use.
self.args = None
# Delay parsing help, to peek ahead at verbosity...
self.parser = ArgumentParser(
description=description, epilog=epilog)
# The verbosity argument will accept: -v, or -vv, -vvv etc.
# Set default to 1, so that basic output will always appear.
self.parser.add_argument(
'-v', '--verbosity', help="Show verbose output",
action="count", default=0)
action="count", default=1)
self.parser.add_argument(
'-x', '--example', action='store_const', const='Example',
help='Uses input files from the %(const)s subdir.')
# Look ahead to check whether verbosity is used.
if '-v' in sys.argv or '--verbosity' in sys.argv:
print(opening)
# The list of sheets
self.list = []
......@@ -357,20 +370,23 @@ class DataSet:
setattr(self, name, DataSheet(default_name=default_name,
default_dir=default_dir, direction=direction,
checksum=checksum, autoload=autoload,
necessary=necessary,**kwargs))
necessary=necessary, **kwargs))
# Add to list
self.list.append(name)
# Also set-up arguments if necessary
long_argument = '--'+name+'_file'
if isinstance(help, str) and help is not SUPPRESS:
help = help + ' (default: {default})'.format(
default=default_name)
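# With nargs='?' plus const (instead of a plain default=), argparse can tell
# three cases apart: flag absent -> value is None, flag given without a value ->
# value is the const (the default file name), flag given with a value -> that value.
# The None case is what later allows optional input files to stay unloaded.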
if short_argument is None:
self.parser.add_argument(
long_argument,
default=getattr(self, name).file.default_name,
long_argument, nargs='?',
const=getattr(self, name).file.default_name,
help=help)
else:
self.parser.add_argument(
short_argument, long_argument,
default=getattr(self, name).file.default_name,
short_argument, long_argument, nargs='?',
const=getattr(self, name).file.default_name,
help=help)
if checksum is not None:
self.parser.add_argument(
......@@ -395,16 +411,22 @@ class DataSet:
# Go through all files and set filenames
for datasetname in self.list:
dataset = getattr(self, datasetname)
if getattr(self.args, datasetname+'_file') is None:
# Argument was not used
datasetfilename = dataset.file.default_name
else:
datasetfilename = getattr(self.args, datasetname+'_file')
#
if dataset.direction == 'Input':
# If example was called
if self.args.example:
# Make default dir the Example dir
# If example was called
# make default dir the Example dir
dataset.file.suggest(
getattr(self.args, datasetname+'_file'),
datasetfilename,
force_dir=self.args.example)
else:
dataset.file.suggest(
getattr(self.args, datasetname+'_file'))
datasetfilename)
# File has proper name, load files
if urlparse(dataset.file.suggested).netloc:
if not dataset.file.exist:
......@@ -433,19 +455,25 @@ class DataSet:
dataset.type = 'pandas'
#
if dataset.autoload:
dataset.load()
if dataset.file.exist:
self.verbose(1, 'Input file : {file}; {props}'.format(
file=dataset.file.path,
props=dataset.properties))
# High verbosity, dump data.
self.verbose(3, dataset.sheet)
if getattr(self.args, datasetname+'_file') is None \
and not dataset.file.necessary:
# Don't load files which are not necessary and not
# explicitly called from the command line.
self.verbose(3, 'Not loading {file}.'.format(file=
dataset.file.path))
else:
dataset.load()
if dataset.file.exist:
self.verbose(1, 'Input file : {file}; {props}'.format(
file=dataset.file.path,
props=dataset.properties))
# High verbosity, dump data.
self.verbose(3, dataset.sheet)
else:
# How to initialize other files
dataset.file.suggest(getattr(self.args, datasetname+'_file'))
dataset.file.suggest(datasetfilename)
if dataset.file.extension == '.md':
dataset.type = 'markdown'
else:
dataset.type = 'pandas'
dataset.update_properties()