Commit e8219d11 authored by Hans van den Heuvel's avatar Hans van den Heuvel
Browse files

Merged two scripts into one. Issue 1087.

parent 8d082b20
#!/usr/bin/python
__version_info__ = ('1', '0', '4')
__version_info__ = ('1', '1', '0')
__version__ = '.'.join(__version_info__)
#############################################################################
......@@ -8,6 +8,7 @@ __version__ = '.'.join(__version_info__)
# Doing stuff like parsing arguments, and reading the files.
#
from dataconversion import DataSet, PY_INDENT, thisyear
from datetime import datetime
import pandas as pd
import textwrap
......@@ -53,6 +54,46 @@ dataset.add(
inzip=True, # Copy this file into the zip
direction='Input')
#
# Optional input files (all necessary=False, default_name=None): when not
# supplied on the command line, the script falls back to plain column copies.
dataset.add(
    name='processing_type',
    short_argument='-t',
    help='The (input) processing type file - '
    + 'format: csv (Comma Seperated).',
    default_name=None,
    necessary=False,
    default_dir='Input',
    direction='Input')
#
dataset.add(
    name='processing_translation',
    short_argument='-q',
    help='The (input) processing translation file - '
    + 'format: csv (Comma Seperated).',
    necessary=False,
    default_name=None,
    default_dir='Input',
    direction='Input')
#
dataset.add(
    name='food_translation',
    short_argument='-f',
    help='The (input) food translation file - '
    + 'format: csv (Comma Seperated).',
    default_name=None,
    necessary=False,
    default_dir='Input',
    direction='Input')
#
dataset.add(
    name='food_composition',
    short_argument='-g',
    help='The (input) food composition file - '
    + 'format: xlsx (Excel), file not required.',
    default_name=None,
    necessary=False,
    default_dir='Input',
    direction='Input')
#
# The output files
# The (main) processing factors table
......@@ -63,11 +104,18 @@ dataset.add(
+ 'format: csv (Comma Seperated).',
default_name='ProcessingFactors.csv',
default_dir='Output')
#
dataset.add(
name='mismatches',
short_argument='-m',
default_name=None,
necessary=False,
inzip=True,
default_dir='Output')
# References
dataset.add(
name='references',
short_argument='-f',
short_argument='-b',
help='The (output) references file - '
+ 'format: csv (Comma Seperated).',
default_name='References.csv',
......@@ -87,10 +135,7 @@ efsa_version = pd.read_excel(
nrows=1, header=None).iloc[0, 0]
dataset.efsa.load(sheet_name=efsa_sheet, header=4)
dataset.verbose(1, 'Input file : {file}; {version}; {props}'.format(
file=dataset.efsa.file.path,
props=dataset.efsa.properties,
version=efsa_version))
dataset.verbose(1, f'Input file : {dataset.efsa.file.path}; {efsa_version}; {dataset.efsa.properties}')
#
# Also reading the ProcStudies Evaluation; using panda directly
# Ok here, because it comes from same file, although not preferred
......@@ -108,7 +153,6 @@ efsa = dataset.efsa.sheet
efsa.mcra.copycolumn({
'Matrix Code': 'idFoodUnProcessed',
'Raw Primary Commodity': 'FoodUnprocessedName',
'KeyFacets Code': 'idProcessingType',
'KeyFacets Interpreted': 'ProcessingName',
'Matrix FoodEx2 Code': 'idFoodProcessed',
'Matrix Code Interpreted': 'FoodProcessedName',
......@@ -117,12 +161,43 @@ efsa.mcra.copycolumn({
'Median PF': 'Nominal'
})
# See whether we can do something with Substance translation
if not dataset.exists('substance_translation'):
# No substance translation? Just copy column
efsa.mcra.copycolumn(
{'ParamCode Active Substance': 'idSubstance'})
#
# Derive idProcessingType: either via the two translation files (left joins),
# or, when they are absent, by copying the raw KeyFacets Code.
if dataset.exists('processing_translation') and dataset.exists('food_translation'):
    dataset.verbose(3, f'Using sheet: Processing Translation.')
    dataset.verbose(3, f'Using sheet: Food Translation.')
    efsa = dataset.efsa.sheet.merge(
        # Left join on all the rows from the EFSA sheet
        # that have a Keyfacets Code in dataset.processing_translation.sheet
        dataset.processing_translation.sheet, left_on='KeyFacets Code',
        right_on='FromFC', how='left').merge(
            # Left join with both FoodEx2 and Matrix code
            # on the food_translation file
            dataset.food_translation.sheet,
            left_on=['Matrix FoodEx2 Code', 'Matrix Code'],
            right_on=['FromFX', 'FXToRpc'], how='left').assign(
        )
    # idProcessingType
    #
    # If 'FCToProcType' contains a value, then make a new field
    # 'idProcessingType', with the value from 'FCToProcType'
    efsa.loc[
        (efsa['FCToProcType'].notna()),
        'idProcessingType'] = efsa['FCToProcType']
    # If 'FCToProcType' does not contain a value and 'FXToProcType' does
    # then make a new field, 'idProcessingType'
    # with the value from 'FXToProcType'
    # (rows matching neither file keep idProcessingType as NaN; they are
    # reported as mismatches later on)
    efsa.loc[
        (efsa['FCToProcType'].isna()
         & efsa['FXToProcType'].notna()),
        'idProcessingType'] = efsa['FXToProcType']
    #
else:
    # Just copy the column
    # NOTE(review): .mcra is presumably a custom DataFrame accessor
    # registered by the dataconversion module -- confirm there.
    efsa.mcra.copycolumn({'KeyFacets Code': 'idProcessingType'})
# See whether we can do something with Substance translation
if dataset.exists('substance_translation'):
dataset.verbose(3, f'Using sheet: Substance Translation.')
if 'CASNumber' in dataset.substance_translation.sheet.columns:
# This is the "old" situation
# Strip dash (-) from the CASNumber column
......@@ -146,7 +221,56 @@ else:
how='left').assign()
# Copy ToCode to idSubstance column
efsa.mcra.copycolumn({'ToCode': 'idSubstance'})
else:
# No substance translation? Just copy column
efsa.mcra.copycolumn(
{'ParamCode Active Substance': 'idSubstance'})
#
# Use the description of Processing Type
if dataset.exists('processing_type'):
dataset.verbose(3, f'Using sheet: Processing Type.')
efsa = efsa.merge(
# Left join with processing type sheet,
dataset.processing_type.sheet,
left_on='idProcessingType', right_on='idProcessingType',
how='left').assign()
# Copy column
efsa.mcra.copycolumn(
{'Description': 'MCRA_ProcessingType_Description'})
# idFoodProcessed
# Just concat idFoodUnProcessed with idProcessingType with a dash
efsa.loc[
(efsa['idProcessingType'].notna()),
'idFoodProcessed'] = efsa['idFoodUnProcessed'].astype(str) \
+ '-' + efsa['idProcessingType'].astype(str)
else:
efsa.mcra.addcolumn({'Description'})
efsa.mcra.copycolumn({'Matrix FoodEx2 Code': 'idFoodProcessed'})
if dataset.exists('food_composition'):
dataset.verbose(3, f'Using sheet: Food Composition.')
# We also have to do the food_composition translation
# First remove all but keep the P-code data
# Also use shorter name:
fcs = dataset.food_composition.sheet[(
dataset.food_composition.sheet['idToFood'].str.startswith('P') &
dataset.food_composition.sheet['idFromFood'].str.contains('-'))]
fcs = fcs.mcra.splitjoin(
name='idToFood-PC', split='idFromFood', join='idToFood')
# Then a left join to combine
efsa = efsa.merge(
# Left join with processing type sheet,
fcs,
left_on='idFoodProcessed', right_on='idToFood-PC',
how='left').assign()
efsa.loc[
(efsa['idToFood-PC'].notna() &
efsa['idFoodProcessed'].str.contains('-')),
'idFoodProcessed'] = efsa['idFromFood']
# Then let's add columns which will be empty
# so to be able to create a proper output file
efsa.mcra.addcolumn({'Upper',
......@@ -177,16 +301,171 @@ header = [
'Nominal', 'Upper', 'NominalUncertaintyUpper',
'UpperUncertaintyUpper',
'Study Reference']
dataset.processing_factor.sheet = efsa[
(efsa["idProcessingType"] != "-") &
efsa['idSubstance'].notna()
][header]
if dataset.exists('processing_translation') and dataset.exists('food_translation'):
header = ['idProcessingType', 'idSubstance', 'SubstanceName',
'idFoodProcessed', 'idFoodUnProcessed', 'FoodUnprocessedName',
'Nominal', 'Upper', 'NominalUncertaintyUpper',
'UpperUncertaintyUpper', 'KeyFacets Interpreted',
'Matrix Code Interpreted', 'MCRA_ProcessingType_Description',
'Study Reference']
dataset.processing_factor.sheet = efsa[
(efsa['FCToProcType'].notna() |
efsa['FXToProcType'].notna()) &
efsa['idSubstance'].notna()][header]
else:
header = [
'idFoodProcessed', 'Matrix Code Interpreted',
'idFoodUnProcessed', 'Matrix Code', 'RPC Code', 'FoodUnprocessedName',
'idProcessingType', 'KeyFacets Code', 'KeyFacets Interpreted',
'idSubstance', 'ParamCode Active Substance', 'SubstanceName',
'Nominal', 'Upper', 'NominalUncertaintyUpper',
'UpperUncertaintyUpper',
'Study Reference']
dataset.processing_factor.sheet = efsa[
(efsa["idProcessingType"] != "-") &
efsa['idSubstance'].notna()
][header]
#############################################################################
# Phase 3. Report about the data.
# This is the way to go if you want to know if a sheet exists...
# if dataset.exists('substance_translation'):
# print('Yahoo')
# Here's an auto generated report
if dataset.exists('mismatches', sheet_too=False):
dataset.verbose(3, f'Build sheet: Mismatches.')
# Let's create a sheet with an overview over the codes.
# This is the sheet, only the ones lacking 'idProcessingType' (not exported)
mismatch_table = efsa.loc[(efsa['idProcessingType'].isna())]
report_sheet = mismatch_table.groupby(
['KeyFacets Code', 'Matrix FoodEx2 Code']).size().reset_index(
name='Size').merge(
mismatch_table[['KeyFacets Code', 'Matrix FoodEx2 Code']].groupby(
'KeyFacets Code').size().reset_index(name='Size'),
left_on='KeyFacets Code', right_on='KeyFacets Code', how='left'
).assign()
report_sheet.rename(
columns={'Size_y': 'Number of KeyFacets Codes',
'Size_x': 'Number of Matrix FoodEx2 Codes'}, inplace=True)
report_sheet.sort_values(
by=['Number of KeyFacets Codes', 'KeyFacets Code',
'Number of Matrix FoodEx2 Codes', 'Matrix FoodEx2 Code'],
ascending=False, inplace=True)
report_sheet = report_sheet.merge(
mismatch_table[
['KeyFacets Code', 'KeyFacets Interpreted']].drop_duplicates(),
left_on='KeyFacets Code', right_on='KeyFacets Code', how='left'
).assign()
report_sheet = report_sheet.merge(
mismatch_table[
['Matrix FoodEx2 Code', 'Matrix Code Interpreted',
'Matrix Code']].drop_duplicates(),
left_on='Matrix FoodEx2 Code', right_on='Matrix FoodEx2 Code', how='left'
).assign()
# print(report_sheet)
min_nr_of_mismatches = int(5)
mismatch_table_string = report_sheet[
(report_sheet['Number of KeyFacets Codes'] >= min_nr_of_mismatches) &
(report_sheet['Number of Matrix FoodEx2 Codes'] >= min_nr_of_mismatches)
].to_markdown(index=False)
# Dump the mismatch file
# We want a specific order in the columns:
header = ['Matrix FoodEx2 Code', 'Matrix Code Interpreted', 'Matrix Code',
'KeyFacets Code', 'KeyFacets Interpreted',
'Number of Matrix FoodEx2 Codes', 'Number of KeyFacets Codes']
dataset.mismatches.sheet = report_sheet[header]
# We also need some further text reporting:
# Let's make a new column of the combination
# of 'idSubstance' and 'idFoodUnProcessed'
mismatch_table = efsa[
(efsa['FCToProcType'].notna() |
efsa['FXToProcType'].notna()) &
efsa['idSubstance'].notna()]
mismatch_table = mismatch_table.mcra.join(
name='idSubstanceFoodProc',
join_left='idSubstance',
join_right='idFoodUnProcessed')
double_types = mismatch_table.groupby(
['idProcessingType', 'idSubstanceFoodProc'],
as_index=False).agg(
{'idSubstance': 'first',
'idFoodUnProcessed': 'first',
'FoodUnprocessedName': 'first',
'KeyFacets Interpreted': 'first',
'Matrix Code Interpreted': 'first',
'MCRA_ProcessingType_Description': 'first'}).drop(
'idSubstanceFoodProc', axis=1)
#
#
dataset.mismatches.report = r'''
CONVERSION REPORT FOR EFSA system FoodEx 2 EXCEL SHEET
------------------------------------------------------
Conversion run details
======================
* Date: '''+datetime.now().strftime('%H:%M:%S, %d %b %Y')+r'''
* Files:
'''
for data in dataset:
if data.direction == 'Input':
dataset.mismatches.report = textwrap.indent(data.report, PY_INDENT)
for datasetname in dataset.list:
# Bit of a hack, figure out later how this can be properly done.
if getattr(dataset, datasetname).direction == 'Output' \
and datasetname != 'report':
dataset.mismatches.report = textwrap.indent(
getattr(dataset, datasetname).report, PY_INDENT)
dataset.mismatches.report = r'''
EFSA Excel input details
========================
* Excel input: ['''+dataset.efsa.file.path+r''']('''+dataset.efsa.file.path+r''')
* '''+efsa_version+r'''
* '''+dataset.efsa.properties+r'''
* Modified: '''+dataset.efsa.file.modified+r'''
* Processing type translation
* Number of rows matched '''+str(
len(efsa.loc[efsa['FCToProcType'].notna()].index))+r'''
* Food Translation
* Number of rows matched '''+str(
len(efsa.loc[efsa['FXToProcType'].notna()].index))+r'''
* All translations
* Number of rows matched '''+str(
len(efsa.loc[
efsa['idProcessingType'].notna()
].index))+r'''
EFSA Excel Merge failures
=========================
* Number input rows '''+str(len(efsa.index))+r'''
* Translations matched '''+str(
len(efsa.loc[
efsa['idProcessingType'].notna()
].index))+r'''
* Translations not matched '''+str(len(mismatch_table.index))+r'''
* Unique KeyFacets codes '''+str(
mismatch_table['KeyFacets Code'].nunique())+r'''
* Unique FoodEx2 codes '''+str(
mismatch_table['Matrix FoodEx2 Code'].nunique())+r'''
Below a list with the most (more than '''+str(
min_nr_of_mismatches)+r''') mismatches.
'''+mismatch_table_string+r'''
Substance conversion duplicates
===============================
'''+double_types.to_markdown(index=False)+r'''
'''
dataset.close()
#!/usr/bin/python
__version_info__ = ('1', '0', '0')
__version__ = '.'.join(__version_info__)
#############################################################################
# Phase 0. Initialization
# Doing stuff like parsing arguments, and reading the files.
#
from dataconversion import DataSet, PY_INDENT, thisyear
import pandas as pd
from datetime import datetime
import textwrap
import os
# Small utility to create hyperlink to hyperlink :-)
def print_as_link(text):
    """Return *text* rendered as a Markdown link pointing to itself.

    E.g. ``'a/b.csv'`` becomes ``'[a/b.csv](a/b.csv)'``.
    """
    # f-string instead of str.format, consistent with the rest of the project.
    return f'[{text}]({text})'
# These are the files we work with
# Create list
dataset = DataSet(
    opening='(c) ' + thisyear
    + ' Biometris, Wageningen University and Research.',
    description='Converts the EFSA Zendono Excel sheet into an MCRA '
    + 'conforming format, using some external translation files.',
    epilog='For example: use %(prog)s -v -x for a verbose example.',
    version=__version__)
#
#
# Download URL of the EFSA processing-factors workbook (Zenodo record);
# used as the default "file name" for the efsa input below.
efsa_url = 'https://zenodo.org/record/1488653/files/' \
    + 'EU_Processing_Factors_db_P.xlsx.xlsx?download=1'
#
# The input files
dataset.add(
    name='efsa',
    short_argument='-e',
    help='The EFSA Zendono Excel sheet (.xlsx); either file or URL. ',
    # NOTE(review): presumably an md5 of the expected download --
    # confirm against the DataSet implementation.
    checksum='f816bf3928431d54f9d15fb134cc9106',
    default_name=efsa_url,
    default_dir='Input',
    direction='Input',
    # Loaded manually after init(), because the data sits in a
    # non-trivial place inside the workbook.
    autoload=False)
#
dataset.add(
    name='processing_type',
    short_argument='-t',
    help='The (input) processing type file - '
    + 'format: csv (Comma Seperated).',
    default_name='ProcessingTypes.csv',
    default_dir='Input',
    direction='Input')
#
dataset.add(
    name='processing_translation',
    short_argument='-p',
    help='The (input) processing translation file - '
    + 'format: csv (Comma Seperated).',
    default_name='ProcTypeTranslations.csv',
    default_dir='Input',
    direction='Input')
#
dataset.add(
    name='food_translation',
    short_argument='-f',
    help='The (input) food translation file - '
    + 'format: csv (Comma Seperated).',
    default_name='FoodTranslations.csv',
    default_dir='Input',
    direction='Input')
#
dataset.add(
    name='substance_translation',
    short_argument='-s',
    help='The (input) substance translation file - '
    + 'format: tsv (Tab Seperated), file not required.',
    default_name='SubstanceTranslations.csv',
    # Optional: the script falls back to copying the EFSA substance codes.
    necessary=False,
    default_dir='Input',
    direction='Input')
#
dataset.add(
    name='food_composition',
    short_argument='-g',
    help='The (input) food composition file - '
    + 'format: xlsx (Excel), file not required.',
    default_name='FoodCompositions.xlsx',
    necessary=False,
    default_dir='Input',
    direction='Input')
#
# The output files
dataset.add(
    name='processing_factor',
    short_argument='-o',
    help='The (output) processing factor file - '
    + 'format: csv (Comma Seperated).',
    default_name='ProcessingFactors.csv',
    default_dir='Output')
#
dataset.add(
    name='mismatches',
    default_name='Mismatches.xlsx',
    default_dir='Output')
#
dataset.add(
    name='references',
    default_name='References.csv',
    default_dir='Output')
#
#############################################################################
# Parse the command line and load the autoload input files
# (see the Phase 0 header above).
dataset.init()
# Manually load the EFSA sheet, because the data is in a non-trivial place
efsa_sheet = 2
# The version string sits in the very first cell of the sheet, above the
# real header row, so fetch it with a separate single-row read.
efsa_version = pd.read_excel(
    dataset.efsa.file.path, sheet_name=efsa_sheet,
    nrows=1, header=None).iloc[0, 0]
# The actual data table starts below four preamble rows (header=4).
dataset.efsa.load(sheet_name=efsa_sheet, header=4)
# f-string instead of str.format, consistent with the rest of the project.
dataset.verbose(
    1,
    f'Input file : {dataset.efsa.file.path}; '
    f'{efsa_version}; {dataset.efsa.properties}')
#
# Also reading the ProcStudies Evaluation
efsa_procstudies = pd.read_excel(
    dataset.efsa.file.path, sheet_name=1)
# ... and the References
dataset.references.sheet = pd.read_excel(
    dataset.efsa.file.path, sheet_name=3)
#############################################################################
# Phase 2. Processing the data.
# Try to think SQL-wise or vector-wise about the data,
# but object-wise about the code
# We use the input Excel sheet as an SQL table, and supplement it
# until we have additional columns with all necessary data.
#
#
# Here we'll left join with both tables to supplement the original sheet.
# Then we have all data in one single dataframe (table).
efsa_combined = dataset.efsa.sheet.merge(
    # Left join on all the rows from the EFSA sheet
    # that have a Keyfacets Code in dataset.processing_translation.sheet
    dataset.processing_translation.sheet, left_on='KeyFacets Code',
    right_on='FromFC', how='left').merge(
        # Left join with both FoodEx2 and Matrix code
        # on the food_translation file
        dataset.food_translation.sheet,
        left_on=['Matrix FoodEx2 Code', 'Matrix Code'],
        right_on=['FromFX', 'FXToRpc'], how='left').assign(
    )
# First let's copy the columns which we want in the output unaltered so far
# NOTE(review): .mcra is presumably a custom DataFrame accessor registered
# by the dataconversion module -- confirm there.
efsa_combined.mcra.copycolumn({'ParamName Active Substance': 'SubstanceName',
                               'Matrix Code': 'idFoodUnProcessed',
                               'Raw Primary Commodity': 'FoodUnprocessedName',
                               'Median PF': 'Nominal'})
#
# Then let's add columns which will be empty
# so to be able to create a proper output file
efsa_combined.mcra.addcolumn({'Upper',
                              'NominalUncertaintyUpper',
                              'UpperUncertaintyUpper'})
#
# Now let's work on creating the rest of the fields (the hard labour)
#
# idProcessingType
#
# If 'FCToProcType' contains a value, then make a new field
# 'idProcessingType', with the value from 'FCToProcType'
efsa_combined.loc[
    (efsa_combined['FCToProcType'].notna()),
    'idProcessingType'] = efsa_combined['FCToProcType']
# If 'FCToProcType' does not contain a value and 'FXToProcType' does
# then make a new field, 'idProcessingType'
# with the value from 'FXToProcType'
# (rows matching neither translation keep idProcessingType as NaN)
efsa_combined.loc[
    (efsa_combined['FCToProcType'].isna()
     & efsa_combined['FXToProcType'].notna()),
    'idProcessingType'] = efsa_combined['FXToProcType']
#
# See whether we can do something with FoodSubstances
if dataset.substance_translation.sheet is None:
# No substance translation? Just copy column
efsa_combined.mcra.copycolumn(
{'ParamCode Active Substance': 'idSubstance'})
else:
# Strip dash (-) from the CASNumber column
dataset.substance_translation.sheet['CASNumber'].replace(
'-', '', regex=True, inplace=True)
# Do a left join
efsa_combined = efsa_combined.merge(
# Left join with processing type sheet,
dataset.substance_translation.sheet,