Commit a7d83e87 authored by Hans van den Heuvel

Substance translation incorporated.

parent 18f1e656
@@ -17,7 +17,7 @@ def print_as_link(text):
 dataset = mcra.DataSet(
     description='Converts the EFSA Zendono Excel sheet into an MCRA '
     + 'conforming format, using some external translation files.',
-    epilog='For example: use %(prog)s -v for verbose output.')
+    epilog='For example: use %(prog)s -v -x for a verbose example.')
 #
 #
 efsa_url = 'https://zenodo.org/record/1488653/files/' \

@@ -62,13 +62,24 @@ dataset.add(
     default_dir='Input',
     direction='Input')
 #
+dataset.add(
+    name='substance_translation',
+    short_argument='-s',
+    help='The (input) substance translation file - '
+    + 'format: csv (Comma Seperated). (default: %(default)s)',
+    default_name='SubstanceTranslations.tsv',
+    necessary=False,
+    default_dir='Input',
+    direction='Input')
+#
 # The output files
 dataset.add(
     name='processing_factor',
     short_argument='-o',
     help='The (output) processing factor file - '
     + 'format: csv (Comma Seperated). (default: %(default)s)',
-    default_name='ProcessingFactors.zip',
+    # default_name='ProcessingFactors.zip',
+    default_name='ProcessingFactors.csv',
     default_dir='Output')
 #
 dataset.add(
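
Note: for readers unfamiliar with how mcra.DataSet wires these declarations up, here is a minimal stand-alone argparse sketch of the new optional -s argument; the argument name and default below are illustrative, not the project's actual internals:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-s', '--substance_translation_file',
        default='Input/SubstanceTranslations.tsv',
        help='The (input) substance translation file (default: %(default)s)')
    # Omitting -s falls back to the default path; the loader may then find
    # no file at all, which the new necessary=False flag tolerates.
    args = parser.parse_args([])
    print(args.substance_translation_file)  # Input/SubstanceTranslations.tsv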

@@ -124,8 +135,7 @@ efsa_combined = dataset.efsa.sheet.merge(
 # First let's copy the columns which we want in the output unaltered
 # So this is a copy FROM FIELD : TO FIELD
-efsa_combined.mcra.copycolumn({'ParamCode Active Substance': 'idSubstance',
-                               'ParamName Active Substance': 'SubstanceName',
+efsa_combined.mcra.copycolumn({'ParamName Active Substance': 'SubstanceName',
                                'Matrix Code': 'idFoodUnProcessed',
                                'Raw Primary Commodity': 'FoodUnprocessedName',
                                'Median PF': 'Nominal'})

@@ -154,6 +164,26 @@ efsa_combined.loc[
     'idProcessingType'] = efsa_combined['FXToProcType']
 #
+# See whether we can do something with FoodSubstances
+if dataset.substance_translation.sheet is None:
+    # No substance translation? Just copy column
+    efsa_combined.mcra.copycolumn(
+        {'ParamCode Active Substance': 'idSubstance'})
+else:
+    # Strip dash (-) from the CASNumber column
+    dataset.substance_translation.sheet['CASNumber'].replace(
+        '-', '', regex=True, inplace=True)
+    # Do a left join with the substance translation sheet
+    efsa_combined = efsa_combined.merge(
+        dataset.substance_translation.sheet,
+        left_on='ParamCode Active Substance', right_on='code',
+        how='left').assign()
+    # Copy CASNumber to idSubstance column
+    efsa_combined.mcra.copycolumn(
+        {'CASNumber': 'idSubstance'})
 ###############################################
 # Request by Waldo, please also add the description of the Processing Type.
 # So, again, a left join :-)
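
Note: the intent of the new block, as a runnable pandas sketch with made-up toy data (column names follow the diff; 'code' is the translation sheet's substance-code column):

    import pandas as pd

    efsa = pd.DataFrame(
        {'ParamCode Active Substance': ['RF-0001', 'RF-9999']})
    translation = pd.DataFrame(
        {'code': ['RF-0001'], 'CASNumber': ['50-00-0']})

    # Strip dashes from the CAS numbers, then left join on the code.
    translation['CASNumber'] = translation['CASNumber'].replace(
        '-', '', regex=True)
    merged = efsa.merge(translation, left_on='ParamCode Active Substance',
                        right_on='code', how='left')
    # RF-0001 resolves to CASNumber '50000'; RF-9999 gets NaN, and such
    # rows are dropped later by the idSubstance.notna() filter.
    print(merged)

Assigning the replace() result back, as in this sketch, also sidesteps the chained-assignment warnings that inplace=True on a column selection can raise in pandas.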

@@ -185,8 +215,9 @@ header = ['idProcessingType', 'idSubstance', 'SubstanceName',
           'UpperUncertaintyUpper', 'KeyFacets Interpreted',
           'Matrix Code Interpreted', 'MCRA_ProcessingType_Description']
 dataset.processing_factor.sheet = efsa_combined[
-    efsa_combined['FCToProcType'].notna() |
-    efsa_combined['FXToProcType'].notna()][header]
+    (efsa_combined['FCToProcType'].notna() |
+     efsa_combined['FXToProcType'].notna()) &
+    efsa_combined['idSubstance'].notna()][header]
 #
 # Writing output file
 dataset.processing_factor.save()
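
Note: the added parentheses are essential, not cosmetic: & binds tighter than | in Python, so without them the mask would parse as FC | (FX & idSubstance). A toy check of the new filter (values invented):

    import pandas as pd

    df = pd.DataFrame({'FCToProcType': ['T1', 'T2'],
                       'FXToProcType': [None, None],
                       'idSubstance': ['50000', None]})
    mask = ((df['FCToProcType'].notna() |
             df['FXToProcType'].notna()) &
            df['idSubstance'].notna())
    print(df[mask])  # Only the first row survives: it has an idSubstance.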

@@ -334,13 +365,17 @@ dataset.verbose(
 if dataset.processing_factor.file.extension == '.zip':
     # Now, let's sneak in the report before we save
-    dataset.processing_factor.add_file(dataset.report.file.path, 'README.md')
+    filename_in_zip = 'README.md'
+    dataset.processing_factor.add_file(
+        dataset.report.file.path, filename_in_zip)
     dataset.processing_factor.close()
     dataset.verbose(
-        1, 'Output file: {file}; {props} {report} enclosed in zipfile.'.format(
-            file=dataset.processing_factor.file.path,
-            props=dataset.processing_factor.properties,
-            report=dataset.report.file.path))
+        1,
+        'Output file: {f}; {p} {r} enclosed in zipfile as {zf}.'.format(
+            f=dataset.processing_factor.file.path,
+            p=dataset.processing_factor.properties,
+            r=dataset.report.file.path,
+            zf=filename_in_zip))
 else:
     dataset.processing_factor.close()
     dataset.verbose(
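
Note: add_file is the project's own helper; the underlying idea, sketched with the standard zipfile module (paths are hypothetical, and the report file must already exist):

    import zipfile

    filename_in_zip = 'README.md'
    with zipfile.ZipFile('Output/ProcessingFactors.zip', 'a') as zf:
        # Store the generated report inside the zip under a fixed name.
        zf.write('Output/Report.md', arcname=filename_in_zip)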
Source diff for one changed file could not be displayed: it is too large.

@@ -39,7 +39,8 @@ class McraAccessor:
 # A class to work with the files more streamlined.
 # Contains technical details just to use the files in a simple manner.
 class DataFile:
-    def __init__(self, default_name, default_dir, checksum=None):
+    def __init__(self, default_name, default_dir, checksum=None,
+                 necessary=True):
         self.default_name = default_name
         self.default_base = os.path.splitext(self.default_name)[0]
         self.default_dir = default_dir

@@ -54,6 +55,7 @@ class DataFile:
         self.hash = ''
         self.hash_short = ''
         self.checksum = checksum
+        self.necessary = necessary

     def update(self):
         # Updates file properties, e.g. for output files.

@@ -141,9 +143,11 @@ class DataFile:
 class DataSheet:
     # This is just a container for file properties and the pandas sheet.
     def __init__(self, default_name, default_dir,
-                 checksum=None, direction='Output', autoload=True):
+                 checksum=None, direction='Output', autoload=True,
+                 necessary=True):
         self.file = DataFile(
-            default_name=default_name, default_dir=default_dir)
+            default_name=default_name, default_dir=default_dir,
+            necessary=necessary)
         self.sheet = None
         self.type = None
         self.checksum = checksum

@@ -153,6 +157,7 @@ class DataSheet:
         self.direction = direction
         self.autoload = autoload
+
     def update_properties(self):
         # Return some statistics about the dataframe and file as a string
         if self.type == 'pandas':

@@ -273,6 +278,7 @@ class DataSheet:
         shutil.rmtree(self.file.zippath)
         self.file.zippath = None
         self.file.update()
+        self.update_properties()

     def add_file(self, path, to=None):
         # Adds file to zip dir.

@@ -318,12 +324,13 @@ class DataSet:
     # Container for all the files
     def add(self, name, default_name, default_dir, short_argument=None,
             checksum=None, help=SUPPRESS, direction='Output', autoload=True,
-            **kwargs):
+            necessary=True, **kwargs):
         if getattr(self, name, None) is None:
             # Create a new sheet with a different name
             setattr(self, name, DataSheet(default_name=default_name,
                     default_dir=default_dir, direction=direction,
-                    checksum=checksum, autoload=autoload, **kwargs))
+                    checksum=checksum, autoload=autoload,
+                    necessary=necessary, **kwargs))
             # Add to list
             self.list.append(name)
         # Also set-up arguments if necessary
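
Note: taken together, these hunks thread a single flag from DataSet.add down to DataFile. A stripped-down stand-alone sketch of the pattern (not the real mcra classes):

    class DataFile:
        def __init__(self, name, necessary=True):
            self.name = name
            self.necessary = necessary

    class DataSheet:
        def __init__(self, name, necessary=True):
            self.file = DataFile(name, necessary=necessary)

    class DataSet:
        def add(self, name, necessary=True):
            setattr(self, name, DataSheet(name, necessary=necessary))

    ds = DataSet()
    ds.add('substance_translation', necessary=False)
    # The loader can now skip the 'not found' warning for optional inputs.
    print(ds.substance_translation.file.necessary)  # False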

@@ -389,8 +396,9 @@ class DataSet:
                 print('File {file} has improper checksum'.format(
                     file=dataset.file.path))
             else:
-                print('File {file} not found.'.format(
-                    file=dataset.file.path))
+                if dataset.file.necessary:
+                    print('File {file} not found.'.format(
+                        file=dataset.file.path))
             # What kind of dataset are we dealing with?
             if dataset.file.extension == '.md':
                 dataset.type = 'markdown'

@@ -399,11 +407,12 @@ class DataSet:
             #
             if dataset.autoload:
                 dataset.load()
-                self.verbose(1, 'Input file : {file}; {props}'.format(
-                    file=dataset.file.path,
-                    props=dataset.properties))
-                # High verbosity, dump data.
-                self.verbose(3, dataset.sheet)
+                if dataset.file.exist:
+                    self.verbose(1, 'Input file : {file}; {props}'.format(
+                        file=dataset.file.path,
+                        props=dataset.properties))
+                    # High verbosity, dump data.
+                    self.verbose(3, dataset.sheet)
         else:
             # How to initialize other files
             dataset.file.suggest(getattr(self.args, datasetname+'_file'))
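
Note: with the new guard, verbose reporting for inputs only happens when the file is actually present; DataFile presumably records an exist attribute when probing the path. A minimal illustration of the pattern (class and path are hypothetical):

    import os

    class ProbedFile:
        def __init__(self, path):
            self.path = path
            self.exist = os.path.isfile(path)

    f = ProbedFile('Input/SubstanceTranslations.tsv')
    if f.exist:
        print('Input file :', f.path)
    # No output (and no crash) when the optional file is absent.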

@@ -412,6 +421,4 @@ class DataSet:
             else:
                 dataset.type = 'pandas'
             dataset.update_properties()
-            self.verbose(3, 'Output file: {file}; {props}'.format(
-                file=dataset.file.path, props=dataset.properties))