Commit a7d83e87 authored by Hans van den Heuvel

Substance translation incorporated.

parent 18f1e656
@@ -17,7 +17,7 @@ def print_as_link(text):
dataset = mcra.DataSet(
description='Converts the EFSA Zenodo Excel sheet into an MCRA '
+ 'conforming format, using some external translation files.',
- epilog='For example: use %(prog)s -v for verbose output.')
+ epilog='For example: use %(prog)s -v -x for a verbose example.')
#
#
efsa_url = 'https://zenodo.org/record/1488653/files/' \
@@ -62,13 +62,24 @@ dataset.add(
default_dir='Input',
direction='Input')
#
+ dataset.add(
+ name='substance_translation',
+ short_argument='-s',
+ help='The (input) substance translation file - '
+ + 'format: tsv (Tab Separated). (default: %(default)s)',
+ default_name='SubstanceTranslations.tsv',
+ necessary=False,
+ default_dir='Input',
+ direction='Input')
#
# The output files
dataset.add(
name='processing_factor',
short_argument='-o',
help='The (output) processing factor file - '
+ 'format: csv (Comma Separated). (default: %(default)s)',
- default_name='ProcessingFactors.zip',
+ # default_name='ProcessingFactors.zip',
+ default_name='ProcessingFactors.csv',
default_dir='Output')
#
dataset.add(
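With the optional substance translation input added above, a typical run might be invoked like this (the script name is hypothetical; the -v and -s flags are the ones defined in the diff):

    python convert.py -v -s Input/SubstanceTranslations.tsv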
@@ -124,8 +135,7 @@ efsa_combined = dataset.efsa.sheet.merge(
# First let's copy the columns which we want in the output unaltered
# So this is a copy FROM FIELD : TO FIELD
- efsa_combined.mcra.copycolumn({'ParamCode Active Substance': 'idSubstance',
- 'ParamName Active Substance': 'SubstanceName',
+ efsa_combined.mcra.copycolumn({'ParamName Active Substance': 'SubstanceName',
'Matrix Code': 'idFoodUnProcessed',
'Raw Primary Commodity': 'FoodUnprocessedName',
'Median PF': 'Nominal'})
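The df.mcra.copycolumn calls use a custom pandas DataFrame accessor (the McraAccessor class changed further down in this commit). A minimal sketch of how such an accessor could provide copycolumn, assuming the standard register_dataframe_accessor mechanism; the real mcra module is only partially shown here and may differ:

    import pandas as pd

    @pd.api.extensions.register_dataframe_accessor('mcra')
    class McraAccessor:
        def __init__(self, pandas_obj):
            self._obj = pandas_obj

        def copycolumn(self, mapping):
            # Copy each FROM column to its TO column, e.g. {'Median PF': 'Nominal'}
            # duplicates the 'Median PF' column under the name 'Nominal'.
            for source, target in mapping.items():
                self._obj[target] = self._obj[source]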
@@ -154,6 +164,26 @@ efsa_combined.loc[
'idProcessingType'] = efsa_combined['FXToProcType']
#
# See whether we can do something with FoodSubstances
+ if dataset.substance_translation.sheet is None:
+ # No substance translation? Just copy the column
+ efsa_combined.mcra.copycolumn(
+ {'ParamCode Active Substance': 'idSubstance'})
+ else:
+ # Strip dash (-) from the CASNumber column
+ dataset.substance_translation.sheet['CASNumber'].replace(
+ '-', '', regex=True, inplace=True)
+ # Do a left join
+ efsa_combined = efsa_combined.merge(
+ # Left join with the substance translation sheet
+ dataset.substance_translation.sheet,
+ left_on='ParamCode Active Substance', right_on='code',
+ how='left')
+ # Copy CASNumber to idSubstance column
+ efsa_combined.mcra.copycolumn(
+ {'CASNumber': 'idSubstance'})
###############################################
# Request by Waldo, please also add the description of the Processing Type.
# So, again, a left join :-)
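Stripped of the surrounding plumbing, the new translation branch amounts to normalising CAS numbers and left-joining on the EFSA substance code. A self-contained sketch with made-up sample rows (column names taken from the diff):

    import pandas as pd

    efsa = pd.DataFrame({'ParamCode Active Substance': ['RF-0010-001-PPP',
                                                        'RF-9999-999-PPP']})
    translation = pd.DataFrame({'code': ['RF-0010-001-PPP'],
                                'CASNumber': ['1912-24-9']})
    # Strip the dashes so CAS numbers are stored in a uniform bare format
    translation['CASNumber'] = translation['CASNumber'].str.replace('-', '')
    # Left join: every EFSA row is kept; codes without a translation get NaN
    merged = efsa.merge(translation, left_on='ParamCode Active Substance',
                        right_on='code', how='left')
    merged['idSubstance'] = merged['CASNumber']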
@@ -185,8 +215,9 @@ header = ['idProcessingType', 'idSubstance', 'SubstanceName',
'UpperUncertaintyUpper', 'KeyFacets Interpreted',
'Matrix Code Interpreted', 'MCRA_ProcessingType_Description']
dataset.processing_factor.sheet = efsa_combined[
- efsa_combined['FCToProcType'].notna() |
- efsa_combined['FXToProcType'].notna()][header]
+ (efsa_combined['FCToProcType'].notna() |
+ efsa_combined['FXToProcType'].notna()) &
+ efsa_combined['idSubstance'].notna()][header]
#
# Writing output file
dataset.processing_factor.save()
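The widened selection now also drops rows whose idSubstance stayed empty after the join, i.e. substances without a CAS translation. The same filter reads a little easier with an intermediate mask (a sketch, not the committed code):

    has_proc_type = (efsa_combined['FCToProcType'].notna() |
                     efsa_combined['FXToProcType'].notna())
    has_substance = efsa_combined['idSubstance'].notna()
    dataset.processing_factor.sheet = efsa_combined[
        has_proc_type & has_substance][header]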
@@ -334,13 +365,17 @@ dataset.verbose(
if dataset.processing_factor.file.extension == '.zip':
# Now, let's sneak in the report before we save
- dataset.processing_factor.add_file(dataset.report.file.path, 'README.md')
+ filename_in_zip = 'README.md'
+ dataset.processing_factor.add_file(
+ dataset.report.file.path, filename_in_zip)
dataset.processing_factor.close()
dataset.verbose(
- 1, 'Output file: {file}; {props} {report} enclosed in zipfile.'.format(
- file=dataset.processing_factor.file.path,
- props=dataset.processing_factor.properties,
- report=dataset.report.file.path))
+ 1,
+ 'Output file: {f}; {p} {r} enclosed in zipfile as {zf}.'.format(
+ f=dataset.processing_factor.file.path,
+ p=dataset.processing_factor.properties,
+ r=dataset.report.file.path,
+ zf=filename_in_zip))
else:
dataset.processing_factor.close()
dataset.verbose(
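The add_file/close pair wraps standard zip handling; a bare-bones equivalent of sneaking the report into the archive under a fixed name could use Python's zipfile module directly (the paths here are illustrative):

    import zipfile

    with zipfile.ZipFile('Output/ProcessingFactors.zip', 'a') as archive:
        # Store the report inside the archive as README.md
        archive.write('Output/Report.md', arcname='README.md')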
Source diff could not be displayed: it is too large.
@@ -39,7 +39,8 @@ class McraAccessor:
# A class to work with the files more streamlined.
# Contains technical details just to use the files in a simple manner.
class DataFile:
- def __init__(self, default_name, default_dir, checksum=None):
+ def __init__(self, default_name, default_dir, checksum=None,
+ necessary=True):
self.default_name = default_name
self.default_base = os.path.splitext(self.default_name)[0]
self.default_dir = default_dir
@@ -54,6 +55,7 @@ class DataFile:
self.hash = ''
self.hash_short = ''
self.checksum = checksum
+ self.necessary = necessary
def update(self):
# Updates file properties, e.g. for output files.
@@ -141,9 +143,11 @@ class DataFile:
class DataSheet:
# This is just a container for file properties and the pandas sheet.
def __init__(self, default_name, default_dir,
- checksum=None, direction='Output', autoload=True):
+ checksum=None, direction='Output', autoload=True,
+ necessary=True):
self.file = DataFile(
- default_name=default_name, default_dir=default_dir)
+ default_name=default_name, default_dir=default_dir,
+ necessary=necessary)
self.sheet = None
self.type = None
self.checksum = checksum
@@ -153,6 +157,7 @@ class DataSheet:
self.direction = direction
self.autoload = autoload
def update_properties(self):
+ # Return some statistics about the dataframe and file as a string
if self.type == 'pandas':
@@ -273,6 +278,7 @@ class DataSheet:
shutil.rmtree(self.file.zippath)
self.file.zippath = None
self.file.update()
+ self.update_properties()
def add_file(self, path, to=None):
# Adds file to zip dir.
@@ -318,12 +324,13 @@ class DataSet:
# Container for all the files
def add(self, name, default_name, default_dir, short_argument=None,
checksum=None, help=SUPPRESS, direction='Output', autoload=True,
- **kwargs):
+ necessary=True, **kwargs):
if getattr(self, name, None) is None:
# Create a new sheet with a different name
setattr(self, name, DataSheet(default_name=default_name,
default_dir=default_dir, direction=direction,
- checksum=checksum, autoload=autoload, **kwargs))
+ checksum=checksum, autoload=autoload,
+ necessary=necessary, **kwargs))
# Add to list
self.list.append(name)
# Also set-up arguments if necessary
@@ -389,8 +396,9 @@ class DataSet:
print('File {file} has improper checksum'.format(
file=dataset.file.path))
else:
- print('File {file} not found.'.format(
- file=dataset.file.path))
+ if dataset.file.necessary:
+ print('File {file} not found.'.format(
+ file=dataset.file.path))
# What kind of dataset are we dealing with?
if dataset.file.extension == '.md':
dataset.type = 'markdown'
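Taken together, the new necessary flag lets callers declare an input as optional: a missing file no longer prints 'not found', and downstream code checks sheet is None instead. Hypothetical usage, mirroring the main script:

    dataset.add(
        name='substance_translation',
        default_name='SubstanceTranslations.tsv',
        default_dir='Input',
        direction='Input',
        necessary=False)  # this input may be absent without a warning

    if dataset.substance_translation.sheet is None:
        # Optional input not supplied; fall back to the untranslated codes
        ...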
@@ -399,11 +407,12 @@ class DataSet:
#
if dataset.autoload:
dataset.load()
- self.verbose(1, 'Input file : {file}; {props}'.format(
- file=dataset.file.path,
- props=dataset.properties))
- # High verbosity, dump data.
- self.verbose(3, dataset.sheet)
+ if dataset.file.exist:
+ self.verbose(1, 'Input file : {file}; {props}'.format(
+ file=dataset.file.path,
+ props=dataset.properties))
+ # High verbosity, dump data.
+ self.verbose(3, dataset.sheet)
else:
# How to initialize other files
dataset.file.suggest(getattr(self.args, datasetname+'_file'))
@@ -412,6 +421,4 @@
else:
dataset.type = 'pandas'
dataset.update_properties()
- self.verbose(3, 'Output file: {file}; {props}'.format(
- file=dataset.file.path, props=dataset.properties))