Commit f0f08a46 authored by Hans van den Heuvel's avatar Hans van den Heuvel

Fixed ProcessingFactors dir

parent 74a7ea2f
## Ignore default output files generated by the script.
# User-specific files
ProcessingFactors.zip
ProcessingFactors.csv
ProcessingFactors.xlsx
Mismatches.csv
Report.md
debug_dump_file.xlsx
EU_Processing_Factors_db_P.xlsx.xlsx
# Dirs
Input/
Output/
Build/
__pycache__/
\ No newline at end of file
#!/usr/bin/python
__version_info__ = ('1', '0', '0')
__version__ = '.'.join(__version_info__)
#############################################################################
# Phase 0. Initialization
# Doing stuff like parsing arguments and reading the files.
#
import mcra
from dataconversion import DataSet, PY_INDENT, thisyear
import pandas as pd
from datetime import datetime
import textwrap
import os
# Small utility to format text as a Markdown hyperlink :-)
def print_as_link(text):
return '[{text}]({text})'.format(text=text)
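# For example: print_as_link('Report.md') returns '[Report.md](Report.md)'.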
# These are the files we work with
# Create list
dataset = mcra.DataSet(
opening='(c) ' + datetime.now().strftime('%Y')
dataset = DataSet(
opening='(c) ' + thisyear
+ ' Biometris, Wageningen University and Research.',
description='Converts the EFSA Zenodo Excel sheet into an MCRA '
+ 'conforming format, using some external translation files.',
epilog='For example: use %(prog)s -v -x for a verbose example.')
epilog='For example: use %(prog)s -v -x for a verbose example.',
version=__version__)
#
#
efsa_url = 'https://zenodo.org/record/1488653/files/' \
......@@ -92,13 +95,12 @@ dataset.add(
short_argument='-o',
help='The (output) processing factor file - '
+ 'format: csv (Comma Separated).',
default_name='ProcessingFactors.zip',
# default_name='ProcessingFactors.csv',
default_name='ProcessingFactors.csv',
default_dir='Output')
#
dataset.add(
name='report',
default_name='Report.xlsx',
name='mismatches',
default_name='Mismatches.xlsx',
default_dir='Output')
#
dataset.add(
......@@ -273,13 +275,11 @@ dataset.processing_factor.sheet = efsa_combined[
efsa_combined['FXToProcType'].notna()) &
efsa_combined['idSubstance'].notna()][header]
#
# Writing output file
dataset.processing_factor.close()
# In case of debugging, just dump the sheet we've been working on.
if dataset.args.verbosity > 3:
efsa_combined.mcra.dump(os.path.join(
dataset.report.file.directory, 'dump.xlsx'))
dataset.mismatches.file.directory, 'dump.xlsx'))
#############################################################################
# Phase 4. Report about the data.
......@@ -328,7 +328,7 @@ mismatch_table_string = report_sheet[
header = ['Matrix FoodEx2 Code', 'Matrix Code Interpreted', 'Matrix Code',
'KeyFacets Code', 'KeyFacets Interpreted',
'Number of Matrix FoodEx2 Codes', 'Number of KeyFacets Codes']
dataset.report.sheet = report_sheet[header]
dataset.mismatches.sheet = report_sheet[header]
#
# We also need some further text reporting:
# Let's make a new column of the combination
......@@ -356,7 +356,7 @@ double_types = mismatch_table.groupby(
# report_sheet = mismatch_table.groupby(
# ['KeyFacets Code', 'Matrix FoodEx2 Code']).size()
# #
dataset.report.report = r'''
dataset.mismatches.report = r'''
CONVERSION REPORT FOR EFSA system FoodEx 2 EXCEL SHEET
------------------------------------------------------
......@@ -369,16 +369,16 @@ Conversion run details
for data in dataset:
if data.direction == 'Input':
dataset.report.report = textwrap.indent(data.report, mcra.PY_INDENT)
dataset.mismatches.report = textwrap.indent(data.report, PY_INDENT)
for datasetname in dataset.list:
# Bit of a hack, figure out later how this can be properly done.
if getattr(dataset, datasetname).direction == 'Output' \
and datasetname != 'report':
dataset.report.report = textwrap.indent(
getattr(dataset, datasetname).report, mcra.PY_INDENT)
dataset.mismatches.report = textwrap.indent(
getattr(dataset, datasetname).report, PY_INDENT)
dataset.report.report = r'''
dataset.mismatches.report = r'''
EFSA Excel input details
========================
......@@ -425,15 +425,5 @@ Substance conversion duplicates
'''+double_types.to_markdown(index=False)+r'''
'''
dataset.references.close()
dataset.report.save()
# Save this also to the dataset sheet.
with pd.ExcelWriter(dataset.report.file.path, mode='a') as writer:
double_types.to_excel(
writer,
index=False,
sheet_name='Substances')
dataset.report.close(auto_report=False, also_save=False)
dataset.close()
dataset.close(file_report=True)
#!/usr/bin/python
__version_info__ = ('1', '0', '0')
__version__ = '.'.join(__version_info__)
#############################################################################
# Phase 0. Initialization
# Doing stuff like parsing arguments and reading the files.
......@@ -14,7 +18,8 @@ dataset = DataSet(
' Biometris, Wageningen University and Research.',
description='Creates an MCRA dataset from the ' +
'Processing Factors database on EFSA Zenodo.',
epilog='For example: use %(prog)s -v -x for a verbose example.')
epilog='For example: use %(prog)s -v -x for a verbose example.',
version=__version__)
#
# URL source file
......@@ -118,11 +123,11 @@ efsa = efsa.merge(
dataset.processing_factor.sheet = efsa[efsa["idProcessingType"] != "-"]
# Before we can use data from the output files (e.g. hash), we first save it
dataset.save()
# dataset.save()
#############################################################################
# Phase 3. Report about the data.
dataset.report = r'''CONVERSION REPORT FOR EFSA system FoodEx 2 EXCEL SHEET
report = r'''CONVERSION REPORT FOR EFSA system FoodEx 2 EXCEL SHEET
------------------------------------------------------
* Script: '''+dataset.scriptname+r'''
......@@ -133,9 +138,9 @@ dataset.report = r'''CONVERSION REPORT FOR EFSA system FoodEx 2 EXCEL SHEET
'''
for data in dataset:
if data.direction == 'Output':
dataset.report += textwrap.indent(data.get_report(), PY_INDENT)
report += textwrap.indent(data.get_report(), PY_INDENT)
dataset.report += r'''
report += r'''
EFSA Excel input details
========================
......@@ -143,7 +148,13 @@ EFSA Excel input details
'''
for data in dataset:
if data.direction == 'Input':
dataset.report += data.get_report()
report += data.get_report()
#
# Writing everything that's left now
dataset.close()
# Here's a self-generated report
# dataset.report = report
# dataset.close()
# Here's an auto-generated report
# Uncomment lines 121, 149 and 150, and comment the line below
# to get the original report back.
dataset.close(file_report=True)
# Ignore everything in this directory
*
# Except this file
!.gitignore
# The default files
!FoodCompositions.xlsx
!FoodTranslations.csv
!ProcessingTypes.csv
!ProcTypeTranslations.csv
!SubstanceTranslations.csv
\ No newline at end of file
......@@ -15,6 +15,10 @@ import math
import sys
import textwrap
import getpass
import re
__version_info__ = ('0', '9', '2')
__version__ = '.'.join(__version_info__)
# For debugging purposes
# from objbrowser import browse
......@@ -114,6 +118,16 @@ class McraAccessor:
elif ext == '.xlsx':
self._obj.to_excel(filename, sheet_name='Dump', index=False)
def dup_reggroups(self, column, regex):
    '''
    Duplicates a row for every regex match found in a column,
    replacing the column value with the captured group of each match.
    '''
    temp_col = column + '__temp__'
    # One row per regex match, indexed by (original row, match number)
    dups = self._obj[column].str.extractall(regex)
    dups[temp_col] = dups.values.tolist()
    # Drop the match level so we can join back on the original row index
    dups = dups.reset_index(level=[1])
    self._obj = self._obj.join(
        dups[temp_col]).explode(temp_col).reset_index(drop=True)
    # Where a match was found, replace the original value
    self._obj.loc[self._obj[temp_col].notna(), column] = self._obj[temp_col]
    self._obj.drop(columns=temp_col, inplace=True)
    return self._obj
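# A minimal sketch (made-up data) of what dup_reggroups does:
#   df = pd.DataFrame({'code': ['P01/P02', 'P03']})
#   df = df.mcra.dup_reggroups('code', r'(P\d+)')
#   # the 'P01/P02' row is duplicated into two rows, code 'P01' and 'P02'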
class DataFile:
'''
......@@ -251,10 +265,8 @@ class DataSheet:
report = ''
if self.direction == 'Input' or \
(self.direction == 'Output' and self.closed):
report += '* {dir} file: {file}\n'.format(
dir=self.direction, file=os.path.split(self.file.path)[1])
report += textwrap.indent(
'* [{path}]({path})\n'.format(path=self.file.path), PY_INDENT)
filename = os.path.split(self.file.path)[1]
report += f'* {self.direction} file: [{filename}]({filename})\n'
report += textwrap.indent(
'* {props}\n'.format(props=self.properties), PY_INDENT)
report += textwrap.indent(
......@@ -360,44 +372,64 @@ class DataSheet:
# # Save report
# with open(self.file.reportpath, 'w+') as f:
# f.write(self.report)
print('Output file: {file}; {props}'.format(
file=self.file.path,
props=self.properties))
if '-v' in sys.argv or '--verbosity' in sys.argv:
print(f'Output file: {self.file.path}; {self.properties}')
class DataSet:
def __init__(self, opening=None, description=None, epilog=None):
def __init__(self, opening=None, description=None,
epilog=None, version=False):
self.args = None
self.list = []
# Whether or not to create a zip file
self.zip = None
# The report for the entire dataset
self.report = ''
self.runtime = datetime.now().strftime('%H:%M:%S, %d %b %Y')
self.runcommand = ' '.join(sys.argv)
self.runarguments = ' '.join(sys.argv[1:])
self.usedarguments = None
self.scriptname = os.path.split(sys.argv[0])[1]
md5_hash = hashlib.md5()
with open(sys.argv[0], "rb") as f:
# Read and update hash in chunks of 4K
for byte_block in iter(lambda: f.read(4096), b""):
md5_hash.update(byte_block)
self.scripthash = md5_hash.hexdigest()
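# The script's own hash is included in the report later on, tying each
# run to an exact version of the script.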
m = re.match(r'(.*)-(?P<noun>.*)\.py', self.scriptname)
if m:
    self.scriptnoun = m.group('noun')
else:
    self.scriptnoun = self.scriptname.replace('.py', '')
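# Illustration: a script named 'convert-ProcessingFactors.py' would get
# scriptnoun 'ProcessingFactors'; a plain 'script.py' falls back to 'script'.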
self.runuser = getpass.getuser()
self.parser = ArgumentParser(
description=description, epilog=epilog)
report = 'Output\\Report.md'
self.parser.add_argument(
'-r', '--report', nargs='?',
const='Output\\Report.md',
const=report,
default=report,
help='Creates a report file (default: %(const)s).')
# The verbosity argument will accept -v, -vv, -vvv, etc.
# Set default to 1, so that basic output will always appear.
self.parser.add_argument(
'-v', '--verbosity', help="Show verbose output",
action="count", default=1)
self.parser.add_argument(
'-x', '--example', action='store_const', const='Example',
help='Uses input files from the %(const)s subdir.')
action="count", default=0)
# self.parser.add_argument(
# '-x', '--example', action='store_const', const='Example',
# help='Uses input files from the %(const)s subdir.')
if version:
self.version = version
else:
self.version = __version__
zip = f'Build\\{self.scriptnoun}.{self.version}.zip'
self.parser.add_argument(
'-z', '--zip', nargs='?', const='Output\\Output.zip',
'-z', '--zip', nargs='?', const=zip, default=zip,
help='Creates a zip file %(const)s containing all output.' +
' (default: %(const)s).')
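# Illustration (assumed values): with scriptnoun 'foo' and version '1.0.0',
# output is zipped into Build\foo.1.0.0.zip whether or not -z is given.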
if '-v' in sys.argv or '--verbosity' in sys.argv:
print(opening)
self.list = []
# Whether or not to create a zip file
self.zip = None
# The report for the entire dataset
self.report = ''
self.runtime = datetime.now().strftime('%H:%M:%S, %d %b %Y')
self.runcommand = ' '.join(sys.argv)
self.runarguments = ' '.join(sys.argv[1:])
self.scriptname = os.path.split(sys.argv[0])[1]
self.runuser = getpass.getuser()
# It is useful to be able to iterate over all the datasheets.
# Basically, avoid using .list in all DataSet references.
......@@ -466,6 +498,7 @@ class DataSet:
def init(self):
# Initializes the command line parameters
self.args = self.parser.parse_args()
self.usedarguments = self.args.__dict__
for datasetname in self.list:
dataset = getattr(self, datasetname)
......@@ -473,15 +506,11 @@ class DataSet:
datasetfilename = dataset.file.default_name
else:
datasetfilename = getattr(self.args, datasetname+'_file')
self.usedarguments[datasetname+'_file'] = datasetfilename
if dataset.direction == 'Input':
if self.args.example:
dataset.file.suggest(
datasetfilename,
force_dir=self.args.example)
else:
dataset.file.suggest(
datasetfilename)
dataset.file.suggest(
datasetfilename)
if urlparse(dataset.file.suggested).netloc:
if (not dataset.file.exist) \
or ((dataset.checksum is not None)
......@@ -522,11 +551,22 @@ class DataSet:
base, ext = os.path.splitext(datasetfilename)
dataset.file.suggest(datasetfilename)
dataset.update_properties()
if self.args.zip:
# Create a zip file containing everything
basezip, extzip = os.path.splitext(self.args.zip)
# Setting self.zip indicates creating a zip file
self.zip = basezip+'.zip'
os.makedirs(os.path.dirname(
os.path.abspath(dataset.file.path)), exist_ok=True)
# Make sure we can create the report
os.makedirs(os.path.dirname(
os.path.abspath(self.args.report)), exist_ok=True)
# Always create a zip file containing everything
# First make sure the directory exists
zippath = os.path.dirname(os.path.abspath(self.args.zip))
os.makedirs(zippath, exist_ok=True)
basezip, extzip = os.path.splitext(self.args.zip)
# Setting self.zip indicates creating a zip file
self.zip = basezip+'.zip'
def save(self):
for data in self:
......@@ -545,11 +585,30 @@ class DataSet:
report_content += textwrap.indent(
f'* Command line: {self.runcommand}\n', PY_INDENT)
report_content += textwrap.indent(
f'* Arguments: {self.runarguments}\n', PY_INDENT)
f'* Filename: {self.scriptname}\n', PY_INDENT)
report_content += textwrap.indent(
f'* Command line arguments: {self.runarguments}\n', PY_INDENT)
report_content += textwrap.indent(
'* Arguments executed:\n', PY_INDENT)
for key, value in self.usedarguments.items():
if value is None:
report_content += textwrap.indent(
f'* --{key}\n', 2*PY_INDENT)
else:
report_content += textwrap.indent(
f'* --{key} {value}\n', 2*PY_INDENT)
report_content += textwrap.indent(
f'* Hash: {self.scripthash}\n', PY_INDENT)
report_content += textwrap.indent(
f'* Executed at: {self.runtime}\n', PY_INDENT)
report_content += textwrap.indent(
f'* Executed by: {self.runuser}\n', PY_INDENT)
report_content += textwrap.indent(
f'* Version: {self.version}\n', PY_INDENT)
report_content += textwrap.indent(
f'* Depends upon module: {__name__}\n', PY_INDENT)
report_content += textwrap.indent(
f'* With version: {__version__}\n', 2*PY_INDENT)
for data in self:
if data.direction == 'Input':
report_content += data.get_report()
......
from argparse import ArgumentParser, SUPPRESS
import pandas as pd
from datetime import datetime
from urllib.parse import urlparse
import os # path, mkdir, walk
import time # ctime
import types
import uuid
import zipfile
import requests
import hashlib
import numpy as np
import math
import sys
import textwrap
# For debugging purposes
# from objbrowser import browse
PY_INDENT = ' '
@pd.api.extensions.register_dataframe_accessor('mcra')
class McraAccessor:
'''
This is an extension of the pandas object model.
Some often-used functions are added here.
'''
def __init__(self, pandas_obj):
self._obj = pandas_obj
def here_concat(*args):
'''
To easily join two columns
'''
strs = [str(arg) for arg in args if not pd.isnull(arg)]
return '-'.join(strs) if strs else np.nan
self.concat = np.vectorize(here_concat)
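# For example (hypothetical columns): df['key'] = df.mcra.concat(
#     df['idFood'], df['idProc'])  # 'F123-P01' per row; NaN parts are skipped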
def copycolumn(self, columnnames):
'''
To easily copy a bunch of columns
'''
for fromcol, tocol in columnnames.items():
self._obj[tocol] = self._obj[fromcol]
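# e.g. (hypothetical names) df.mcra.copycolumn({'idFood': 'idFoodOrig'})
# adds a duplicate of 'idFood' named 'idFoodOrig', in place.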
def addcolumn(self, columnnames):
'''
To easily add a bunch of empty columns
'''
for col in columnnames:
self._obj[col] = ''
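# e.g. df.mcra.addcolumn(['Remark', 'Comment']) adds two empty text columns
# (the column names here are illustrative only).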
def splitjoin(self, name, split, join,
split_sep='-', right_split=True, join_sep='-'):
'''
Splits a column, and then joins the result with another column
'''
# Due to the SettingWithCopyWarning we do this in a somewhat roundabout way
df = pd.DataFrame()
df[join] = self._obj[join]
if right_split:
df[split] = self._obj[split].str.rsplit(split_sep, n=1).str[1]
else:
df[split] = self._obj[split].str.rsplit(split_sep, n=1).str[0]
df[name] = df.loc[:, (join, split)].apply(
    lambda x: join_sep.join(x.dropna()), axis=1)
df = df.drop([join, split], axis=1)
# Not ideal yet, but slightly better than it used to be....
self._obj = self._obj.merge(df, left_index=True, right_index=True)
return self._obj
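# Sketch with made-up values: if the split column holds 'A-B-C' and the join
# column holds 'X', right_split=True keeps 'C' (rsplit, n=1), so the new
# column gets 'X-C'. Reassign the result: df = df.mcra.splitjoin(...)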
def join(self, name, join_left, join_right, sep='-'):
'''
joins with another column
'''
# Due to the SettingWithCopyWarning we do this in a somewhat roundabout way
df = pd.DataFrame()
df[[join_left, join_right]] = self._obj[[join_left, join_right]]
df[name] = df.loc[:, (join_left, join_right)].apply(
lambda x: sep.join(x.dropna()), axis=1)
df = df.drop([join_left, join_right], axis=1)
# Not ideal yet, but slightly better than it used to be....
self._obj = self._obj.merge(df, left_index=True, right_index=True)
return self._obj
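# e.g. (hypothetical columns) df = df.mcra.join('idCombi', 'idFood',
#     'idProc') adds a column 'idCombi' holding 'F123-P01' per row.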
def dump(self, filename):
'''
For debugging purposes, to dump a frame from memory a bit more easily
'''
base, ext = os.path.splitext(filename)
print('Dump file : {file}.'.format(file=filename))
if ext == '.csv':
self._obj.to_csv(path_or_buf=filename, index=False)
elif ext == '.tsv':
self._obj.to_csv(path_or_buf=filename, index=False, sep='\t')
elif ext == '.xlsx':
self._obj.to_excel(filename, sheet_name='Dump', index=False)
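# e.g. df.mcra.dump('debug.xlsx') writes the frame to an Excel sheet 'Dump';
# a .csv or .tsv extension selects the corresponding text format instead.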
class DataFile:
'''
A class to work with the files in a more streamlined way.
Contains the technical details needed to use the files in a simple manner.
:param default_name: The default name for the file; it can also
determine the output name/sheet.
:param default_dir: The default directory in which to place the file
:param checksum: If given, a file can be checked/reused
:param necessary: Whether the file is necessary or not.
'''
def __init__(self, default_name, default_dir, necessary=True):
self.default_name = default_name
self.default_base = os.path.splitext(self.default_name)[0]
self.default_dir = default_dir
self.path = None
self.directory = None
self.reportpath = None
self.zippath = None
self.suggested = None
self.exist = False
self.modified = ''
self.extension = None
self.size = 0
self.size_string = ''
self.hash = ''
self.hash_short = ''
self.checksum = None
self.necessary = necessary
def update(self):
'''
Updates file properties, e.g. for output files.
'''
if os.path.exists(self.path) and os.path.isfile(self.path):
self.exist = True
self.modified = time.ctime(os.path.getmtime(self.path))
self.size = os.path.getsize(self.path)
self.size_string = self.__converttoprefix(self.size)
self.hash = str(self.__md5_hash())
self.hash_short = self.hash[0:8]
def __converttoprefix(self, bytes):
'''
Private function to have some nice formatting of file sizes
'''
if bytes <= 1024:
return '{0:.0f} B'.format(bytes)
else:
power = math.floor(math.log(bytes, 1024))
factor = math.pow(1024, power)