Commit 0434a490 authored by Hans van den Heuvel

Introduced proper documentation and improved readability of the code.

parent 194c4e29
#
#!/usr/bin/python
#############################################################################
# Phase 0. Initialization
# Doing stuff like parsing arguments, and reading the files.
@@ -16,9 +16,10 @@ def print_as_link(text):
# These are the files we work with
# Create list
dataset = mcra.DataSet(
-    opening='(c) 2020 Euromix, Biometris, WUR.',
+    opening='(c) ' + datetime.now().strftime('%Y')
+        + ' Biometris, Wageningen University and Research.',
    description='Converts the EFSA Zenodo Excel sheet into an MCRA '
        + 'conforming format, using some external translation files.',
    epilog='For example: use %(prog)s -v -x for a verbose example.')
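# (Illustrative invocation, not part of the commit; the script name is
# hypothetical, the flags come from mcra.DataSet below: -v/-vv raise the
# verbosity, -x switches inputs to the Example subdir, and every added
# sheet gets a --<name>_file override.)
#   python convert.py -vv -x --food_composition_file Input/FoodCompositions.xlsx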
#
#
@@ -81,8 +82,7 @@ dataset.add(
    default_name='FoodCompositions.xlsx',
    necessary=False,
    default_dir='Input',
-    direction='Input',
-    autoload=False)
+    direction='Input')
#
# The output files
dataset.add(
@@ -105,31 +105,21 @@ dataset.add(
    default_dir='Output')
#############################################################################
# Phase 1. Initialize the data
#
dataset.init()
#
# Manually load the EFSA sheet, because the data is in a non-trivial place
efsa_sheet = 2
# Find version of sheet.
efsa_version = pd.read_excel(
    dataset.efsa.file.path, sheet_name=efsa_sheet,
    nrows=1, header=None).iloc[0, 0]
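# (header=None with nrows=1 and .iloc[0, 0] simply grabs the sheet's
# top-left cell, which holds the version string.)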
# Load data
dataset.efsa.load(sheet_name=efsa_sheet, header=4)
dataset.verbose(1, 'Input file : {file}; {version}; {props}'.format(
    file=dataset.efsa.file.path,
    props=dataset.efsa.properties,
    version=efsa_version))
#
# Use this file only if called explicitly from the command line
# and of course, it has to exist. The -g is enough to trigger the default
if dataset.args.food_composition_file is not None \
        and dataset.food_composition.file.exist:
    dataset.food_composition.load(sheet_name='FoodTranslation')
    dataset.verbose(1, 'Input file : {file}; {props}'.format(
        file=dataset.food_composition.file.path,
        props=dataset.food_composition.properties))
#############################################################################
# Phase 2. Processing the data.
......
@@ -18,37 +18,51 @@ import sys
# from objbrowser import browse
# We want some additional functions, so let's extend the pandas object
# model to easily access these methods.
@pd.api.extensions.register_dataframe_accessor('mcra')
class McraAccessor:
+    '''
+    This is an extension of the pandas object model.
+    Some often used functions are added here.
+    '''
    def __init__(self, pandas_obj):
        self._obj = pandas_obj
-        # To easily join two columns
        def here_concat(*args):
+            '''
+            To easily join two columns
+            '''
            strs = [str(arg) for arg in args if not pd.isnull(arg)]
            return '-'.join(strs) if strs else np.nan
        self.concat = np.vectorize(here_concat)
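        # (Illustrative: self.concat('P01', 3) -> 'P01-3'; null arguments
        # are skipped, and an all-null input yields NaN.)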
-    # To easily copy a bunch of columns
    def copycolumn(self, columnnames):
+        '''
+        To easily copy a bunch of columns
+        '''
        for fromcol, tocol in columnnames.items():
            self._obj[tocol] = self._obj[fromcol]
-    # To easily add a bunch of empty columns
    def addcolumn(self, columnnames):
+        '''
+        To easily add a bunch of empty columns
+        '''
        for col in columnnames:
            self._obj[col] = ''
# End of class
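# Usage sketch (illustrative, not part of the module): once this module is
# imported, every DataFrame exposes the accessor:
#   df = pd.DataFrame({'code': ['P01'], 'old': ['x']})
#   df.mcra.copycolumn({'old': 'new'})     # copy column 'old' into 'new'
#   df.mcra.addcolumn(['remark'])          # add an empty column
#   df['joined'] = df.mcra.concat(df['code'], df['new'])   # 'P01-x'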
-# A class to work with the files more streamlined.
-# Contains technical details just to use the files in a simple manner.
class DataFile:
-    def __init__(self, default_name, default_dir, checksum=None,
-                 necessary=True):
+    '''
+    A class to work with the files in a more streamlined way.
+    Contains technical details, just to use the files in a simple manner.
+    :param default_name: The default name for the file; can also
+        determine the output name/sheet.
+    :param default_dir: The default directory in which to place the file.
+    :param necessary: Whether the file is necessary or not.
+    '''
+    def __init__(self, default_name, default_dir, necessary=True):
        self.default_name = default_name
        self.default_base = os.path.splitext(self.default_name)[0]
        self.default_dir = default_dir
@@ -62,11 +76,13 @@ class DataFile:
        self.size_string = ''
        self.hash = ''
        self.hash_short = ''
-        self.checksum = checksum
+        self.checksum = None
        self.necessary = necessary
    def update(self):
-        # Updates file properties, e.g. for output files.
+        '''
+        Updates file properties, e.g. for output files.
+        '''
        if os.path.exists(self.path) and os.path.isfile(self.path):
            self.exist = True
            self.modified = time.ctime(os.path.getmtime(self.path))
@@ -75,8 +91,10 @@ class DataFile:
            self.hash = str(self.__md5_hash())
            self.hash_short = self.hash[0:8]
-    # Private function to have some nice formatting of filesizes
    def __converttoprefix(self, bytes):
+        '''
+        Private function to have some nice formatting of filesizes
+        '''
        if bytes <= 1024:
            return '{0:.0f} B'.format(bytes)
        else:
@@ -93,8 +111,10 @@ class DataFile:
            return '{0:.0f} {prefix}'.format(
                bytes/factor, prefix=prefix[power])
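        # (e.g. 1536 bytes would come out roughly as '2 K...'; the actual
        # prefix strings live in the table elided above, so the exact unit
        # text here is an assumption.)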
-    # Returns an MD5 hash of the file; file will be processed
    def __md5_hash(self):
+        '''
+        Returns an MD5 hash of the file; file will be processed
+        '''
        md5_hash = hashlib.md5()
        with open(self.path, "rb") as f:
            # Read and update hash in chunks of 4K
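            # (A sketch of the loop elided below -- the standard chunked
            # pattern, not necessarily the verbatim original:
            #     for chunk in iter(lambda: f.read(4096), b''):
            #         md5_hash.update(chunk)
            # )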
@@ -103,66 +123,58 @@
        return md5_hash.hexdigest()
    def suggest(self, name, force_dir=None):
-        # This is the filename the user suggests on the command line.
-        # It has to be changed (perhaps) to a proper path
-        # e.g. if the user only gave a directory
+        '''
+        This is the filename the user suggests on the command line.
+        It has to be changed (perhaps) to a proper path,
+        e.g. if the user only gave a directory.
+        '''
        if self.suggested is None:
            self.suggested = name
        else:
            print('double assignment for '+name)
        if self.suggested is None:
            # No suggestion? Then set default
            self.path = os.path.join(
                self.default_dir, self.default_name)
        else:
            if urlparse(self.suggested).netloc:
                # Hmmm, you suggest a URL.
                # Put it in the default dir, and use the filename from the URL
+                urlbase, urlfilename = os.path.split(
+                    urlparse(self.suggested).path)
                self.path = os.path.join(
-                    self.default_dir,
-                    os.path.split(urlparse(self.suggested).path)[1])
+                    self.default_dir, urlfilename)
            else:
                # This should be a filename
                head, tail = os.path.split(self.suggested)
                if os.path.isdir(self.suggested):
                    # It is an explicitly given directory.
                    # Use that one, with the default filename
                    self.path = os.path.join(
                        self.suggested, self.default_name)
                elif tail == self.suggested:
                    # It is just a filename, no further paths involved.
                    # Use the default directory
                    self.path = os.path.join(
                        self.default_dir, self.suggested)
                else:
                    # No further messing around, just use it:
                    self.path = self.suggested
        if force_dir is not None:
            # Force into a certain dir
            head, tail = os.path.split(self.path)
            self.path = os.path.join(force_dir, tail)
        # Finally update all values
        base, ext = os.path.splitext(self.path)
        self.extension = ext
        self.update()
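    # (Illustrative resolutions, assuming default_dir='Input' and
    # default_name='Data.xlsx' -- both names hypothetical:
    #   suggest(None)                  -> Input/Data.xlsx
    #   suggest('other.xlsx')          -> Input/other.xlsx
    #   suggest('SomeDir')             -> SomeDir/Data.xlsx  (existing dir)
    #   suggest('https://host/f.xlsx') -> Input/f.xlsx
    #   suggest('Sub/f.xlsx')          -> Sub/f.xlsx)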
class DataSheet:
-    # This is just a container for file properties and the pandas sheet.
-    def __init__(self, default_name, default_dir,
-                 checksum=None, direction='Output', autoload=True,
-                 necessary=True):
-        self.file = DataFile(
-            default_name=default_name, default_dir=default_dir,
-            necessary=necessary)
+    '''
+    This is a container for file properties and the pandas sheet.
+    :param file: The DataFile object describing the underlying file.
+    :param checksum: If given, the file is verified against this MD5 hash
+        (and, for URLs, downloaded again on a mismatch).
+    :param direction: 'Input' files are processed at the beginning.
+    :param autoload: Whether init() should load the file automatically.
+    '''
+    def __init__(self, file, checksum=None, direction='Output',
+                 autoload=True):
+        self.file = file
        self.sheet = None
        self.type = None
        self.checksum = checksum
        self.properties = ''
        self.report = ''
        # What kind of file we're dealing with. 'Input' or else.
        # 'Input' files should be processed at the beginning.
        self.direction = direction
        self.autoload = autoload
@@ -192,7 +204,6 @@ class DataSheet:
        self.report = self.report+temp
    def update_properties(self):
        # Set some statistics about the dataframe and file as a string
        if self.type == 'pandas':
            if self.sheet is not None:
                shape = '[{rows} rows x {columns} columns]'.format(
@@ -207,22 +218,25 @@ class DataSheet:
            filesize=self.file.size_string,
            hash=self.file.hash_short)
+    @staticmethod
+    def supply_defaults(default, **kwargs):
+        '''
+        If not in the arguments, these will be defaults.
+        '''
+        for key, values in default.items():
+            if key not in kwargs:
+                kwargs[key] = values
+        return kwargs
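    # (Illustrative, not part of the commit: supply_defaults({'dtype': str},
    # sep=';') returns {'dtype': str, 'sep': ';'}, while a caller-supplied
    # dtype would win over the default.)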
    def load(self, **kwargs):
        # Loading file into dataframe
        if self.file.exist:
            if self.file.extension == '.csv':
                # Some defaults:
-                default_args = {'comment': '#', 'dtype': str}
-                for key, value in default_args.items():
-                    if key not in kwargs:
-                        kwargs[key] = value
+                kwargs = self.supply_defaults(
+                    {'comment': '#', 'dtype': str}, **kwargs)
                self.sheet = pd.read_csv(self.file.path, **kwargs)
            elif self.file.extension == '.tsv':
                # Some defaults:
-                default_args = {'comment': '#', 'dtype': str, 'sep': '\t'}
-                for key, value in default_args.items():
-                    if key not in kwargs:
-                        kwargs[key] = value
+                kwargs = self.supply_defaults(
+                    {'comment': '#', 'dtype': str, 'sep': '\t'}, **kwargs)
                self.sheet = pd.read_csv(self.file.path, **kwargs)
            elif self.file.extension == '.xlsx':
                self.sheet = pd.read_excel(self.file.path, **kwargs)
@@ -237,37 +251,27 @@ class DataSheet:
        self.make_report()
    def save(self, **kwargs):
        # Saves sheet into file
        if self.file.extension == '.csv':
            if self.type is None:
                self.type = 'pandas'
            # Some defaults:
-            default_args = {'index': False}
-            for key, value in default_args.items():
-                if key not in kwargs:
-                    kwargs[key] = value
+            kwargs = self.supply_defaults(
+                {'index': False}, **kwargs)
            self.sheet.to_csv(
                path_or_buf=self.file.path,
                **kwargs)
        elif self.file.extension == '.tsv':
            if self.type is None:
                self.type = 'pandas'
            # Some defaults:
-            default_args = {'index': False, 'sep': '\t'}
-            for key, value in default_args.items():
-                if key not in kwargs:
-                    kwargs[key] = value
+            kwargs = self.supply_defaults(
+                {'index': False, 'sep': '\t'}, **kwargs)
            self.sheet.to_csv(
                path_or_buf=self.file.path,
                **kwargs)
        elif self.file.extension == '.xlsx':
            if self.type is None:
                self.type = 'pandas'
            # Some defaults:
-            default_args = {'index': False}
-            for key, value in default_args.items():
-                if key not in kwargs:
-                    kwargs[key] = value
+            kwargs = self.supply_defaults(
+                {'index': False}, **kwargs)
            self.sheet.to_excel(
                self.file.path,
                sheet_name=self.file.default_base,
@@ -285,11 +289,8 @@ class DataSheet:
            # There we write the .csv:
            out_file = os.path.join(
                self.file.zippath, self.file.default_base+'.csv')
            # Some defaults:
-            default_args = {'index': False}
-            for key, value in default_args.items():
-                if key not in kwargs:
-                    kwargs[key] = value
+            kwargs = self.supply_defaults(
+                {'index': False}, **kwargs)
            self.sheet.to_csv(
                path_or_buf=out_file, **kwargs)
        elif self.file.extension == '.md':
@@ -301,14 +302,12 @@ class DataSheet:
        self.update_properties()
    def close(self):
        # Will close and save a zip file.
        if self.file.extension == '.zip':
            zip = zipfile.ZipFile(self.file.path, 'w')
            for dirname, subdirs, files in os.walk(self.file.zippath):
                for filename in files:
                    zip.write(os.path.join(dirname, filename), filename)
            zip.close()
            # Remove the tempdir:
            shutil.rmtree(self.file.zippath)
            self.file.zippath = None
            self.file.update()
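        # (Zip life cycle as inferred from save()/close(): save() writes the
        # member .csv files into the temporary dir file.zippath; close()
        # packs that dir flat into file.path and removes the temp dir again.)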
@@ -316,7 +315,6 @@ class DataSheet:
        self.make_report()
    def add_file(self, path, to=None):
        # Adds file to zip dir.
        if self.file.extension == '.zip':
            if to is None:
                shutil.copyfile(path,
@@ -329,9 +327,7 @@ class DataSheet:
class DataSet:
    def __init__(self, opening=None, description=None, epilog=None):
        # The arguments object to use.
        self.args = None
        # Delay parsing help, to peek ahead at verbosity...
        self.parser = ArgumentParser(
            description=description, epilog=epilog)
        # The verbosity argument will accept: -v, or -vv, -vvv etc.
@@ -342,13 +338,12 @@ class DataSet:
        self.parser.add_argument(
            '-x', '--example', action='store_const', const='Example',
            help='Uses input files from the %(const)s subdir.')
        # Look ahead to check whether verbosity is used.
        if '-v' in sys.argv or '--verbosity' in sys.argv:
            print(opening)
        # The list of sheets
        self.list = []
    # It is useful to be able to iterate over all the datasheets.
    # Basically, avoid using .list. in all DataSet references.
    def __iter__(self):
        self.n = 0
        return self
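    # (__next__ falls in the hunk elided below; given self.list, iteration
    # presumably yields each registered DataSheet, e.g.:
    #     for sheet in dataset:
    #         print(sheet.file.path)
    # -- a sketch, not the verbatim original.)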
@@ -363,16 +358,22 @@ class DataSet:
    # Container for all the files
    def add(self, name, default_name, default_dir, short_argument=None,
            checksum=None, help=SUPPRESS, direction='Output', autoload=True,
-            necessary=True, **kwargs):
+            necessary=True):
        if getattr(self, name, None) is None:
            # Create a new sheet with a different name
-            setattr(self, name, DataSheet(default_name=default_name,
-                    default_dir=default_dir, direction=direction,
-                    checksum=checksum, autoload=autoload,
-                    necessary=necessary, **kwargs))
-            # Add to list
+            setattr(self,
+                    name,
+                    DataSheet(
+                        file=DataFile(
+                            default_name=default_name,
+                            default_dir=default_dir,
+                            necessary=necessary),
+                        direction=direction,
+                        checksum=checksum,
+                        autoload=autoload))
            self.list.append(name)
        # Also set up the arguments if necessary
        long_argument = '--'+name+'_file'
        if type(help) == str and help is not SUPPRESS:
            help = help + ' (default: {default})'.format(
@@ -394,11 +395,11 @@ class DataSet:
            help=SUPPRESS)
    def __download(self, url, file):
        # Downloads url to file
        self.verbose(
            1, 'Downloading: {url} to {file}'.format(url=url, file=file))
        myfile = requests.get(url, allow_redirects=True)
-        open(file, 'wb').write(myfile.content)
+        with open(file, 'wb') as download:
+            download.write(myfile.content)
    def verbose(self, level, message):
        if self.args.verbosity >= level:
@@ -407,52 +408,41 @@ class DataSet:
    def init(self):
        # Initializes the command line parameters
        self.args = self.parser.parse_args()
        # Go through all files and set filenames
        for datasetname in self.list:
            dataset = getattr(self, datasetname)
            if getattr(self.args, datasetname+'_file') is None:
                # Argument was not used
                datasetfilename = dataset.file.default_name
            else:
                datasetfilename = getattr(self.args, datasetname+'_file')
            #
            if dataset.direction == 'Input':
                if self.args.example:
                    # If example was called,
                    # make the default dir the Example dir
                    dataset.file.suggest(
                        datasetfilename,
                        force_dir=self.args.example)
                else:
                    dataset.file.suggest(
                        datasetfilename)
                # File has proper name, load files
                if urlparse(dataset.file.suggested).netloc:
-                    if not dataset.file.exist:
-                        # Download file
+                    if (not dataset.file.exist) \
+                            or ((dataset.checksum is not None)
+                                and dataset.file.hash != dataset.checksum):
                        self.__download(
-                            dataset.file.suggested, dataset.file.path)
-                    else:
-                        if dataset.checksum is not None:
-                            if dataset.file.hash != dataset.checksum:
-                                self.__download(
-                                    dataset.file.suggested, dataset.file.path)
-                        dataset.file.update()
+                            url=dataset.file.suggested,
+                            file=dataset.file.path)
+                        dataset.file.update()
                if dataset.file.exist:
-                    if dataset.checksum is not None:
-                        if dataset.file.hash != dataset.checksum:
-                            print('File {file} has improper checksum'.format(
-                                file=dataset.file.path))
+                    if dataset.checksum is not None \
+                            and dataset.file.hash != dataset.checksum:
+                        print('File {file} has improper checksum'.format(
+                            file=dataset.file.path))
                else:
                    if dataset.file.necessary:
                        print('File {file} not found.'.format(
                            file=dataset.file.path))
-                # What kind of dataset are we dealing with?
-                if dataset.file.extension == '.md':
-                    dataset.type = 'markdown'
-                else:
-                    dataset.type = 'pandas'
-                #
+                dataset.type = 'pandas'
                if dataset.autoload:
                    if getattr(self.args, datasetname+'_file') is None \
                            and not dataset.file.necessary:
@@ -471,7 +461,6 @@ class DataSet:
                        # High verbosity, dump data.
                        self.verbose(3, dataset.sheet)
            else:
                # How to initialize other files
                dataset.file.suggest(datasetfilename)
                if dataset.file.extension == '.md':
                    dataset.type = 'markdown'
......
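# (Typical life cycle, mirroring the conversion script above -- a sketch
# with illustrative argument values such as the default file name:
#   dataset = mcra.DataSet(opening='...', description='...', epilog='...')
#   dataset.add(name='efsa', default_name='EFSA.xlsx', default_dir='Input',
#               direction='Input', necessary=True)
#   dataset.init()      # parse args, resolve paths, autoload input sheets
#   dataset.efsa.load(sheet_name=2, header=4)
# )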