Commit 8d082b20 authored by Hans van den Heuvel
Browse files

Updated scripts with new features; inzip and fixed bugs

parent 11b2c2ba
#!/usr/bin/python
__version_info__ = ('1', '0', '3')
__version_info__ = ('1', '0', '4')
__version__ = '.'.join(__version_info__)
#############################################################################
......@@ -18,7 +18,7 @@ dataset = DataSet(
' Biometris, Wageningen University and Research.',
description='Creates an MCRA dataset from the ' +
'Processing Factors database on EFSA Zendono.',
epilog='For example: use %(prog)s -v -x for a verbose example.',
epilog='For example: use %(prog)s -h for help.',
version=__version__)
#
......@@ -31,6 +31,7 @@ efsa_url = 'https://zenodo.org/record/1488653/files/' \
dataset.add(
name='efsa',
short_argument='-e',
description='This is the EFSA file with all the processing factors.',
help='The EFSA Zendono Excel sheet (.xlsx); either file or URL. ',
checksum='f816bf3928431d54f9d15fb134cc9106',
default_name=efsa_url,
......@@ -43,13 +44,14 @@ dataset.add(
dataset.add(
name='substance_translation',
short_argument='-s',
description='During conversion we have translated substances using this file.',
help='The (input) substance translation file - '
+ 'format: csv (Comma Seperated), file not required.',
default_name='',
necessary=False,
default_name=None,
necessary=False, # Make default_name None and necessary False
default_dir='Input',
inzip=True, # Copy this file into the zip
direction='Input')
#
# The output files
......@@ -69,6 +71,7 @@ dataset.add(
help='The (output) references file - '
+ 'format: csv (Comma Seperated).',
default_name='References.csv',
description='This is just a copy of the references from the input file.',
default_dir='Output')
#
......@@ -115,22 +118,34 @@ efsa.mcra.copycolumn({
})
# See whether we can do something with Substance translation
if dataset.substance_translation.sheet is None:
if not dataset.exists('substance_translation'):
# No substance translation? Just copy column
efsa.mcra.copycolumn(
{'ParamCode Active Substance': 'idSubstance'})
else:
# Get the from-code
dataset.substance_translation.sheet['FromCode']
# Do a left join
efsa = efsa.merge(
# Left join with processing type sheet,
dataset.substance_translation.sheet,
left_on='idSubstance', right_on='FromCode',
how='left').assign()
# Copy CASNumber to idSubstance column
efsa.mcra.copycolumn({'ToCode': 'idSubstance'})
if 'CASNumber' in dataset.substance_translation.sheet.columns:
# This is the "old" situation
# Strip dash (-) from the CASNumber column
dataset.substance_translation.sheet['CASNumber'].replace(
'-', '', regex=True, inplace=True)
# Do a left join
efsa = efsa.merge(
# Left join with the substance translation sheet,
dataset.substance_translation.sheet,
left_on='ParamCode Active Substance', right_on='code',
how='left').assign()
# Copy CASNumber to idSubstance column
efsa.mcra.copycolumn(
{'CASNumber': 'idSubstance'})
else:
# This is as Johannes is using it.
# Do a left join on FromCode
efsa = efsa.merge(
dataset.substance_translation.sheet,
left_on='idSubstance', right_on='FromCode',
how='left').assign()
# Copy ToCode to idSubstance column
efsa.mcra.copycolumn({'ToCode': 'idSubstance'})
#
# Then let's add columns which will be empty
# so to be able to create a proper output file
......@@ -170,5 +185,8 @@ dataset.processing_factor.sheet = efsa[
#############################################################################
# Phase 3. Report about the data.
# This is the way to go if you want to know if a sheet exists...
# if dataset.exists('substance_translation'):
# print('Yahoo')
# Here's an auto generated report
dataset.close(file_report=True)
dataset.close()
......@@ -17,7 +17,7 @@ import textwrap
import getpass
import re
__version_info__ = ('0', '9', '2')
__version_info__ = ('0', '9', '3')
__version__ = '.'.join(__version_info__)
# For debugging purposes
......@@ -142,7 +142,10 @@ class DataFile:
'''
def __init__(self, default_name, default_dir, necessary=True):
self.default_name = default_name
self.default_base = os.path.splitext(self.default_name)[0]
if default_name is not None:
self.default_base = os.path.splitext(self.default_name)[0]
else:
self.default_base = None
self.default_dir = default_dir
self.path = None
self.directory = None
......@@ -251,14 +254,17 @@ class DataSheet:
:param checksum:
'''
def __init__(self, file, checksum=None, direction='Output',
autoload=True):
description='', title='', autoload=True, inzip=None):
self.file = file
self.sheet = None
self.checksum = checksum
self.properties = ''
self.report = ''
self.description = description
self.title = title
self.direction = direction
self.autoload = autoload
self.inzip = inzip
self.closed = False
def get_report(self):
......@@ -266,6 +272,12 @@ class DataSheet:
if self.direction == 'Input' or \
(self.direction == 'Output' and self.closed):
filename = os.path.split(self.file.path)[1]
if self.title is not None:
report_title = f'{self.title}'
report += f'{report_title}\n'
report += len(report_title)*'-'+'\n\n'
if self.description:
report += self.description+'\n\n'
report += f'* {self.direction} file: [{filename}]({filename})\n'
report += textwrap.indent(
'* {props}\n'.format(props=self.properties), PY_INDENT)
......@@ -277,7 +289,7 @@ class DataSheet:
size_str=self.file.size_string,
size=self.file.size), PY_INDENT)
report += textwrap.indent(
'* Hash: {hash}\n'.format(hash=self.file.hash), PY_INDENT)
'* Hash: {hash}\n\n'.format(hash=self.file.hash), PY_INDENT)
return report
def update_properties(self):
......@@ -372,8 +384,8 @@ class DataSheet:
# # Save report
# with open(self.file.reportpath, 'w+') as f:
# f.write(self.report)
if '-v' in sys.argv or '--verbosity' in sys.argv:
print(f'Output file: {self.file.path}; {self.properties}')
# if '-v' in sys.argv or '--verbosity' in sys.argv:
# print(f'Output file: {self.file.path}; {self.properties}')
class DataSet:
......@@ -385,6 +397,7 @@ class DataSet:
self.zip = None
# The report for the entire dataset
self.report = ''
self.description = description
self.runtime = datetime.now().strftime('%H:%M:%S, %d %b %Y')
self.runcommand = ' '.join(sys.argv)
self.runarguments = ' '.join(sys.argv[1:])
......@@ -428,7 +441,8 @@ class DataSet:
'-z', '--zip', nargs='?', const=zip, default=zip,
help='Creates a zip file %(const)s containing all output.' +
' (default: %(const)s).')
if '-v' in sys.argv or '--verbosity' in sys.argv:
r=re.compile('^-v+$')
if list(filter(r.match, sys.argv)) or '--verbosity' in sys.argv:
print(opening)
# It is useful to be able to iterate over all the datasheets.
......@@ -447,10 +461,15 @@ class DataSet:
# Container for all the files
def add(self, name, default_name, default_dir, short_argument=None,
checksum=None, help=SUPPRESS, direction='Output', autoload=True,
necessary=True):
necessary=True, inzip=None, description='', title=''):
if getattr(self, name, None) is None:
# Create a new sheet with a different name
# Create a new sheet, stored as the attribute `name`
# directly under this class. Then no .list is needed
if default_name == '':
default_name = None
if title is not None and title == '':
title = direction.title() + ' ' + \
name.replace('_',' ').title()
setattr(self,
name,
DataSheet(
......@@ -460,12 +479,17 @@ class DataSet:
necessary=necessary),
direction=direction,
checksum=checksum,
description=description,
title=title,
inzip=inzip,
autoload=autoload))
# But we must do some bookkeeping
self.list.append(name)
# ToDo: Argument in capital letters, then include it in the ZIP file
long_argument = '--' + name + '_file'
if type(help) == str and help is not SUPPRESS:
if type(help) == str and help is not SUPPRESS and \
default_name is not None:
help = help + ' (default: {default})'.format(
default=default_name)
if short_argument is None:
......@@ -495,11 +519,42 @@ class DataSet:
if self.args.verbosity >= level:
print(message)
def exists(self, name):
    '''
    Report whether the data sheet called ``name`` is present and loaded.

    :param name: attribute name under which the sheet was registered.
    :return: True only when this object has a truthy attribute ``name``
        whose ``sheet`` attribute is not None; False in every other case.
    '''
    # getattr with a default replaces the former bare ``except:``, which
    # also swallowed unrelated errors such as KeyboardInterrupt.
    datasheet = getattr(self, name, None)
    if not datasheet:
        return False
    # Attributes that are not data sheets (no ``sheet`` attribute) simply
    # do not count as existing, instead of raising AttributeError.
    return getattr(datasheet, 'sheet', None) is not None
def init(self):
# Initializes the command line parameters
self.args = self.parser.parse_args()
self.usedarguments = self.args.__dict__
# Remove all input datasets not explicitly called.
removelist = []
for datasetname in self.list:
dataset = getattr(self, datasetname)
if not dataset.file.necessary and \
getattr(self.args, datasetname+'_file') is None and \
dataset.file.default_name is None:
removelist += [datasetname]
for datasetname in removelist:
dataset = getattr(self, datasetname)
self.verbose(3, f'Not loading {datasetname}_file.')
self.list.remove(datasetname)
delattr(self, datasetname)
del self.usedarguments[datasetname+'_file']
for datasetname in self.list:
dataset = getattr(self, datasetname)
if getattr(self.args, datasetname+'_file') is None:
......@@ -568,24 +623,27 @@ class DataSet:
# Setting self.zip indicates creating a zip file
self.zip = basezip+'.zip'
def save(self, file_report=False, save=True):
    '''
    Close every output sheet that has not been closed yet.

    :param file_report: forwarded to each sheet's ``close`` as
        ``auto_report``.
    :param save: forwarded to each sheet's ``close`` as ``also_save``.
        It used to be ignored (``also_save=True`` was hard-coded), so a
        caller passing ``save=False`` still had the files written.
    '''
    for data in self:
        # Only output sheets that are still open need closing.
        if data.direction != 'Output' or data.closed:
            continue
        data.close(auto_report=file_report, also_save=save)
        self.verbose(
            1, f'Output file: {data.file.path}; {data.properties}')
def close(self, file_report=False, save=True):
def close(self, file_report=True, save=True):
'''
Method to close the dataset.
Most importantly save files.
'''
report_content = ''
if file_report:
report_content += f'* Script: {self.scriptname}\n'
report_title = 'Conversion properties'
report_content += f'{report_title}\n'
report_content += '-'*len(report_title)+'\n\n'
report_content += textwrap.indent(
f'* Command line: {self.runcommand}\n', PY_INDENT)
f'* Scriptname: {self.scriptname}\n', PY_INDENT)
report_content += textwrap.indent(
f'* Filename: {self.scriptname}\n', PY_INDENT)
f'* Command line: {self.runcommand}\n', PY_INDENT)
report_content += textwrap.indent(
f'* Command line Arguments: {self.runarguments}\n', PY_INDENT)
report_content += textwrap.indent(
......@@ -608,15 +666,12 @@ class DataSet:
report_content += textwrap.indent(
f'* Depends upon module: {__name__}\n', PY_INDENT)
report_content += textwrap.indent(
f'* With version: {__version__}\n', 2*PY_INDENT)
f'* With version: {__version__}\n\n', 2*PY_INDENT)
for data in self:
if data.direction == 'Input':
report_content += data.get_report()
# Closing every sheet first
for data in self:
if data.direction == 'Output':
if not data.closed:
data.close(auto_report=file_report, also_save=save)
self.save(file_report=file_report, save=save)
if self.args.report:
# Collect reports per sheet.
......@@ -642,7 +697,13 @@ class DataSet:
for data in self:
if data.direction == 'Output':
filename = os.path.split(data.file.path)[1]
zip.write(data.file.path, filename)
if data.inzip is None or data.inzip == True:
zip.write(data.file.path, filename)
elif data.direction == 'Input':
filename = 'Input\\'+os.path.split(data.file.path)[1]
if data.inzip == True:
zip.write(data.file.path, filename)
if self.args.report:
filename = os.path.split(self.args.report)[1]
zip.write(self.args.report, filename)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment