Commit 3c3588fb authored by Hans van den Heuvel

Added metadata.json output to zip file.

parent f5051762
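
For context, here is a minimal sketch of the shape of the `metadata.json` that the new code writes, assuming a single CSV output file. The keys mirror the diff below; all concrete values are illustrative only:

```python
import json

# Illustrative only: the shape of the metadata.json this commit produces.
# Values are made up; the keys come from the diff below.
metadata = {
    '@context': 'http://schema.org/',
    '@type': 'Dataset',
    'name': 'Example conversion',
    'description': 'Example conversion',
    'author': [{'name': 'example-convert.py'}, {'affiliation': ''}],
    'dateCreated': '2020-01-01',
    'distribution': [{
        '@type': 'DataDownload',
        'name': 'ProcessingFactors.csv',
        'description': 'The (output) processing factor file',
        'fileFormat': 'text/csv',
        'contentSize': '12 kB',
        'sha256': '<hex digest of the file>',
    }],
}
print(json.dumps(metadata, indent=4, sort_keys=True))
```
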
#!/usr/bin/python
__version_info__ = ('1', '1', '0')
__version_info__ = ('1', '1', '1')
__version__ = '.'.join(__version_info__)
#############################################################################
@@ -100,6 +100,7 @@ dataset.add(
dataset.add(
name='processing_factor',
short_argument='-p',
description='During conversion we use processing factors from this table.',
help='The (output) processing factor file - '
+ 'format: csv (Comma Separated).',
default_name='ProcessingFactors.csv',
@@ -16,8 +16,9 @@ import sys
import textwrap
import getpass
import re
import json
__version_info__ = ('0', '9', '3')
__version_info__ = ('0', '9', '4')
__version__ = '.'.join(__version_info__)
# For debugging purposes
@@ -159,8 +160,10 @@ class DataFile:
self.size_string = ''
self.hash = ''
self.hash_short = ''
self.sha256 = ''
self.checksum = None
self.necessary = necessary
self.fileformat = ''
def update(self):
'''
@@ -173,6 +176,7 @@ class DataFile:
self.size_string = self.__converttoprefix(self.size)
self.hash = str(self.__md5_hash())
self.hash_short = self.hash[0:8]
self.sha256 = str(self.__sha256_hash())
def __converttoprefix(self, bytes):
'''
@@ -205,6 +209,17 @@ class DataFile:
md5_hash.update(byte_block)
return md5_hash.hexdigest()
def __sha256_hash(self):
'''
Returns a SHA256 hash of the file; file will be processed in 4K chunks
'''
sha256_hash = hashlib.sha256()
with open(self.path, "rb") as f:
# Read and update hash in chunks of 4K
for byte_block in iter(lambda: f.read(4096), b""):
sha256_hash.update(byte_block)
return sha256_hash.hexdigest()
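
The chunked read above keeps memory use constant for arbitrarily large files. A self-contained sketch of the same pattern (on Python 3.11+, `hashlib.file_digest(f, 'sha256')` does this in one call):

```python
import hashlib

def sha256_of(path, chunk_size=4096):
    """Hash a file in fixed-size chunks so it is never loaded into memory whole."""
    h = hashlib.sha256()
    with open(path, 'rb') as f:
        for block in iter(lambda: f.read(chunk_size), b''):
            h.update(block)
    return h.hexdigest()
```
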
def suggest(self, name, force_dir=None):
'''
This is the filename the user suggests on the command line.
@@ -266,6 +281,7 @@ class DataSheet:
self.autoload = autoload
self.inzip = inzip
self.closed = False
self.json = ''
def get_report(self):
report = ''
@@ -289,7 +305,9 @@
size_str=self.file.size_string,
size=self.file.size), PY_INDENT)
report += textwrap.indent(
'* Hash: {hash}\n\n'.format(hash=self.file.hash), PY_INDENT)
'* Hash (MD5): {hash}\n'.format(hash=self.file.hash), PY_INDENT)
report += textwrap.indent(
'* Hash (SHA256): {hash}\n\n'.format(hash=self.file.sha256), PY_INDENT)
return report
def update_properties(self):
@@ -318,20 +336,25 @@ class DataSheet:
kwargs = self.supply_defaults(
{'comment': '#', 'dtype': str}, **kwargs)
self.sheet = pd.read_csv(self.file.path, **kwargs)
self.file.fileformat = 'text/csv'
elif self.file.extension == '.tsv':
kwargs = self.supply_defaults(
{'comment': '#', 'dtype': str, 'sep': '\t'}, **kwargs)
self.sheet = pd.read_csv(self.file.path, **kwargs)
self.file.fileformat = 'text/tab-separated-values'
elif self.file.extension == '.xlsx':
self.sheet = pd.read_excel(self.file.path, **kwargs)
self.file.fileformat = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
elif self.file.extension == '.xls':
# Suppress warnings
wb = xlrd.open_workbook(self.file.path,
logfile=open(os.devnull, 'w'))
self.sheet = pd.read_excel(wb, engine='xlrd')
self.file.fileformat = 'application/vnd.ms-excel'
elif self.file.extension == '.md':
f = open(self.file.path, 'r')
self.sheet = f.read()
f.close()
self.file.fileformat = 'text/markdown'
else:
# Error here
@@ -402,16 +425,20 @@ class DataSet:
self.report = ''
self.description = description
self.runtime = datetime.now().strftime('%H:%M:%S, %d %b %Y')
self.dateCreated = datetime.now().strftime('%Y-%m-%d')
self.runcommand = ' '.join(sys.argv)
self.runarguments = ' '.join(sys.argv[1:])
self.usedarguments = None
self.scriptname = os.path.split(sys.argv[0])[1]
md5_hash = hashlib.md5()
sha256_hash = hashlib.sha256()
with open(sys.argv[0], "rb") as f:
# Read and update hash in chunks of 4K
for byte_block in iter(lambda: f.read(4096), b""):
md5_hash.update(byte_block)
sha256_hash.update(byte_block)
self.scripthash = md5_hash.hexdigest()
self.scripthash_sha256 = sha256_hash.hexdigest()
m = re.match(r'(.*)-(?P<noun>.*)\.py', self.scriptname)
if m:
self.scriptnoun = m.group('noun')
@@ -444,9 +471,16 @@ class DataSet:
'-z', '--zip', nargs='?', const=zip, default=zip,
help='Creates a zip file %(const)s containing all output.' +
' (default: %(const)s).')
self.parser.add_argument(
'-n', '--noscript', action='store_true', default=False,
help='Does NOT store the script in the zip file')
r = re.compile('^-v+$')
if list(filter(r.match, sys.argv)) or '--verbosity' in sys.argv:
print(opening)
if opening is not None:
self.opening = opening
else:
self.opening = ''
# It is useful to be able to iterate over all the datasheets.
# Basically, avoid using .list. in all DataSet references.
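
The `^-v+$` pattern compiled above matches stacked short verbosity flags (`-v`, `-vv`, ...), so the opening banner can be printed before argparse has parsed anything. A quick illustration of that check:

```python
import re

r = re.compile('^-v+$')
argv = ['convert.py', '-vv', '--zip', 'out.zip']
# True: '-vv' matches the pattern, so the opening text would be printed.
print(bool(list(filter(r.match, argv))))
```
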
@@ -634,6 +668,16 @@ class DataSet:
if data.direction == 'Output':
if not data.closed:
self.verbose(1,f'Output file: {data.file.path}; {data.properties}')
if data.file.extension == '.csv':
data.file.fileformat = 'text/csv'
elif data.file.extension == '.tsv':
data.file.fileformat = 'text/tab-separated-values'
elif data.file.extension == '.xlsx':
data.file.fileformat = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
elif data.file.extension == '.xls':
data.file.fileformat = 'application/vnd.ms-excel'
elif data.file.extension == '.md':
data.file.fileformat = 'text/markdown'
data.close(auto_report=file_report, also_save=True)
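
This extension-to-MIME chain repeats the mapping already done when sheets are loaded. A table-driven sketch of one way to share it (the names here are hypothetical, not part of the commit):

```python
# Hypothetical consolidation: one lookup table instead of parallel if/elif chains.
MIME_BY_EXTENSION = {
    '.csv': 'text/csv',
    '.tsv': 'text/tab-separated-values',
    '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    '.xls': 'application/vnd.ms-excel',
    '.md': 'text/markdown',
}

def mime_for(extension):
    # Fall back to a generic binary type for unmapped extensions.
    return MIME_BY_EXTENSION.get(extension, 'application/octet-stream')
```
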
@@ -663,7 +707,9 @@ class DataSet:
report_content += textwrap.indent(
f'* --{key} {value}\n', 2*PY_INDENT)
report_content += textwrap.indent(
f'* Hash: {self.scripthash}\n', PY_INDENT)
f'* Hash (MD5): {self.scripthash}\n', PY_INDENT)
report_content += textwrap.indent(
f'* Hash (SHA256): {self.scripthash_sha256}\n', PY_INDENT)
report_content += textwrap.indent(
f'* Executed at: {self.runtime}\n', PY_INDENT)
report_content += textwrap.indent(
@@ -680,24 +726,45 @@ class DataSet:
# Closing every sheet first
self.save(file_report=file_report, save=save)
# Collect reports per sheet.
for data in self:
report_content += data.report
if len(report_content) > 0:
# Report contains information.
if len(self.report) > 0:
self.report += '\n'
self.report += report_content
if self.args.report:
# Collect reports per sheet.
for data in self:
report_content += data.report
if len(report_content) > 0:
# Report contains information.
if len(self.report) > 0:
self.report += '\n'
self.report += report_content
if len(self.report) > 0:
# Save report
with open(self.args.report, 'w+') as f:
f.write(self.report)
self.reportname = self.args.report
md5_hash = hashlib.md5()
sha256_hash = hashlib.sha256()
with open(self.reportname, "rb") as f:
# Read and update hash in chunks of 4K
for byte_block in iter(lambda: f.read(4096), b""):
md5_hash.update(byte_block)
sha256_hash.update(byte_block)
self.reporthash = md5_hash.hexdigest()
self.reporthash_sha256 = sha256_hash.hexdigest()
self.reportfileformat = 'text/markdown'
self.verbose(
1,
'Output file: {file}, containing report on output.'.format(
file=self.args.report))
# Creating metadata.json file
metadata = dict()
metadata['@context'] = "http://schema.org/"
metadata['@type'] = "Dataset"
metadata['name'] = self.description
metadata['description'] = self.description
metadata['author'] = [{'name': self.scriptname}, {'affiliation': self.opening}]
metadata['dateCreated'] = self.dateCreated
metadata['distribution'] = []
if self.zip:
# All output files will be added to the zip file
zip = zipfile.ZipFile(self.zip, 'w')
@@ -706,14 +773,52 @@ class DataSet:
filename = os.path.split(data.file.path)[1]
if data.inzip is None or data.inzip == True:
zip.write(data.file.path, filename)
# Appending to metadata
metadata['distribution'] += [{ '@type' : 'DataDownload', 'name' : f'{filename}',
'description' : f'{data.description}',
'fileFormat' : f'{data.file.fileformat}', 'contentSize' : f'{data.file.size_string}',
'sha256' : f'{data.file.sha256}'}]
elif data.direction == 'Input':
filename = 'Input\\'+os.path.split(data.file.path)[1]
filename = 'Input/'+os.path.split(data.file.path)[1]
if data.inzip == True:
zip.write(data.file.path, filename)
# Appending to metadata
metadata['distribution'] += [{ '@type' : 'DataDownload', 'name' : f'{filename}',
'description' : f'{data.description}',
'fileFormat' : f'{data.file.fileformat}', 'contentSize' : f'{data.file.size_string}',
'sha256' : f'{data.file.sha256}'}]
if self.args.report:
filename = os.path.split(self.args.report)[1]
zip.write(self.args.report, filename)
# Appending to metadata
metadata['distribution'] += [{ '@type' : 'DataDownload', 'name' : f'{filename}',
'description' : f'Report of this conversion.',
'fileFormat' : f'{self.reportfileformat}',
'sha256' : f'{self.reporthash_sha256}'}]
if not self.args.noscript:
# Adding script to the zipfile
zip.write(sys.argv[0], self.scriptname)
# Appending to metadata
metadata['distribution'] += [{ '@type' : 'DataDownload', 'name' : f'{self.scriptname}',
'description' : f'Script for conversion of {self.scriptnoun}, using version {__version__} of module {__name__}.',
'fileFormat' : 'text/x-python',
'sha256' : f'{self.scripthash_sha256}'}]
for data in self:
if data.direction == 'Output':
# Use the output directory of the first output file
metadatafilename = 'metadata.json'
metadatapath = os.path.join(os.path.split(data.file.path)[0],metadatafilename)
break
self.verbose(1, f'Output file: {metadatapath}, containing metadata.')
with open(metadatapath, "w") as metadatafile:
json.dump(metadata, metadatafile, indent=4, sort_keys=True)
# Adding metadata to zip file
zip.write(metadatapath, metadatafilename)
# All set, close zip file
zip.close()
self.verbose(
1,
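
The MD5-plus-SHA256 chunked read now appears in several places (DataFile.update, the script hash in DataSet's constructor, and the report hash above). A hedged sketch of a helper that computes both digests in a single pass; the name and return shape are assumptions, not part of the commit:

```python
import hashlib

def file_digests(path, chunk_size=4096):
    """Read the file once and return its (md5, sha256) hex digests."""
    md5, sha256 = hashlib.md5(), hashlib.sha256()
    with open(path, 'rb') as f:
        for block in iter(lambda: f.read(chunk_size), b''):
            md5.update(block)
            sha256.update(block)
    return md5.hexdigest(), sha256.hexdigest()
```
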