diff --git a/Convert-EUProcessingFactorsDB/Convert-EUProcessingFactorsDB.py b/Convert-EUProcessingFactorsDB/Convert-EUProcessingFactorsDB.py
index 14713dc923d791997725985979deef9da0a8062c..cac53f5e7b37d03b6ebff8e96d10a943ebf260e4 100644
--- a/Convert-EUProcessingFactorsDB/Convert-EUProcessingFactorsDB.py
+++ b/Convert-EUProcessingFactorsDB/Convert-EUProcessingFactorsDB.py
@@ -17,6 +17,7 @@ import shutil
 import requests
 import hashlib
 import numpy as np
+import math
 
 # For debugging purposes
 # from objbrowser import browse
@@ -32,6 +33,37 @@ class MCRAFile:
         self.exist = False
         self.modified = ''
         self.extension = None
+        self.size = 0
+        self.size_string = ''
+        self.hash = ''
+        self.hash_short = ''
+
+    # Private function to have some nice formatting of filesizes
+    def __converttoprefix(self, bytes):
+        if bytes <= 1024:
+            return '{0:.0f} B'.format(bytes)
+        else:
+            power = math.floor(math.log(bytes, 1024))
+            factor = math.pow(1024, power)
+            prefix = ['B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y']
+            if round(bytes/factor, 1) < 10:
+                return '{0:.1f} {prefix}'.format(
+                    bytes/factor, prefix=prefix[power])
+            elif round(bytes/factor, 1) >= 1000:
+                return '{0:.2f} {prefix}'.format(
+                    bytes/(1024*factor), prefix=prefix[power+1])
+            else:
+                return '{0:.0f} {prefix}'.format(
+                    bytes/factor, prefix=prefix[power])
+
+    # Returns an MD5 hash of the file; file will be processed
+    def __md5_hash(self):
+        md5_hash = hashlib.md5()
+        with open(self.path, "rb") as f:
+            # Read and update hash in chunks of 4K
+            for byte_block in iter(lambda: f.read(4096), b""):
+                md5_hash.update(byte_block)
+        return md5_hash.hexdigest()
 
     def suggest(self, name):
         # This is the filename the user suggests on the command line.
@@ -69,18 +101,12 @@
         if os.path.exists(self.path) and os.path.isfile(self.path):
             self.exist = True
             self.modified = time.ctime(os.path.getmtime(self.path))
+            self.size = os.path.getsize(self.path)
+            self.size_string = self.__converttoprefix(self.size)
             base, ext = os.path.splitext(self.path)
             self.extension = ext
-
-    # Returns an MD5 hash of the file; file will be processed
-    def md5_hash(self):
-        md5_hash = hashlib.md5()
-        with open(self.path, "rb") as f:
-            # Read and update hash in chunks of 4K
-            for byte_block in iter(lambda: f.read(4096), b""):
-                md5_hash.update(byte_block)
-        return md5_hash.hexdigest()
-
+            self.hash = str(self.__md5_hash())
+            self.hash_short = self.hash[0:8]
 
 # We want some additional functions, let's extent the panda object model,
 # to easily access these methods.
@@ -117,6 +143,16 @@
             default_name=default_name, default_dir=default_dir)
         self.sheet = None
 
+    def properties(self):
+        # Return some statistics about the dataframe and file as a string
+        # A few variables to make things more readable
+        shape = '[{rows} rows x {columns} columns]'.format(
+            rows=str(self.sheet.shape[0]), columns=str(self.sheet.shape[1]))
+        return \
+            'Format: {shape}; filesize: {filesize}; hash: {hash}.'.format(
+                shape=shape, filesize=self.file.size_string,
+                hash=self.file.hash_short)
+
     def load(self, **kwargs):
         # Loading file into dataframe
         if self.file.exist:
@@ -271,7 +307,7 @@ if args.efsa_file is None:
     download(efsa_url, efsa_db.file.path)
     efsa_db.file.suggest(args.efsa_file)
 else:
-    if not efsa_db.file.md5_hash() == args.efsa_checksum:
+    if not efsa_db.file.hash == args.efsa_checksum:
         # Hash does not match: download
         download(efsa_url, efsa_db.file.path)
         efsa_db.file.suggest(args.efsa_file)
@@ -300,7 +336,7 @@ efsa_median_version = pd.read_excel(
 #
 args.print_verbosity(1, 'Input file : {file}, {version}, {props}'.format(
     file=efsa_db.file.path, version=efsa_median_version,
-    props=efsa_db.sheet.mcra.report))
+    props=efsa_db.properties()))
 # High verbosity, dump data.
 args.print_verbosity(3, efsa_db.sheet)
 #
@@ -323,7 +359,7 @@ for key, val in data.items():
     #
     args.print_verbosity(1, 'Input file : {file}, {props}'.format(
         file=data[key].file.path,
-        props=data[key].sheet.mcra.report))
+        props=data[key].properties()))
     # High verbosity, dump data.
     args.print_verbosity(3, data[key].sheet)
     #
@@ -538,16 +574,28 @@ Conversion run details
   * '''+efsa_median_version+r'''
   * '''+efsa_db.sheet.mcra.report+r'''
   * Modified: '''+efsa_db.file.modified+r'''
+  * File size: '''+str(efsa_db.file.size) \
+    + r''' B ('''+efsa_db.file.size_string+r''')
+  * Hash: '''+efsa_db.file.hash+r'''
 * Other input files:
   * '''+print_as_link(data['processing_translation'].file.path)+r'''
     * '''+data['processing_translation'].sheet.mcra.report+r'''
     * Modified: '''+data['processing_translation'].file.modified+r'''
+    * File size: '''+str(data['processing_translation'].file.size) \
+      + r''' B ('''+data['processing_translation'].file.size_string+r''')
+    * Hash: '''+data['processing_translation'].file.hash+r'''
   * '''+print_as_link(data['food_translation'].file.path)+r'''
     * '''+data['food_translation'].sheet.mcra.report+r'''
     * Modified: '''+data['food_translation'].file.modified+r'''
+    * File size: '''+str(data['food_translation'].file.size) \
+      + r''' B ('''+data['food_translation'].file.size_string+r''')
+    * Hash: '''+data['food_translation'].file.hash+r'''
   * '''+print_as_link(data['processing_type'].file.path)+r'''
     * '''+data['processing_type'].sheet.mcra.report+r'''
     * Modified: '''+data['processing_type'].file.modified+r'''
+    * File size: '''+str(data['processing_type'].file.size) \
+      + r''' B ('''+data['processing_type'].file.size_string+r''')
+    * Hash: '''+data['processing_type'].file.hash+r'''
 * Output files:
   * '''+print_as_link(args.processing_factor_file)+r'''
     * '''+processing_factor.mcra.report+r'''
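
For reference, the file-metadata pattern this patch adds to MCRAFile combines two standard techniques: hashing the file in fixed-size chunks (so a large download never has to fit in memory) and rendering the byte count with a binary prefix. A minimal standalone sketch of both helpers; the names md5_hexdigest and human_readable_size are illustrative, not part of the script:

    import hashlib
    import math

    def md5_hexdigest(path, chunk_size=4096):
        # Stream the file in 4K blocks; memory use stays constant
        # no matter how large the file is.
        md5 = hashlib.md5()
        with open(path, 'rb') as f:
            for block in iter(lambda: f.read(chunk_size), b''):
                md5.update(block)
        return md5.hexdigest()

    def human_readable_size(num_bytes):
        # Binary prefixes: 1 K = 1024 B, 1 M = 1024 K, and so on.
        prefixes = ['B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y']
        if num_bytes < 1024:
            return '{0:.0f} B'.format(num_bytes)
        power = math.floor(math.log(num_bytes, 1024))
        value = num_bytes / math.pow(1024, power)
        # One decimal below 10 ('9.8 M'), none above ('512 M'),
        # mirroring the rounding rules in __converttoprefix.
        if round(value, 1) < 10:
            return '{0:.1f} {1}'.format(value, prefixes[power])
        return '{0:.0f} {1}'.format(value, prefixes[power])

The hash_short attribute the patch stores is simply the first eight hex digits of the digest (self.hash[0:8]), which is what properties() prints next to the dataframe shape.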
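
The change in the download logic works because the digest is now computed once, when suggest() registers the file, and kept in self.hash. The decision it encodes — trust the cached file only when its MD5 matches the published checksum, otherwise fetch it again — can be sketched as follows, assuming a plain requests-based fetch (the script's own download() helper is defined elsewhere; ensure_download is a hypothetical name):

    import os
    import requests

    def ensure_download(url, path, expected_md5):
        # Reuse the cached file only when it exists and its digest
        # matches; otherwise (re)download and overwrite it.
        if os.path.exists(path) and md5_hexdigest(path) == expected_md5:
            return
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        with open(path, 'wb') as f:
            f.write(response.content)

Here md5_hexdigest is the chunked-hash helper sketched above. Computing the hash once at registration time, as the patch does, avoids re-reading the file every time the checksum is consulted.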