From eb2fb65345d250b6ae3d9e5f5430bed64e239f44 Mon Sep 17 00:00:00 2001
From: Hans van den Heuvel <hans1.vandenheuvel@wur.nl>
Date: Wed, 26 Feb 2020 15:05:39 +0100
Subject: [PATCH] Created objects to deal with files; moved input files to
 Input folder.

---
 Convert-EUProcessingFactorsDB/.gitignore      |   6 +-
 .../Convert-EUProcessingFactorsDB.py          | 180 +++++++++++++-----
 .../{ => Input}/FoodTranslations.csv          |   0
 .../{ => Input}/ProcTypeTranslations.csv      |   0
 .../{ => Input}/ProcessingTypes.csv           |   0
 5 files changed, 142 insertions(+), 44 deletions(-)
 rename Convert-EUProcessingFactorsDB/{ => Input}/FoodTranslations.csv (100%)
 rename Convert-EUProcessingFactorsDB/{ => Input}/ProcTypeTranslations.csv (100%)
 rename Convert-EUProcessingFactorsDB/{ => Input}/ProcessingTypes.csv (100%)

diff --git a/Convert-EUProcessingFactorsDB/.gitignore b/Convert-EUProcessingFactorsDB/.gitignore
index f8444f7..efa6882 100644
--- a/Convert-EUProcessingFactorsDB/.gitignore
+++ b/Convert-EUProcessingFactorsDB/.gitignore
@@ -7,4 +7,8 @@ ProcessingFactors.xlsx
 Mismatches.csv
 Report.md
 debug_dump_file.xlsx
-EU_Processing_Factors_db_P.xlsx.xlsx
\ No newline at end of file
+EU_Processing_Factors_db_P.xlsx.xlsx
+
+# Dirs
+# Input/
+Output/
\ No newline at end of file

diff --git a/Convert-EUProcessingFactorsDB/Convert-EUProcessingFactorsDB.py b/Convert-EUProcessingFactorsDB/Convert-EUProcessingFactorsDB.py
index 4f826f3..c17d3e3 100644
--- a/Convert-EUProcessingFactorsDB/Convert-EUProcessingFactorsDB.py
+++ b/Convert-EUProcessingFactorsDB/Convert-EUProcessingFactorsDB.py
@@ -19,6 +19,56 @@ import numpy as np
 # For debugging purposes
 # from objbrowser import browse
 
+
+# A class to make working with the files more streamlined.
+# It hides the technical details so the files can be used in a simple manner.
+class MCRAFile:
+    def __init__(self, default_name, default_dir):
+        self.default_name = default_name
+        self.default_dir = default_dir
+        self.path = None
+        self.suggested = None
+        self.exist = False
+        self.modified = ''
+        self.extension = None
+
+    def suggest(self, name):
+        # This is the filename the user suggested on the command line.
+        # It may have to be turned into a proper path,
+        # e.g. if the user only gave a directory.
+        self.suggested = name
+        head, tail = os.path.split(self.suggested)
+        if os.path.isdir(self.suggested):
+            # It is an explicitly given directory.
+            # Use it, with the default filename.
+            self.path = os.path.join(
+                self.suggested, self.default_name)
+        elif tail == self.suggested:
+            # It is just a filename, no further paths involved.
+            # Then, and only then, use the default directory.
+            self.path = os.path.join(
+                self.default_dir, self.suggested)
+        else:
+            # No further messing around, just use it:
+            self.path = self.suggested
+        # Finally, check for existence.
+        if os.path.exists(self.path) and os.path.isfile(self.path):
+            self.exist = True
+            self.modified = time.ctime(os.path.getmtime(self.path))
+            base, ext = os.path.splitext(self.path)
+            self.extension = ext
+
+    # Returns the MD5 hash of the file; the whole file will be read.
+    def md5_hash(self):
+        md5_hash = hashlib.md5()
+        with open(self.path, "rb") as f:
+            # Read and update the hash in chunks of 4K; the file is
+            # closed automatically when the with-block exits.
+            for byte_block in iter(lambda: f.read(4096), b""):
+                md5_hash.update(byte_block)
+        return md5_hash.hexdigest()
+
+
 # We want some additional functions, let's extend the pandas object model,
 # to easily access these methods.
 @pd.api.extensions.register_dataframe_accessor('mcra')
@@ -47,6 +97,52 @@ class McraAccessor:
 # End of class
 
 
+class MCRAWorksheet:
+    # This is just a container for the file properties and the pandas sheet.
+    def __init__(self, default_name, default_dir):
+        self.file = MCRAFile(
+            default_name=default_name, default_dir=default_dir)
+        self.sheet = None
+
+    def load(self, **kwargs):
+        # Load the file into a dataframe.
+        if self.file.exist:
+            if self.file.extension == '.csv':
+                # Some defaults:
+                if 'sep' not in kwargs:
+                    kwargs['sep'] = ','
+                if 'header' not in kwargs:
+                    kwargs['header'] = 0
+                if 'comment' not in kwargs:
+                    kwargs['comment'] = '#'
+                if 'dtype' not in kwargs:
+                    kwargs['dtype'] = str
+                # Now, ready to go!
+                self.sheet = pd.read_csv(self.file.path, **kwargs)
+            elif self.file.extension == '.xlsx':
+                self.sheet = pd.read_excel(self.file.path, **kwargs)
+        else:
+            # The file does not exist (or is not a regular file).
+            print(' COULD NOT READ {file}'.format(file=self.file.path))
+
+
+# These are the files we work with
+processing_type = MCRAWorksheet(
+    default_name='ProcessingTypes.csv',
+    default_dir='Input')
+processing_translation = MCRAWorksheet(
+    default_name='ProcTypeTranslations.csv',
+    default_dir='Input')
+food_translation = MCRAWorksheet(
+    default_name='FoodTranslations.csv',
+    default_dir='Input')
+processing_factor = MCRAWorksheet(
+    default_name='ProcessingFactors.zip',
+    default_dir='Output')
+efsa_db = MCRAWorksheet(
+    default_name='EU_Processing_Factors_db_P.xlsx.xlsx',
+    default_dir='Input')
+
 # Some info text here
 parser = argparse.ArgumentParser(
     description='Converts the EFSA Zenodo Excel sheet into an MCRA ' +
@@ -60,15 +156,18 @@ parser = argparse.ArgumentParser(
 # URL: https://zenodo.org/record/1488653/files/EU_Processing_Factors_db_P.xlsx.xlsx?download=1
 # on page https://zenodo.org/record/1488653#.Xk_cy0oo-Um
 parser.add_argument(
-    '-t', '--processing_type_file', default='ProcessingTypes.csv',
+    '-t', '--processing_type_file',
+    default=processing_type.file.default_name,
     help='The (input) processing type file - ' +
     'format: csv (Comma Separated). (default: %(default)s)')
 parser.add_argument(
-    '-p', '--processing_translation_file', default='ProcTypeTranslations.csv',
+    '-p', '--processing_translation_file',
+    default=processing_translation.file.default_name,
     help='The (input) processing translation file - ' +
     'format: csv (Comma Separated). (default: %(default)s)')
 parser.add_argument(
-    '-f', '--food_translation_file', default='FoodTranslations.csv',
+    '-f', '--food_translation_file',
+    default=food_translation.file.default_name,
     help='The (input) food translation file - ' +
     'format: csv (Comma Separated). (default: %(default)s)')
 parser.add_argument(
@@ -101,7 +200,6 @@ parser.add_argument(
     '-c', '--efsa_median_pfs_checksum',
     default='f816bf3928431d54f9d15fb134cc9106',
     help=argparse.SUPPRESS)
-
 # Done configuring, let the parser do its thing
 args = parser.parse_args()
 
@@ -188,47 +286,46 @@ args.print_verbosity(3, efsa_median_pfs)
 ###############################
 # Second file
 #
-args.print_verbosity(
-    2, ' [READING] Processing translation input file: {file}'.format(
-        file=args.processing_translation_file))
+# Evaluate the user request for the filename
+processing_translation.file.suggest(
+    args.processing_translation_file)
+#
+processing_translation.load()
 #
-processing_translation = pd.read_csv(
-    args.processing_translation_file, sep=',', header=0, comment='#',
-    dtype={'FromFC': 'str', 'FCToProcType': 'str'})
 args.print_verbosity(1, 'Input file : {file}, {props}'.format(
-    file=args.processing_translation_file,
-    props=processing_translation.mcra.report))
+    file=processing_translation.file.path,
+    props=processing_translation.sheet.mcra.report))
 # High verbosity, dump data.
-args.print_verbosity(3, processing_translation)
+args.print_verbosity(3, processing_translation.sheet)
 #
 ###############################
 # Third file
 #
-args.print_verbosity(
-    2, ' [READING] Food translation input file: {file}'.format(
-        file=args.food_translation_file))
+# Evaluate the user request for the filename
+food_translation.file.suggest(
+    args.food_translation_file)
+#
+food_translation.load()
 #
-food_translation = pd.read_csv(
-    args.food_translation_file, sep=',', header=0, comment='#',
-    dtype={'FromFX': 'str', 'FXToRpc': 'str', 'FXToProcType': 'str'})
 args.print_verbosity(1, 'Input file : {file}, {props}'.format(
-    file=args.food_translation_file, props=food_translation.mcra.report))
+    file=food_translation.file.path,
+    props=food_translation.sheet.mcra.report))
 # High verbosity, dump data.
-args.print_verbosity(3, food_translation)
+args.print_verbosity(3, food_translation.sheet)
 
 ###############################
 # Fourth file
 #
-args.print_verbosity(
-    2, ' [READING] Processing type input file: {file}'.format(
-        file=args.processing_type_file))
+# Evaluate the user request for the filename
+processing_type.file.suggest(
+    args.processing_type_file)
+#
+processing_type.load()
 #
-processing_type = pd.read_csv(
-    args.processing_type_file, sep=',', header=0, comment='#',
-    dtype={'idProcessingType': 'str'})
 args.print_verbosity(1, 'Input file : {file}, {props}'.format(
-    file=args.processing_type_file, props=processing_type.mcra.report))
+    file=processing_type.file.path,
+    props=processing_type.sheet.mcra.report))
 # High verbosity, dump data.
-args.print_verbosity(3, processing_type)
+args.print_verbosity(3, processing_type.sheet)
 
@@ -245,11 +342,11 @@ args.print_verbosity(2, '[PHASE 2] Processing data.')
 # Then we have all data in one single dataframe (table).
 efsa_combined = efsa_median_pfs.merge(
     # Left join on all the rows from the EFSA sheet
-    # that have a Keyfacets Code in processing_translation
-    processing_translation, left_on='KeyFacets Code', right_on='FromFC',
+    # that have a KeyFacets Code in processing_translation.sheet
+    processing_translation.sheet, left_on='KeyFacets Code', right_on='FromFC',
     how='left').merge(
     # Left join on the rows that have a Matrix FoodEx2 Code in food_translation
-    food_translation, left_on='Matrix FoodEx2 Code', right_on='FromFX',
+    food_translation.sheet, left_on='Matrix FoodEx2 Code', right_on='FromFX',
     how='left').assign(
     # And a new column with the combination
     # of the Matrix Code and the Processing Type
@@ -292,7 +389,7 @@ efsa_combined.loc[
 # So, again, a left join :-)
 efsa_combined = efsa_combined.merge(
     # Left join with processing type sheet,
-    processing_type, left_on='idProcessingType', right_on='idProcessingType',
+    processing_type.sheet, left_on='idProcessingType', right_on='idProcessingType',
     how='left').assign()
 # Copy column
 efsa_combined.mcra.copycolumn(
@@ -444,18 +541,15 @@ Conversion run details
 * '''+efsa_median_pfs.mcra.report+r'''
   * Modified: '''+time.ctime(os.path.getmtime(args.efsa_median_pfs))+r'''
 * Other input files:
-  * '''+print_as_link(args.processing_translation_file)+r'''
-    * '''+processing_translation.mcra.report+r'''
-    * Modified: '''+time.ctime(
-      os.path.getmtime(args.processing_translation_file))+r'''
-  * '''+print_as_link(args.food_translation_file)+r'''
-    * '''+food_translation.mcra.report+r'''
-    * Modified: '''+time.ctime(
-      os.path.getmtime(args.food_translation_file))+r'''
-  * '''+print_as_link(args.processing_type_file)+r'''
-    * '''+processing_type.mcra.report+r'''
-    * Modified: '''+time.ctime(
-      os.path.getmtime(args.processing_type_file))+r'''
+  * '''+print_as_link(processing_translation.file.path)+r'''
+    * '''+processing_translation.sheet.mcra.report+r'''
+    * Modified: '''+processing_translation.file.modified+r'''
+  * '''+print_as_link(food_translation.file.path)+r'''
+    * '''+food_translation.sheet.mcra.report+r'''
+    * Modified: '''+food_translation.file.modified+r'''
+  * '''+print_as_link(processing_type.file.path)+r'''
+    * '''+processing_type.sheet.mcra.report+r'''
+    * Modified: '''+processing_type.file.modified+r'''
 * Output files:
   * '''+print_as_link(args.processing_factor_file)+r'''
     * '''+processing_factor.mcra.report+r'''

diff --git a/Convert-EUProcessingFactorsDB/FoodTranslations.csv b/Convert-EUProcessingFactorsDB/Input/FoodTranslations.csv
similarity index 100%
rename from Convert-EUProcessingFactorsDB/FoodTranslations.csv
rename to Convert-EUProcessingFactorsDB/Input/FoodTranslations.csv
diff --git a/Convert-EUProcessingFactorsDB/ProcTypeTranslations.csv b/Convert-EUProcessingFactorsDB/Input/ProcTypeTranslations.csv
similarity index 100%
rename from Convert-EUProcessingFactorsDB/ProcTypeTranslations.csv
rename to Convert-EUProcessingFactorsDB/Input/ProcTypeTranslations.csv
diff --git a/Convert-EUProcessingFactorsDB/ProcessingTypes.csv b/Convert-EUProcessingFactorsDB/Input/ProcessingTypes.csv
similarity index 100%
rename from Convert-EUProcessingFactorsDB/ProcessingTypes.csv
rename to Convert-EUProcessingFactorsDB/Input/ProcessingTypes.csv
--
GitLab
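
A minimal usage sketch of the MCRAFile/MCRAWorksheet objects this patch
introduces. This is a hypothetical standalone snippet, not part of the patch;
it assumes the two classes are in scope as defined above and that the Input/
directory with ProcTypeTranslations.csv exists, as in the repository layout.

    # Couple an MCRAFile (path handling, metadata) to a pandas sheet.
    ws = MCRAWorksheet(
        default_name='ProcTypeTranslations.csv', default_dir='Input')
    # Resolve what the user typed on the command line: a bare filename is
    # looked up in the default directory, a directory gets the default
    # filename appended, and any other path is used as-is.
    ws.file.suggest('ProcTypeTranslations.csv')
    if ws.file.exist:
        # CSV defaults: sep=',', header=0, comment='#', dtype=str
        ws.load()
        print(ws.file.path, ws.file.modified)
        # e.g. to compare against a known checksum such as
        # args.efsa_median_pfs_checksum
        print(ws.file.md5_hash())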