From eb2fb65345d250b6ae3d9e5f5430bed64e239f44 Mon Sep 17 00:00:00 2001
From: Hans van den Heuvel <hans1.vandenheuvel@wur.nl>
Date: Wed, 26 Feb 2020 15:05:39 +0100
Subject: [PATCH] Created objects to deal with files; moved input files to
 Input folder.

---
 Convert-EUProcessingFactorsDB/.gitignore      |   6 +-
 .../Convert-EUProcessingFactorsDB.py          | 180 +++++++++++++-----
 .../{ => Input}/FoodTranslations.csv          |   0
 .../{ => Input}/ProcTypeTranslations.csv      |   0
 .../{ => Input}/ProcessingTypes.csv           |   0
 5 files changed, 142 insertions(+), 44 deletions(-)
 rename Convert-EUProcessingFactorsDB/{ => Input}/FoodTranslations.csv (100%)
 rename Convert-EUProcessingFactorsDB/{ => Input}/ProcTypeTranslations.csv (100%)
 rename Convert-EUProcessingFactorsDB/{ => Input}/ProcessingTypes.csv (100%)

diff --git a/Convert-EUProcessingFactorsDB/.gitignore b/Convert-EUProcessingFactorsDB/.gitignore
index f8444f7..efa6882 100644
--- a/Convert-EUProcessingFactorsDB/.gitignore
+++ b/Convert-EUProcessingFactorsDB/.gitignore
@@ -7,4 +7,8 @@ ProcessingFactors.xlsx
 Mismatches.csv
 Report.md
 debug_dump_file.xlsx
-EU_Processing_Factors_db_P.xlsx.xlsx
\ No newline at end of file
+EU_Processing_Factors_db_P.xlsx.xlsx
+
+# Dirs
+# Input/
+Output/
\ No newline at end of file

diff --git a/Convert-EUProcessingFactorsDB/Convert-EUProcessingFactorsDB.py b/Convert-EUProcessingFactorsDB/Convert-EUProcessingFactorsDB.py
index 4f826f3..c17d3e3 100644
--- a/Convert-EUProcessingFactorsDB/Convert-EUProcessingFactorsDB.py
+++ b/Convert-EUProcessingFactorsDB/Convert-EUProcessingFactorsDB.py
@@ -19,6 +19,56 @@ import numpy as np
 # For debugging purposes
 # from objbrowser import browse
 
+
+# A class to make working with the files more streamlined.
+# It hides the technical details so the files can be used in a simple manner.
+class MCRAFile:
+    def __init__(self, default_name, default_dir):
+        self.default_name = default_name
+        self.default_dir = default_dir
+        self.path = None
+        self.suggested = None
+        self.exist = False
+        self.modified = ''
+        self.extension = None
+
+    def suggest(self, name):
+        # This is the filename the user suggested on the command line.
+        # It may have to be turned into a proper path,
+        # e.g. if the user only gave a directory.
+        self.suggested = name
+        head, tail = os.path.split(self.suggested)
+        if os.path.isdir(self.suggested):
+            # It is an explicitly given directory.
+            # Use it, with the default filename.
+            self.path = os.path.join(
+                self.suggested, self.default_name)
+        elif tail == self.suggested:
+            # It is just a filename, no further paths involved.
+            # Then, and only then, use the default directory.
+            self.path = os.path.join(
+                self.default_dir, self.suggested)
+        else:
+            # No further messing around, just use it:
+            self.path = self.suggested
+        # Finally, check for existence.
+        if os.path.exists(self.path) and os.path.isfile(self.path):
+            self.exist = True
+            self.modified = time.ctime(os.path.getmtime(self.path))
+            base, ext = os.path.splitext(self.path)
+            self.extension = ext
+
+    # Returns the MD5 hash of the file; the whole file will be read.
+    def md5_hash(self):
+        md5_hash = hashlib.md5()
+        with open(self.path, "rb") as f:
+            # Read and update the hash in chunks of 4K; the file is
+            # closed automatically when the with-block exits.
+            for byte_block in iter(lambda: f.read(4096), b""):
+                md5_hash.update(byte_block)
+        return md5_hash.hexdigest()
+
+
 # We want some additional functions, let's extend the pandas object model,
 # to easily access these methods.
 @pd.api.extensions.register_dataframe_accessor('mcra')
@@ -47,6 +97,52 @@ class McraAccessor:
 # End of class
 
 
+class MCRAWorksheet:
+    # This is just a container for the file properties and the pandas sheet.
+    def __init__(self, default_name, default_dir):
+        self.file = MCRAFile(
+            default_name=default_name, default_dir=default_dir)
+        self.sheet = None
+
+    def load(self, **kwargs):
+        # Load the file into a dataframe.
+        if self.file.exist:
+            if self.file.extension == '.csv':
+                # Some defaults:
+                if 'sep' not in kwargs:
+                    kwargs['sep'] = ','
+                if 'header' not in kwargs:
+                    kwargs['header'] = 0
+                if 'comment' not in kwargs:
+                    kwargs['comment'] = '#'
+                if 'dtype' not in kwargs:
+                    kwargs['dtype'] = str
+                # Now, ready to go!
+                self.sheet = pd.read_csv(self.file.path, **kwargs)
+            elif self.file.extension == '.xlsx':
+                self.sheet = pd.read_excel(self.file.path, **kwargs)
+        else:
+            # The file does not exist (or is not a regular file).
+            print(' COULD NOT READ {file}'.format(file=self.file.path))
+
+
+# These are the files we work with
+processing_type = MCRAWorksheet(
+    default_name='ProcessingTypes.csv',
+    default_dir='Input')
+processing_translation = MCRAWorksheet(
+    default_name='ProcTypeTranslations.csv',
+    default_dir='Input')
+food_translation = MCRAWorksheet(
+    default_name='FoodTranslations.csv',
+    default_dir='Input')
+processing_factor = MCRAWorksheet(
+    default_name='ProcessingFactors.zip',
+    default_dir='Output')
+efsa_db = MCRAWorksheet(
+    default_name='EU_Processing_Factors_db_P.xlsx.xlsx',
+    default_dir='Input')
+
 # Some info text here
 parser = argparse.ArgumentParser(
     description='Converts the EFSA Zenodo Excel sheet into an MCRA ' +
@@ -60,15 +156,18 @@ parser = argparse.ArgumentParser(
 # URL: https://zenodo.org/record/1488653/files/EU_Processing_Factors_db_P.xlsx.xlsx?download=1
 # on page https://zenodo.org/record/1488653#.Xk_cy0oo-Um
 parser.add_argument(
-    '-t', '--processing_type_file', default='ProcessingTypes.csv',
+    '-t', '--processing_type_file',
+    default=processing_type.file.default_name,
     help='The (input) processing type file - ' +
     'format: csv (Comma Separated). (default: %(default)s)')
 parser.add_argument(
-    '-p', '--processing_translation_file', default='ProcTypeTranslations.csv',
+    '-p', '--processing_translation_file',
+    default=processing_translation.file.default_name,
     help='The (input) processing translation file - ' +
     'format: csv (Comma Separated). (default: %(default)s)')
 parser.add_argument(
-    '-f', '--food_translation_file', default='FoodTranslations.csv',
+    '-f', '--food_translation_file',
+    default=food_translation.file.default_name,
     help='The (input) food translation file - ' +
     'format: csv (Comma Separated). (default: %(default)s)')
 parser.add_argument(
@@ -101,7 +200,6 @@ parser.add_argument(
     '-c', '--efsa_median_pfs_checksum',
     default='f816bf3928431d54f9d15fb134cc9106',
     help=argparse.SUPPRESS)
-
 # Done configuring, let the parser do its thing
 args = parser.parse_args()
 
@@ -188,47 +286,46 @@ args.print_verbosity(3, efsa_median_pfs)
 ###############################
 # Second file
 #
-args.print_verbosity(
-    2, ' [READING] Processing translation input file: {file}'.format(
-        file=args.processing_translation_file))
+# Evaluate the user request for the filename
+processing_translation.file.suggest(
+    args.processing_translation_file)
+#
+processing_translation.load()
 #
-processing_translation = pd.read_csv(
-    args.processing_translation_file, sep=',', header=0, comment='#',
-    dtype={'FromFC': 'str', 'FCToProcType': 'str'})
 args.print_verbosity(1, 'Input file : {file}, {props}'.format(
-    file=args.processing_translation_file,
-    props=processing_translation.mcra.report))
+    file=processing_translation.file.path,
+    props=processing_translation.sheet.mcra.report))
 # High verbosity, dump data.
-args.print_verbosity(3, processing_translation)
+args.print_verbosity(3, processing_translation.sheet)
 #
 ###############################
 # Third file
 #
-args.print_verbosity(
-    2, ' [READING] Food translation input file: {file}'.format(
-        file=args.food_translation_file))
+# Evaluate the user request for the filename
+food_translation.file.suggest(
+    args.food_translation_file)
+#
+food_translation.load()
 #
-food_translation = pd.read_csv(
-    args.food_translation_file, sep=',', header=0, comment='#',
-    dtype={'FromFX': 'str', 'FXToRpc': 'str', 'FXToProcType': 'str'})
 args.print_verbosity(1, 'Input file : {file}, {props}'.format(
-    file=args.food_translation_file, props=food_translation.mcra.report))
+    file=food_translation.file.path,
+    props=food_translation.sheet.mcra.report))
 # High verbosity, dump data.
-args.print_verbosity(3, food_translation)
+args.print_verbosity(3, food_translation.sheet)
 
 ###############################
 # Fourth file
 #
-args.print_verbosity(
-    2, ' [READING] Processing type input file: {file}'.format(
-        file=args.processing_type_file))
+# Evaluate the user request for the filename
+processing_type.file.suggest(
+    args.processing_type_file)
+#
+processing_type.load()
 #
-processing_type = pd.read_csv(
-    args.processing_type_file, sep=',', header=0, comment='#',
-    dtype={'idProcessingType': 'str'})
 args.print_verbosity(1, 'Input file : {file}, {props}'.format(
-    file=args.processing_type_file, props=processing_type.mcra.report))
+    file=processing_type.file.path,
+    props=processing_type.sheet.mcra.report))
 # High verbosity, dump data.
-args.print_verbosity(3, processing_type)
+args.print_verbosity(3, processing_type.sheet)
 
@@ -245,11 +342,11 @@ args.print_verbosity(2, '[PHASE 2] Processing data.')
 # Then we have all data in one single dataframe (table).
 efsa_combined = efsa_median_pfs.merge(
     # Left join on all the rows from the EFSA sheet
-    # that have a Keyfacets Code in processing_translation
-    processing_translation, left_on='KeyFacets Code', right_on='FromFC',
+    # that have a KeyFacets Code in processing_translation.sheet
+    processing_translation.sheet, left_on='KeyFacets Code', right_on='FromFC',
     how='left').merge(
     # Left join on the rows that have a Matrix FoodEx2 Code in food_translation
-    food_translation, left_on='Matrix FoodEx2 Code', right_on='FromFX',
+    food_translation.sheet, left_on='Matrix FoodEx2 Code', right_on='FromFX',
     how='left').assign(
     # And a new column with the combination
     # of the Matrix Code and the Processing Type
@@ -292,7 +389,7 @@ efsa_combined.loc[
 # So, again, a left join :-)
 efsa_combined = efsa_combined.merge(
     # Left join with processing type sheet,
-    processing_type, left_on='idProcessingType', right_on='idProcessingType',
+    processing_type.sheet, left_on='idProcessingType', right_on='idProcessingType',
     how='left').assign()
 # Copy column
 efsa_combined.mcra.copycolumn(
@@ -444,18 +541,15 @@ Conversion run details
 * '''+efsa_median_pfs.mcra.report+r'''
   * Modified: '''+time.ctime(os.path.getmtime(args.efsa_median_pfs))+r'''
 * Other input files:
-  * '''+print_as_link(args.processing_translation_file)+r'''
-    * '''+processing_translation.mcra.report+r'''
-    * Modified: '''+time.ctime(
-      os.path.getmtime(args.processing_translation_file))+r'''
-  * '''+print_as_link(args.food_translation_file)+r'''
-    * '''+food_translation.mcra.report+r'''
-    * Modified: '''+time.ctime(
-      os.path.getmtime(args.food_translation_file))+r'''
-  * '''+print_as_link(args.processing_type_file)+r'''
-    * '''+processing_type.mcra.report+r'''
-    * Modified: '''+time.ctime(
-      os.path.getmtime(args.processing_type_file))+r'''
+  * '''+print_as_link(processing_translation.file.path)+r'''
+    * '''+processing_translation.sheet.mcra.report+r'''
+    * Modified: '''+processing_translation.file.modified+r'''
+  * '''+print_as_link(food_translation.file.path)+r'''
+    * '''+food_translation.sheet.mcra.report+r'''
+    * Modified: '''+food_translation.file.modified+r'''
+  * '''+print_as_link(processing_type.file.path)+r'''
+    * '''+processing_type.sheet.mcra.report+r'''
+    * Modified: '''+processing_type.file.modified+r'''
 * Output files:
   * '''+print_as_link(args.processing_factor_file)+r'''
     * '''+processing_factor.mcra.report+r'''

diff --git a/Convert-EUProcessingFactorsDB/FoodTranslations.csv b/Convert-EUProcessingFactorsDB/Input/FoodTranslations.csv
similarity index 100%
rename from Convert-EUProcessingFactorsDB/FoodTranslations.csv
rename to Convert-EUProcessingFactorsDB/Input/FoodTranslations.csv
diff --git a/Convert-EUProcessingFactorsDB/ProcTypeTranslations.csv b/Convert-EUProcessingFactorsDB/Input/ProcTypeTranslations.csv
similarity index 100%
rename from Convert-EUProcessingFactorsDB/ProcTypeTranslations.csv
rename to Convert-EUProcessingFactorsDB/Input/ProcTypeTranslations.csv
diff --git a/Convert-EUProcessingFactorsDB/ProcessingTypes.csv b/Convert-EUProcessingFactorsDB/Input/ProcessingTypes.csv
similarity index 100%
rename from Convert-EUProcessingFactorsDB/ProcessingTypes.csv
rename to Convert-EUProcessingFactorsDB/Input/ProcessingTypes.csv
--
GitLab
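
A minimal usage sketch of the MCRAFile/MCRAWorksheet objects this patch
introduces. This is a hypothetical standalone snippet, not part of the patch;
it assumes the two classes are in scope as defined above and that the Input/
directory with ProcTypeTranslations.csv exists, as in the repository layout.

    # Couple an MCRAFile (path handling, metadata) to a pandas sheet.
    ws = MCRAWorksheet(
        default_name='ProcTypeTranslations.csv', default_dir='Input')
    # Resolve what the user typed on the command line: a bare filename is
    # looked up in the default directory, a directory gets the default
    # filename appended, and any other path is used as-is.
    ws.file.suggest('ProcTypeTranslations.csv')
    if ws.file.exist:
        # CSV defaults: sep=',', header=0, comment='#', dtype=str
        ws.load()
        print(ws.file.path, ws.file.modified)
        # e.g. to compare against a known checksum such as
        # args.efsa_median_pfs_checksum
        print(ws.file.md5_hash())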