Commit c5db08dd authored by Hans van den Heuvel's avatar Hans van den Heuvel

Changed output structure; all files are treated the same now.

parent 1849eade
@@ -6,6 +6,7 @@
import mcra
import pandas as pd
from datetime import datetime
import textwrap
# Small utility to turn a file path into a hyperlink :-)
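# The helper's body falls outside this hunk; a minimal sketch of what it
# might look like (hypothetical, not part of the commit):
def print_as_link(path):
    # Render a file path as an inline Markdown link.
    return '[{path}]({path})'.format(path=path)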
@@ -96,13 +97,9 @@ dataset.add(
#
dataset.add(
name='report',
default_name='Report.md',
default_name='Report.csv',
default_dir='Output')
#
dataset.add(
name='mismatches',
default_name='Mismatches.csv',
default_dir='Output')
#############################################################################
@@ -250,6 +247,7 @@ dataset.processing_factor.sheet = efsa_combined[
#
# Writing output file
dataset.processing_factor.save()
dataset.processing_factor.close()
# Don't close the file yet; if it is a zipfile we still want to enclose a report.
#############################################################################
@@ -299,14 +297,9 @@ mismatch_table_string = report_sheet[
header = ['Matrix FoodEx2 Code', 'Matrix Code Interpreted', 'Matrix Code',
'KeyFacets Code', 'KeyFacets Interpreted',
'Number of Matrix FoodEx2 Codes', 'Number of KeyFacets Codes']
dataset.mismatches.sheet = report_sheet[header]
dataset.mismatches.save()
dataset.mismatches.close()
dataset.verbose(1, 'Output file: {file}; {props}'.format(
file=dataset.mismatches.file.path, props=dataset.mismatches.properties))
dataset.report.sheet = report_sheet[header]
#
dataset.report.sheet = r'''
dataset.report.report = r'''
CONVERSION REPORT FOR EFSA system FoodEx 2 EXCEL SHEET
------------------------------------------------------
@@ -315,37 +308,20 @@ Conversion run details
* Date: '''+datetime.now().strftime('%H:%M:%S, %d %b %Y')+r'''
* Files:
* Excel input: '''+print_as_link(dataset.efsa.file.path)+r'''
* '''+efsa_version+r'''
* '''+dataset.efsa.properties+r'''
* Modified: '''+dataset.efsa.file.modified+r'''
* File size: '''+str(dataset.efsa.file.size) \
+ r''' B ('''+dataset.efsa.file.size_string+r''')
* Hash: '''+dataset.efsa.file.hash+r'''
* Other input files:
* '''+print_as_link(dataset.processing_translation.file.path)+r'''
* '''+dataset.processing_translation.properties+r'''
* Modified: '''+dataset.processing_translation.file.modified+r'''
* File size: '''+str(dataset.processing_translation.file.size) \
+ r''' B ('''+dataset.processing_translation.file.size_string+r''')
* Hash: '''+dataset.processing_translation.file.hash+r'''
* '''+print_as_link(dataset.food_translation.file.path)+r'''
* '''+dataset.food_translation.properties+r'''
* Modified: '''+dataset.food_translation.file.modified+r'''
* File size: '''+str(dataset.food_translation.file.size) \
+ r''' B ('''+dataset.food_translation.file.size_string+r''')
* Hash: '''+dataset.food_translation.file.hash+r'''
* '''+print_as_link(dataset.processing_type.file.path)+r'''
* '''+dataset.processing_type.properties+r'''
* Modified: '''+dataset.processing_type.file.modified+r'''
* File size: '''+str(dataset.processing_type.file.size) \
+ r''' B ('''+dataset.processing_type.file.size_string+r''')
* Hash: '''+dataset.processing_type.file.hash+r'''
* Output files:
* '''+print_as_link(dataset.processing_factor.file.path)+r'''
* '''+dataset.processing_factor.properties+r'''
* Modified: '''+dataset.processing_factor.file.modified+r'''
'''
for data in dataset:
if data.direction == 'Input':
dataset.report.report = textwrap.indent(data.report, mcra.PY_INDENT)
for datasetname in dataset.list:
# Bit of a hack; figure out later how this can be done properly
# (see the iterator-based sketch below).
if getattr(dataset, datasetname).direction == 'Output' \
and datasetname != 'report':
dataset.report.report = textwrap.indent(
getattr(dataset, datasetname).report, mcra.PY_INDENT)
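# A possible cleanup using the new DataSet iterator instead of the
# .list/getattr combination (a sketch, not part of this commit):
# for data in dataset:
#     if data.direction == 'Output' and data is not dataset.report:
#         dataset.report.report = textwrap.indent(data.report, mcra.PY_INDENT)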
dataset.report.report = r'''
EFSA Excel input details
========================
@@ -386,28 +362,8 @@ Below a list with the most (more than '''+str(
'''+mismatch_table_string+r'''
'''
dataset.report.save()
dataset.report.close()
dataset.verbose(
1, 'Output file: {file}; {props}'.format(
file=dataset.report.file.path, props=dataset.report.properties))
if dataset.processing_factor.file.extension == '.zip':
# Now, let's sneak in the report before we save
filename_in_zip = 'README.md'
dataset.processing_factor.add_file(
dataset.report.file.path, filename_in_zip)
dataset.processing_factor.close()
dataset.verbose(
1,
'Output file: {f}; {p} {r} enclosed in zipfile as {zf}.'.format(
f=dataset.processing_factor.file.path,
p=dataset.processing_factor.properties,
r=dataset.report.file.path,
zf=filename_in_zip))
else:
dataset.processing_factor.close()
dataset.verbose(
1, 'Output file: {file}; {props}'.format(
file=dataset.processing_factor.file.path,
props=dataset.processing_factor.properties))
dataset.report.close(auto_report=False)
dataset.close()
@@ -5,18 +5,19 @@ from urllib.parse import urlparse
import os # path, mkdir, walk
import time # ctime
import types
import zipfile
import tempfile
import uuid
import shutil
import zipfile
import requests
import hashlib
import numpy as np
import math
import sys
import textwrap
# For debugging purposes
# from objbrowser import browse
PY_INDENT = ' '
@pd.api.extensions.register_dataframe_accessor('mcra')
class McraAccessor:
@@ -87,6 +88,7 @@ class DataFile:
self.default_base = os.path.splitext(self.default_name)[0]
self.default_dir = default_dir
self.path = None
self.reportpath = None
self.zippath = None
self.suggested = None
self.exist = False
@@ -176,6 +178,7 @@ class DataFile:
head, tail = os.path.split(self.path)
self.path = os.path.join(force_dir, tail)
base, ext = os.path.splitext(self.path)
self.reportpath = base+'.md'
self.extension = ext
self.update()
@@ -191,52 +194,55 @@ class DataSheet:
autoload=True):
self.file = file
self.sheet = None
self.type = None
self.checksum = checksum
self.properties = ''
self.report = ''
self._report = ''
self.direction = direction
self.autoload = autoload
def add_reportline(self, line, newline=True):
self.report = self.report+line
if newline:
self.report = self.report+'\n'
@property
def report(self):
return self._report
@report.setter
def report(self, report):
'''
Setting the report property appends text rather than replacing it
'''
self._report = self._report+report
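# Note that assignment appends, so consecutive writes accumulate
# (ds is a hypothetical instance; clear_report() resets the buffer):
#     ds.report = 'First line\n'
#     ds.report = 'Second line\n'
#     ds.report   # -> 'First line\nSecond line\n'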
def make_report(self):
temp = self.report
self.report = ''
self.add_reportline(
'* {dir} file: {file}'.format(
dir=self.direction,
file=os.path.split(self.file.path)[1]))
self.add_reportline(
' * [{path}]({path})'.format(path=self.file.path))
self.add_reportline(
' * {props}'.format(props=self.properties))
self.add_reportline(
' * Modified: {mod}'.format(mod=self.file.modified))
self.add_reportline(
' * File size: {size_str} ({size} B)'.format(
size_str=self.file.size_string, size=self.file.size))
self.add_reportline(
' * Hash: {hash}'.format(hash=self.file.hash))
self.report = self.report+temp
temp = self._report
report = ''
report += '* {dir} file: {file}\n'.format(
dir=self.direction,
file=os.path.split(self.file.path)[1])
report += textwrap.indent(
'* [{path}]({path})\n'.format(path=self.file.path), PY_INDENT)
report += textwrap.indent(
'* {props}\n'.format(props=self.properties), PY_INDENT)
report += textwrap.indent(
'* Modified: {mod}\n'.format(mod=self.file.modified), PY_INDENT)
report += textwrap.indent(
'* File size: {size_str} ({size} B)\n'.format(
size_str=self.file.size_string, size=self.file.size),
PY_INDENT)
report += textwrap.indent(
'* Hash: {hash}\n\n'.format(hash=self.file.hash), PY_INDENT)
self._report = report+temp
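# The entry prepended above renders roughly as (values hypothetical):
#     * Output file: ProcessingFactors.csv
#         * [Output/ProcessingFactors.csv](Output/ProcessingFactors.csv)
#         * Format: [12 rows x 6 columns]; filesize: 1.2 kB; hash: 0a1b2c3d.
#         * Modified: ...
#         * File size: 1.2 kB (1234 B)
#         * Hash: ...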
def clear_report(self):
self._report = ''
def update_properties(self):
if self.type == 'pandas':
if self.sheet is not None:
shape = '[{rows} rows x {columns} columns]'.format(
rows=str(self.sheet.shape[0]),
columns=str(self.sheet.shape[1]))
self.properties = \
'Format: {sh}; filesize: {fs}; hash: {h}.'.format(
sh=shape, fs=self.file.size_string,
h=self.file.hash_short)
elif self.type == 'markdown':
self.properties = 'Filesize: {filesize}; hash: {hash}.'.format(
filesize=self.file.size_string,
hash=self.file.hash_short)
if self.sheet is not None:
shape = '[{rows} rows x {columns} columns]'.format(
rows=str(self.sheet.shape[0]),
columns=str(self.sheet.shape[1]))
self.properties = \
'Format: {sh}; filesize: {fs}; hash: {h}.'.format(
sh=shape, fs=self.file.size_string,
h=self.file.hash_short)
@staticmethod
def supply_defaults(default, **kwargs):
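# Body elided by this hunk; presumably it fills in any defaults the
# caller did not override, e.g. (hypothetical):
#     for key, value in default.items():
#         kwargs.setdefault(key, value)
#     return kwargs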
@@ -272,77 +278,43 @@ class DataSheet:
def save(self, **kwargs):
if self.file.extension == '.csv':
if self.type is None:
self.type = 'pandas'
kwargs = self.supply_defaults(
{'index': False}, **kwargs)
self.sheet.to_csv(
path_or_buf=self.file.path,
**kwargs)
elif self.file.extension == '.tsv':
if self.type is None:
self.type = 'pandas'
kwargs = self.supply_defaults(
{'index': False, 'sep': '\t'}, **kwargs)
self.sheet.to_csv(
path_or_buf=self.file.path,
**kwargs)
elif self.file.extension == '.xlsx':
if self.type is None:
self.type = 'pandas'
kwargs = self.supply_defaults(
{'index': False}, **kwargs)
self.sheet.to_excel(
self.file.path,
sheet_name=self.file.default_base,
**kwargs)
elif self.file.extension == '.zip':
# A zip file, by definition, will contain the csv.
# First we need to create a tempdir, securely.
if self.type is None:
self.type = 'pandas'
if self.file.zippath is None:
# mkdtemp() creates the directory securely and returns its path.
self.file.zippath = tempfile.mkdtemp()
# There we write the .csv:
out_file = os.path.join(
self.file.zippath, self.file.default_base+'.csv')
kwargs = self.supply_defaults(
{'index': False}, **kwargs)
self.sheet.to_csv(
path_or_buf=out_file, **kwargs)
elif self.file.extension == '.md':
if self.type is None:
self.type = 'markdown'
with open(self.file.path, 'w+') as f:
    f.write(self.sheet)
self.update_properties()
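# Usage sketch: the output format follows the file extension, so every
# sheet is saved through the same call (ds is a hypothetical instance):
#     ds.sheet = some_dataframe   # pandas object for .csv/.tsv/.xlsx/.zip
#     ds.save()                   # for .md, ds.sheet holds a string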
def close(self):
if self.file.extension == '.zip':
zip = zipfile.ZipFile(self.file.path, 'w')
for dirname, subdirs, files in os.walk(self.file.zippath):
for filename in files:
zip.write(os.path.join(dirname, filename), filename)
zip.close()
shutil.rmtree(self.file.zippath)
self.file.zippath = None
def close(self, auto_report=True):
'''
If auto_report is False, no report on the object will be made.
If the report contains no content, it will not be created as a file.
If, however, you added something to the report, it WILL be created.
'''
self.file.update()
self.update_properties()
self.make_report()
def add_file(self, path, to=None):
if self.file.extension == '.zip':
if to is None:
shutil.copyfile(path,
os.path.join(self.file.zippath,
os.path.split(path)[1]))
else:
shutil.copyfile(path,
os.path.join(self.file.zippath, to))
if auto_report:
self.make_report()
if len(self.report) > 0:
# Save report
with open(self.file.reportpath, 'w+') as f:
f.write(self.report)
print('Output file: {file}; {props}'.format(
file=self.file.path,
props=self.properties))
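# Usage sketch: a regular sheet gets its file report generated on close,
# while the report sheet itself opts out (as the script above does):
#     ds.close()                       # make_report(), then write the .md
#     report.close(auto_report=False)  # write only manually added text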
class DataSet:
@@ -361,11 +333,21 @@ class DataSet:
if '-v' in sys.argv or '--verbosity' in sys.argv:
print(opening)
self.list = []
# Whether or not to create a zip file
self.zip = None
# It is useful to be able to iterate over all the datasheets.
# Basically, this avoids going through .list in all DataSet references.
def __iter__(self):
return iter(self.list)
self.n = 0
return self
def __next__(self):
if self.n < len(self.list):
self.n = self.n + 1
return getattr(self, self.list[self.n-1])
else:
raise StopIteration
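# Usage sketch: iterating a DataSet now yields the DataSheet objects
# themselves rather than their names:
#     for data in dataset:
#         print(data.direction, data.file.path)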
# Container for all the files
def add(self, name, default_name, default_dir, short_argument=None,
@@ -454,7 +436,6 @@ class DataSet:
if dataset.file.necessary:
print('File {file} not found.'.format(
file=dataset.file.path))
dataset.type = 'pandas'
if dataset.autoload:
if getattr(self.args, datasetname+'_file') is None \
@@ -474,16 +455,36 @@ class DataSet:
# High verbosity, dump data.
self.verbose(3, dataset.sheet)
else:
dataset.file.suggest(datasetfilename)
if dataset.file.extension == '.md':
dataset.type = 'markdown'
# It is an Output file
base, ext = os.path.splitext(datasetfilename)
if ext == '.zip':
# In case of a zip file, we will make a csv
datasetfilename = base + '.csv'
# and also put everything into a zip file
dataset.file.suggest(datasetfilename)
dataset.update_properties()
basezip, extzip = os.path.splitext(dataset.file.path)
self.zip = basezip+'.zip'
else:
dataset.type = 'pandas'
dataset.update_properties()
dataset.file.suggest(datasetfilename)
dataset.update_properties()
def close(self):
'''
Method to close the dataset.
Most importantly, bundle all output files into the zip file when one was requested.
'''
pass
if self.zip:
# All output files will be added to the zip file
self.verbose(
1,
'Output file: {file}, containing all output.'.format(
file=self.zip))
zip = zipfile.ZipFile(self.zip, 'w')
for data in self:
if data.direction == 'Output':
filename = os.path.split(data.file.reportpath)[1]
zip.write(data.file.reportpath, filename)
filename = os.path.split(data.file.path)[1]
zip.write(data.file.path, filename)
zip.close()
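# End-to-end sketch (hypothetical values): when a .zip output was
# requested, every Output sheet plus its .md report ends up in one
# archive:
#     dataset.processing_factor.save()   # writes the .csv next to the zip
#     dataset.processing_factor.close()  # generates the .md report
#     dataset.close()                    # bundles everything into the .zip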