Commit 0c46b68b authored by Hans van den Heuvel

Added option to report duplicates in substances assignment.

parent c5db08dd
@@ -97,7 +97,7 @@ dataset.add(
#
dataset.add(
name='report',
- default_name='Report.csv',
+ default_name='Report.xlsx',
default_dir='Output')
#
@@ -248,7 +248,10 @@ dataset.processing_factor.sheet = efsa_combined[
# Writing output file
dataset.processing_factor.save()
- dataset.processing_factor.close()
+ # Don't close the file yet; if it is a zipfile we still want to add a report to it.
+ # In case of debugging, just dump the sheet we've been working on.
+ if dataset.args.verbosity > 3:
+     efsa_combined.mcra.dump(r'.\Output\dump.xlsx')
#############################################################################
# Phase 4. Report about the data.
@@ -299,6 +302,31 @@ header = ['Matrix FoodEx2 Code', 'Matrix Code Interpreted', 'Matrix Code',
'Number of Matrix FoodEx2 Codes', 'Number of KeyFacets Codes']
dataset.report.sheet = report_sheet[header]
#
# We also need some further text reporting:
# Combine 'idSubstance' and 'idFoodUnProcessed' into a new key column.
mismatch_table = efsa_combined[
    (efsa_combined['FCToProcType'].notna() |
     efsa_combined['FXToProcType'].notna()) &
    efsa_combined['idSubstance'].notna()]
mismatch_table = mismatch_table.mcra.join(
    name='idSubstanceFoodProc',
    join_left='idSubstance',
    join_right='idFoodUnProcessed')
# Collapse to one row per (processing type, substance-food) combination.
double_types = mismatch_table.groupby(
    ['idProcessingType', 'idSubstanceFoodProc'],
    as_index=False).agg(
        {'idSubstance': 'first',
         'idFoodUnProcessed': 'first',
         'FoodUnprocessedName': 'first',
         'KeyFacets Interpreted': 'first',
         'Matrix Code Interpreted': 'first',
         'MCRA_ProcessingType_Description': 'first'}).drop(
        'idSubstanceFoodProc', axis=1)
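# Illustration (editorial, not part of this commit): the same
# groupby(...).agg('first') pattern on a tiny made-up frame. 'first' keeps the
# first value of every remaining column within each group, so the result has one
# row per (idProcessingType, idSubstanceFoodProc) combination.
#
#   import pandas as pd
#   toy = pd.DataFrame({
#       'idProcessingType':    ['P1', 'P1', 'P2'],
#       'idSubstanceFoodProc': ['S1-F1', 'S1-F1', 'S1-F1'],
#       'idSubstance':         ['S1', 'S1', 'S1'],
#       'idFoodUnProcessed':   ['F1', 'F1', 'F1']})
#   collapsed = toy.groupby(
#       ['idProcessingType', 'idSubstanceFoodProc'], as_index=False).agg('first')
#   # collapsed -> two rows: ('P1', 'S1-F1') and ('P2', 'S1-F1')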
# report_sheet = mismatch_table.groupby(
# ['KeyFacets Code', 'Matrix FoodEx2 Code']).size()
# #
dataset.report.report = r'''
CONVERSION REPORT FOR EFSA system FoodEx 2 EXCEL SHEET
------------------------------------------------------
@@ -361,9 +389,19 @@ Below a list with the most (more than '''+str(
'''+mismatch_table_string+r'''
Substance conversion duplicates
===============================
'''+double_types.to_markdown(showindex=False)+r'''
'''
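# Note (editorial, not part of this commit): to_markdown() renders the frame as a
# GitHub-style pipe table and relies on the 'tabulate' package being installed.
# The 'showindex' keyword is passed through to tabulate; newer pandas releases
# expose this as the 'index' parameter instead and may reject 'showindex':
#
#   double_types.to_markdown(index=False)  # pandas >= 1.1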
dataset.report.save()
# Also save the duplicates table as a separate 'Substances' sheet in the report
# workbook (a short sketch of this append pattern follows below).
with pd.ExcelWriter(dataset.report.file.path, mode='a') as writer:
    double_types.to_excel(writer,
                          index=False, sheet_name='Substances')
dataset.report.close(auto_report=False)
dataset.close()
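# Sketch (editorial illustration, not part of this commit) of the append pattern
# used for the 'Substances' sheet above: ExcelWriter(mode='a') opens an existing
# .xlsx and adds a sheet instead of overwriting the workbook. Append mode needs
# the openpyxl engine and an existing file; the file name here is made up.
#
#   import pandas as pd
#   df = pd.DataFrame({'idSubstance': ['S1'], 'idFoodUnProcessed': ['F1']})
#   df.to_excel('Report.xlsx', sheet_name='Report', index=False)
#   with pd.ExcelWriter('Report.xlsx', mode='a', engine='openpyxl') as writer:
#       df.to_excel(writer, sheet_name='Substances', index=False)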
@@ -71,6 +71,34 @@ class McraAccessor:
        self._obj = self._obj.merge(df, left_index=True, right_index=True)
        return self._obj

    def join(self, name, join_left, join_right, sep='-'):
        '''
        Combine two columns into a new column named `name`,
        joining the values with `sep` and skipping missing values.
        '''
        # Due to the SettingWithCopyWarning we do it a bit cumbersome:
        # work on a copy of the two columns, then merge the result back.
        df = pd.DataFrame()
        df[[join_left, join_right]] = self._obj[[join_left, join_right]]
        df[name] = df.loc[:, (join_left, join_right)].apply(
            lambda x: sep.join(x.dropna()), axis=1)
        df = df.drop([join_left, join_right], axis=1)
        # Not ideal yet, but slightly better than it used to be....
        self._obj = self._obj.merge(df, left_index=True, right_index=True)
        return self._obj
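    # Usage sketch (editorial, not part of this commit), with made-up data:
    #
    #   df = pd.DataFrame({'idSubstance': ['S1', 'S2', None],
    #                      'idFoodUnProcessed': ['F1', None, 'F3']})
    #   df = df.mcra.join(name='idSubstanceFoodProc',
    #                     join_left='idSubstance',
    #                     join_right='idFoodUnProcessed')
    #   # df['idSubstanceFoodProc'] -> 'S1-F1', 'S2', 'F3' (missing values skipped)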
    def dump(self, filename):
        '''
        For debugging purposes: dump the DataFrame to a file, with the
        format chosen from the file extension (.csv, .tsv or .xlsx).
        '''
        _, ext = os.path.splitext(filename)
        print('Dump file : {file}.'.format(file=filename))
        if ext == '.csv':
            self._obj.to_csv(path_or_buf=filename, index=False)
        elif ext == '.tsv':
            self._obj.to_csv(path_or_buf=filename, index=False, sep='\t')
        elif ext == '.xlsx':
            self._obj.to_excel(filename, sheet_name='Dump', index=False)
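    # Usage sketch (editorial, not part of this commit): the file extension picks
    # the output format; the path below is only an example.
    #
    #   efsa_combined.mcra.dump(r'.\Output\dump.xlsx')  # Excel, sheet 'Dump'
    #   efsa_combined.mcra.dump('dump.tsv')             # tab-separated text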
class DataFile:
'''
@@ -453,7 +481,7 @@ class DataSet:
file=dataset.file.path,
props=dataset.properties))
# High verbosity, dump data.
- self.verbose(3, dataset.sheet)
+ # self.verbose(3, dataset.sheet)
else:
# It is an Output file
base, ext = os.path.splitext(datasetfilename)