Biometris / MCRA.DataConversionTools / Commits

Commit 63a099d9, authored Jul 07, 2021 by Hans van den Heuvel
Created dataconversion; made API more simple
Parent: 5da6afe7
Changes: 11 files
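As context for the "made API more simple" message: the conversion scripts now drive everything through the renamed dataconversion module. The sketch below only strings together the calls that appear in the diffs on this page (DataSet, add, init, close); the file names and the bare-bones flow are illustrative, not an exact copy of any one script.

import dataconversion

# Describe the dataset and its command-line interface.
dataset = dataconversion.DataSet(
    opening='(c) 2021 Biometris, Wageningen University and Research.',
    description='Example conversion script.')
# Declare an output table; it becomes reachable as dataset.substances.
dataset.add(
    name='substances',
    short_argument='-s',
    help='The (output) substances file (csv).',
    default_name='Substances.csv',
    default_dir='Output')
# Parse the command line and open the declared files.
dataset.init()
# ... fill dataset.substances.sheet with a pandas DataFrame here ...
# Save every output sheet and, if requested, write a single report.
dataset.close(file_report=True)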
Convert-DTUCAG/.gitignore
@@ -3,6 +3,4 @@
 # User-specific files
 # Dirs
-Input/
-Output/
 __pycache__/
\ No newline at end of file
Convert-DTUCAG/Convert-DTUCAG.py
@@ -3,7 +3,7 @@
 # Phase 0. Initialization
 # Doing stuff like parsing arguments, and reading the files.
 #
-import mcra
+import dataconversion
 import pandas as pd
 from datetime import datetime
 import textwrap
@@ -17,7 +17,7 @@ def print_as_link(text):
 # These are the files we work with
 # Create list
-dataset = mcra.DataSet(
+dataset = dataconversion.DataSet(
     opening='(c) ' + datetime.now().strftime('%Y')
     + ' Biometris, Wageningen University and Research.',
     description='Converts the EFSA CAPEG database Excel sheet into MCRA '
@@ -60,12 +60,11 @@ dataset.add(
     default_dir='Output')
 #
 dataset.add(
-    name='compounds',
+    name='substances',
     short_argument='-s',
     help='The (output) substances file - '
     + 'format: csv (Comma Seperated).',
-    default_name='Compounds.zip',
-    # default_name='Compounds.csv',
+    default_name='Substances.csv',
     default_dir='Output')
 #
@@ -74,27 +73,35 @@ dataset.add(
 dataset.init()
 # To abbreviate
 capeg = dataset.capeg.sheet
 # We need to clean up the table firstly
 # Remove all CasNumbers with na
 capeg.drop(capeg.loc[capeg['casNumber'] == 'na'].index, inplace=True)
 # FIRST The effects table
 # Add the fields for the effects table
 capeg.mcra.addcolumn(
-    {'idEffect', 'Name', 'Description', 'Reference'})
+    {'idEffect', 'Name', 'Description', 'Reference', 'targetL1'})
+# Create extra colum for proper CAS1 names
+capeg['targetL1'] = capeg['target_CAG1'].str.split().str[0].str.strip()
+capeg['targetL1'].replace('Bones', 'Skeleton', inplace=True)
+capeg['targetL1'].replace('Bone', 'Bone marrow', inplace=True)
 # Create tempcopy
 capeg2 = capeg.copy(deep=True)
 # Fill the idEffectA and idEffectC eg. L1-Liver-Acute
 capeg['idEffect'] = 'L1-' + \
-    capeg['target_CAG1'].str.split().str[0].str.strip() + '-Acute'
+    capeg['targetL1'].str.split().str[0].str.strip() + '-Acute'
 capeg2['idEffect'] = 'L1-' + \
-    capeg2['target_CAG1'].str.split().str[0].str.strip() + '-Chronic'
+    capeg2['targetL1'].str.split().str[0].str.strip() + '-Chronic'
 # Description
 capeg['Description'] = 'Acute adverse effects on ' + \
-    capeg['target_CAG1'].str.lower() + '.'
+    capeg['targetL1'].str.lower() + '.'
 capeg2['Description'] = 'Chronic adverse effects on ' + \
-    capeg2['target_CAG1'].str.lower() + '.'
+    capeg2['targetL1'].str.lower() + '.'
 # Combine the sheets, append the second after the first
 capeg = capeg.append(capeg2, ignore_index=True)
 # Set the name
-capeg['Name'] = capeg['target_CAG1']
+capeg['Name'] = capeg['targetL1']
 # Set the reference
 capeg['Reference'] = ''
@@ -105,10 +112,10 @@ effects_header = ['idEffect', 'CodeSystem', 'Name', 'Description',
                   'KeyEventCell', 'AOPwikiKE', 'Reference']
 dataset.effects.sheet = capeg.drop_duplicates(
-    subset=['idEffect'], ignore_index=True)[
-    ['idEffect', 'Name', 'Description', 'Reference']]
+    subset=['idEffect'], ignore_index=True)[
+    ['idEffect', 'Name', 'Description', 'Reference']]
-dataset.effects.close(header=effects_header)
+dataset.effects.sheet.mcra.keepcolumn(effects_header)
 # SECOND The Assessment group membership models table
 # Remove and add used columns to clear them
@@ -119,11 +126,11 @@ capeg.mcra.addcolumn(['id', 'Name', 'Description', 'Reference'])
 capeg['id'] = 'AG1-' + \
     capeg['idEffect'].str.split('-').str[1:].str.join('-')
 # Name
-capeg['Name'] = 'CAG ' + capeg['target_CAG1'].str.lower()
+capeg['Name'] = 'CAG ' + capeg['targetL1'].str.lower()
 # Description
 capeg['Description'] = \
     'Cummulative assesment group for adverse effects on ' + \
-    capeg['target_CAG1'].str.lower() + '.'
+    capeg['targetL1'].str.lower() + '.'
 # Reference
 capeg['Reference'] = 'https://doi.org/10.2903/sp.efsa.2012.EN-269'
@@ -132,10 +139,10 @@ agmm_header = ['id', 'Name', 'Description', 'idEffect', 'Accuracy',
                'Sensitivity', 'Specificity', 'Reference']
 dataset.agmm.sheet = capeg.drop_duplicates(
-    subset=['id'], ignore_index=True)[
+    subset=['id'], ignore_index=True)[
     ['id', 'idEffect', 'Name', 'Description', 'Reference']]
-dataset.agmm.close(header=agmm_header)
+dataset.agmm.sheet.mcra.keepcolumn(agmm_header)
 # THIRD The Substances table
 # Remove and add used columns to clear them
@@ -148,13 +155,14 @@ capeg['idSubstance'] = capeg['casNumber']
 capeg['Name'] = capeg['chemicalName']
 # Done, now wrap this table up
-compounds_header = ['idSubstance', 'Name', 'Description',
-                    'ConcentrationUnit', 'CramerClass', 'MolecularMass']
+substances_header = ['idSubstance', 'Name', 'Description',
+                     'ConcentrationUnit', 'CramerClass', 'MolecularMass']
-dataset.compounds.sheet = capeg.drop_duplicates(
-    subset=['idSubstance'], ignore_index=True)[
+dataset.substances.sheet = capeg.drop_duplicates(
+    subset=['idSubstance'], ignore_index=True)[
     ['idSubstance', 'Name']]
-dataset.compounds.close(header=compounds_header)
+dataset.substances.sheet.mcra.keepcolumn(substances_header)
 # FOURTH The Assessment group memberships table
 # Remove and add used columns to clear them
@@ -170,7 +178,7 @@ agm_header = ['idGroupMembershipModel', 'idSubstance', 'GroupMembership']
 dataset.agm.sheet = capeg[agm_header].drop_duplicates()
-dataset.agm.close(header=agm_header)
+dataset.agm.sheet.mcra.keepcolumn(agm_header)
 # DONE
-dataset.close()
+dataset.close(file_report=True)
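For reference, a toy illustration (made-up values, not taken from the CAPEG data) of the targetL1 derivation used above: keep the first word of target_CAG1, then rename two special cases. The script itself does the renaming with two in-place replace() calls; this sketch uses a single dict-based replace for brevity.

import pandas as pd

capeg = pd.DataFrame({'target_CAG1': ['Liver toxicity', 'Bones', 'Bone']})
capeg['targetL1'] = capeg['target_CAG1'].str.split().str[0].str.strip()
capeg['targetL1'] = capeg['targetL1'].replace(
    {'Bones': 'Skeleton', 'Bone': 'Bone marrow'})
print(capeg['targetL1'].tolist())  # ['Liver', 'Skeleton', 'Bone marrow']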
Convert-DTUCAG/Example/capeg_20210706_13492613.xls
No preview for this file type
Convert-DTUCAG/Input/.gitignore (new file, 0 → 100644)
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore
\ No newline at end of file
Convert-DTUCAG/Output/.gitignore (new file, 0 → 100644)
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore
\ No newline at end of file
Convert-DTUCAG/mcra.py → Convert-DTUCAG/dataconversion.py (renamed)
@@ -14,10 +14,13 @@ import numpy as np
 import math
 import sys
 import textwrap
+import getpass
 # For debugging purposes
 # from objbrowser import browse
 PY_INDENT = '    '
+thisyear = datetime.now().strftime('%Y')
 
 @pd.api.extensions.register_dataframe_accessor('mcra')
@@ -50,9 +53,20 @@ class McraAccessor:
         To easily add a bunch of empty columns
         '''
         for col in columnnames:
-            if not col in self._obj.columns:
+            if col not in self._obj.columns:
                 self._obj[col] = ''
+
+    def keepcolumn(self, columnnames):
+        '''
+        To easily format to fixed set of columns
+        '''
+        # Add missing ones, making them empty
+        for col in columnnames:
+            if col not in self._obj.columns:
+                self._obj[col] = ''
+        # Only retain the ones given.
+        self._obj = self._obj[columnnames]
 
     def splitjoin(self, name, split, join, split_sep='-', right_split=True,
                   join_sep='-'):
         '''
@@ -67,7 +81,7 @@ class McraAccessor:
         df[split] = self._obj[split].str.rsplit(split_sep, n=1).str[0]
         df[name] = df.loc[:, (join, split)].apply(
-            lambda x: '-'.join(x.dropna()), axis=1)
+            lambda x: join_sep.join(x.dropna()), axis=1)
         df = df.drop([join, split], axis=1)
         # Not ideal yet, but slightly better than it used to be....
         self._obj = self._obj.merge(df, left_index=True, right_index=True)
@@ -228,43 +242,31 @@ class DataSheet:
         self.sheet = None
         self.checksum = checksum
         self.properties = ''
-        self._report = ''
+        self.report = ''
         self.direction = direction
         self.autoload = autoload
         self.closed = False
 
-    @property
-    def report(self):
-        return self._report
-
-    @report.setter
-    def report(self, report):
-        '''
-        Setting the report property adds text
-        '''
-        self._report = self._report + report
-
-    def make_report(self):
-        temp = self._report
+    def get_report(self):
         report = ''
-        report += '* {dir} file: {file}\n'.format(
-            dir=self.direction, file=os.path.split(self.file.path)[1])
-        report += textwrap.indent('* [{path}]({path})\n'.format(
-            path=self.file.path), PY_INDENT)
-        report += textwrap.indent('* {props}\n'.format(
-            props=self.properties), PY_INDENT)
-        report += textwrap.indent('* Modified: {mod}\n'.format(
-            mod=self.file.modified), PY_INDENT)
-        report += textwrap.indent(
-            '* File size: {size_str} ({size} B)\n'.format(
-                size_str=self.file.size_string, size=self.file.size),
-            PY_INDENT)
-        report += textwrap.indent('* Hash: {hash}\n\n'.format(
-            hash=self.file.hash), PY_INDENT)
-        self._report = report + temp
-
-    def clear_report(self):
-        self._report = ''
+        if self.direction == 'Input' or \
+           (self.direction == 'Output' and self.closed):
+            report += '* {dir} file: {file}\n'.format(
+                dir=self.direction, file=os.path.split(self.file.path)[1])
+            report += textwrap.indent('* [{path}]({path})\n'.format(
+                path=self.file.path), PY_INDENT)
+            report += textwrap.indent('* {props}\n'.format(
+                props=self.properties), PY_INDENT)
+            report += textwrap.indent('* Modified: {mod}\n'.format(
+                mod=self.file.modified), PY_INDENT)
+            report += textwrap.indent(
+                '* File size: {size_str} ({size} B)\n'.format(
+                    size_str=self.file.size_string, size=self.file.size),
+                PY_INDENT)
+            report += textwrap.indent('* Hash: {hash}\n'.format(
+                hash=self.file.hash), PY_INDENT)
+        return report
 
     def update_properties(self):
         if self.sheet is not None:
@@ -301,7 +303,7 @@ class DataSheet:
         elif self.file.extension == '.xls':
             # Suppress warnings
             wb = xlrd.open_workbook(self.file.path,
-                                    logfile=open(os.devnull, 'w'))
+                                    logfile=open(os.devnull, 'w'))
             self.sheet = pd.read_excel(wb, engine='xlrd')
         elif self.file.extension == '.md':
             f = open(self.file.path, 'r')
@@ -309,9 +311,9 @@ class DataSheet:
             f.close()
         else:
             # Error here
-            print('  COULD NOT READ {file}'.format(file=self.file.path))
+            print('  COULD NOT READ {file}- unknown extenstion.'.format(
+                file=self.file.path))
         self.update_properties()
-        self.make_report()
 
     def save(self, **kwargs):
         if self.file.extension == '.csv':
@@ -333,13 +335,14 @@ class DataSheet:
                 self.file.path, sheet_name=self.file.default_base, **kwargs)
         else:
             print('  COULD NOT WRITE {file} - unknown extenstion.'.format(
                 file=self.file.path))
         self.update_properties()
 
-    def close(self, header=False, auto_report=True, also_save=True):
+    def close(self, header=False, auto_report=False, also_save=True):
         '''
-        If auto_report is False, no report on the object will me made.
-        If the report contains no content, it will not be created as file.
-        If however, you added something to the report, it WILL be created.
+        If auto_report is False, no automatic report will be made.
         '''
         if header:
             # Make a sheet with the specified header in that order
@@ -347,14 +350,16 @@ class DataSheet:
             self.sheet = self.sheet[header]
         if also_save:
             self.save()
         self.closed = True
         self.file.update()
         self.update_properties()
         if auto_report:
-            self.make_report()
-        if len(self.report) > 0:
-            # Save report
-            with open(self.file.reportpath, 'w+') as f:
-                f.write(self.report)
+            self.report += self.get_report()
+        # We are no longer creating an md-report per file
+        # if len(self.report) > 0:
+        #     # Save report
+        #     with open(self.file.reportpath, 'w+') as f:
+        #         f.write(self.report)
         print('Output file: {file}; {props}'.format(
             file=self.file.path, props=self.properties))
@@ -365,6 +370,10 @@ class DataSet:
         self.args = None
         self.parser = ArgumentParser(
             description=description, epilog=epilog)
+        self.parser.add_argument(
+            '-r', '--report', nargs='?', const='Output\\Report.md',
+            help='Creates a report file (default: %(const)s).')
         # The verbosity argument will accept: -v, or -vv, -vvv etc.
         # Set default to 1, so that basic output will always appear.
         self.parser.add_argument(
@@ -373,11 +382,22 @@ class DataSet:
         self.parser.add_argument(
             '-x', '--example', action='store_const', const='Example',
             help='Uses input files from the %(const)s subdir.')
         self.parser.add_argument(
             '-z', '--zip', nargs='?', const='Output\\Output.zip',
             help='Creates a zip file %(const)s containing all output.'
             + ' (default: %(const)s).')
         if '-v' in sys.argv or '--verbosity' in sys.argv:
             print(opening)
         self.list = []
+        # Whether or not to create a zip file
+        self.zip = None
+        # The report for the entire dataset
+        self.report = ''
+        self.runtime = datetime.now().strftime('%H:%M:%S, %d %b %Y')
+        self.runcommand = ' '.join(sys.argv)
+        self.runarguments = ' '.join(sys.argv[1:])
+        self.scriptname = os.path.split(sys.argv[0])[1]
+        self.runuser = getpass.getuser()
         # It is usefull to be able to iterate over all the datasheets.
         # Basically, avoid using .list. in all DataSet references.
@@ -412,7 +432,7 @@ class DataSet:
         # But we must do some bookkeeping
         self.list.append(name)
-        long_argument = '--' + name + '_file'
+        long_argument = '--' + name + '_file'
         if type(help) == str and help is not SUPPRESS:
             help = help + ' (default: {default})'.format(default=default_name)
@@ -500,34 +520,75 @@ class DataSet:
         else:
             # It is an Ouput file
             base, ext = os.path.splitext(datasetfilename)
             if ext == '.zip':
                 # In case of zip file, we will make a csv
                 datasetfilename = base + '.csv'
                 # and also put everything into a zip file
-                dataset.file.suggest(datasetfilename)
-                dataset.update_properties()
-                basezip, extzip = os.path.splitext(dataset.file.path)
+                dataset.file.suggest(datasetfilename)
+                dataset.update_properties()
+                if self.args.zip:
+                    # Create a zip file containing everything
+                    basezip, extzip = os.path.splitext(self.args.zip)
+                    # Setting self.zip indicates creating a zip file
                     self.zip = basezip + '.zip'
             else:
                 dataset.file.suggest(datasetfilename)
                 dataset.update_properties()
 
-    def close(self):
+    def save(self):
         for data in self:
             if data.direction == 'Output':
                 if not data.closed:
                     data.close(auto_report=False, also_save=True)
 
+    def close(self, file_report=False, save=True):
+        '''
+        Method to close the dataset.
+        Most importantly save files.
+        '''
+        report_content = ''
+        if file_report:
+            report_content += f'* Script: {self.scriptname}\n'
+            report_content += textwrap.indent(
+                f'* Command line: {self.runcommand}\n', PY_INDENT)
+            report_content += textwrap.indent(
+                f'* Arguments: {self.runarguments}\n', PY_INDENT)
+            report_content += textwrap.indent(
+                f'* Executed at: {self.runtime}\n', PY_INDENT)
+            report_content += textwrap.indent(
                f'* Executed by: {self.runuser}\n', PY_INDENT)
+            for data in self:
+                if data.direction == 'Input':
+                    report_content += data.get_report()
+        # Closing every sheet first
+        for data in self:
+            if data.direction == 'Output':
+                if not data.closed:
+                    data.close(auto_report=file_report, also_save=save)
+        if self.args.report:
+            # Collect reports per sheet.
+            for data in self:
+                report_content += data.report
+        if len(report_content) > 0:
+            # Report contains information.
+            if len(self.report) > 0:
+                self.report += '\n'
+            self.report += report_content
+        if len(self.report) > 0:
+            # Save report
+            with open(self.args.report, 'w+') as f:
+                f.write(self.report)
+            self.verbose(1, 'Output file: {file}, containing report on output.'
+                         .format(file=self.args.report))
         if self.zip:
             # All output files will be added to the zip file
-            self.verbose(1, 'Output file: {file}, containing all output.'
-                         .format(file=self.zip))
             zip = zipfile.ZipFile(self.zip, 'w')
             for data in self:
                 if data.direction == 'Output':
-                    filename = os.path.split(data.file.reportpath)[1]
-                    zip.write(data.file.reportpath, filename)
                     filename = os.path.split(data.file.path)[1]
                     zip.write(data.file.path, filename)
             if self.args.report:
                 filename = os.path.split(self.args.report)[1]
                 zip.write(self.args.report, filename)
             zip.close()
+            self.verbose(1, 'Output file: {file}, containing all output.'
+                         .format(file=self.zip))
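A short sketch of the DataFrame-accessor pattern this module relies on: pd.api.extensions.register_dataframe_accessor('mcra') exposes the helper methods above as df.mcra.<method> on any DataFrame. The sketch assumes dataconversion.py is importable and uses a made-up frame; see the addcolumn/keepcolumn definitions in the diff for the exact behaviour.

import pandas as pd
import dataconversion  # importing the module registers the 'mcra' accessor

df = pd.DataFrame({'idSubstance': ['50-00-0'], 'Name': ['Formaldehyde']})
# addcolumn() creates any missing columns as empty strings.
df.mcra.addcolumn({'Description', 'Reference'})
# keepcolumn() also fills in missing columns, then restricts the accessor's
# view of the frame to exactly the listed columns, in that order.
df.mcra.keepcolumn(['idSubstance', 'Name', 'Description'])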
Convert-EUProcessingFactorsDB/Convert-EUProcessingFactorsDB.py
@@ -436,5 +436,4 @@ with pd.ExcelWriter(dataset.report.file.path, mode='a') as writer:
     sheet_name='Substances')
-dataset.report.close(auto_report=False, also_save=False)
 dataset.close()
Convert-EUProcessingFactorsDB/Convert-Simple.py
@@ -132,7 +132,7 @@ header = [
     'idFoodUnProcessed', 'FoodUnprocessedName', 'idProcessingType',
     'ProcessingName', 'idSubstance', 'SubstanceName',
-    'FoodProcessedName',
+    'FoodProcessedName',
     'Nominal', 'Upper', 'NominalUncertaintyUpper', 'UpperUncertaintyUpper',
     'Study Reference'
Convert-EUProcessingFactorsDB/Convert-Simpler.py (new file, 0 → 100644)

#!/usr/bin/python
#############################################################################
# Phase 0. Initialization
# Doing stuff like parsing arguments, and reading the files.
#
from dataconversion import DataSet, PY_INDENT, thisyear
import pandas as pd
import textwrap

# These are the files we work with
# Create list
dataset = DataSet(
    opening='(c) ' + thisyear
    + ' Biometris, Wageningen University and Research.',
    description='Creates an MCRA dataset from the '
    + 'Processing Factors database on EFSA Zendono.',
    epilog='For example: use %(prog)s -v -x for a verbose example.')
#
# URL source file
efsa_url = 'https://zenodo.org/record/1488653/files/' \
    + 'EU_Processing_Factors_db_P.xlsx.xlsx?download=1'
#
# The input files
dataset.add(
    name='efsa',
    short_argument='-e',
    help='The EFSA Zendono Excel sheet (.xlsx); either file or URL. ',
    checksum='f816bf3928431d54f9d15fb134cc9106',
    default_name=efsa_url,
    default_dir='Input',
    direction='Input',
    autoload=False)
# No autoload, because sheet is complex
#
# The output files
dataset.add(
    name='processing_factor',
    short_argument='-p',
    help='The (output) processing factor file - '
    + 'format: csv (Comma Seperated).',
    default_name='ProcessingFactors.csv',
    default_dir='Output')
#
dataset.add(
    name='references',
    short_argument='-f',
    help='The (output) references file - '
    + 'format: csv (Comma Seperated).',
    default_name='References.csv',
    default_dir='Output')
#
#############################################################################
# Phase 1. Load data
dataset.init()
# Load the data
# Manually load the EFSA sheet, because the data is in a non-trivial place
efsa_sheet = 2
efsa_version = pd.read_excel(
    dataset.efsa.file.path, sheet_name=efsa_sheet,
    nrows=1, header=None).iloc[0, 0]
dataset.efsa.load(sheet_name=efsa_sheet, header=4)
dataset.verbose(1, 'Input file : {file}; {version}; {props}'.format(
    file=dataset.efsa.file.path, props=dataset.efsa.properties,
    version=efsa_version))
#
# Also reading the ProcStudies Evaluation; using panda directly
# Ok here, because it comes from same file, although not preferred
efsa_procstudies = pd.read_excel(dataset.efsa.file.path, sheet_name=1)
# ... and the References, go directly into the sheet.
dataset.references.sheet = pd.read_excel(dataset.efsa.file.path, sheet_name=3)

#############################################################################
# Phase 2. Processing the data.
# Let's first attack the efsa sheet, abbreviate to make life easier
efsa = dataset.efsa.sheet
# First let's copy the columns which we want in the output unaltered so far
efsa.mcra.copycolumn({
    'Matrix Code': 'idFoodUnProcessed',
    'Raw Primary Commodity': 'FoodUnprocessedName',
    'KeyFacets Code': 'idProcessingType',
    'KeyFacets Interpreted': 'ProcessingName',
    'Matrix FoodEx2 Code': 'Matrix FoodEx2 Code',
    'Matrix Code Interpreted': 'FoodProcessedName',
    'ParamCode Active Substance': 'idSubstance',
    'ParamName Active Substance': 'SubstanceName',
    'Median PF': 'Nominal'})
#
# Then let's add columns which will be empty
# so to be able to create a proper output file
efsa.mcra.addcolumn({
    'Upper', 'NominalUncertaintyUpper', 'UpperUncertaintyUpper'})
# Combine with references
efsa_procstudies = efsa_procstudies.astype('str')
refs = efsa_procstudies.groupby(
    ['Matrix FoodEx2 Code', 'Study Reference']
    ).size().reset_index().sort_values(by=['Study Reference'])
refs = refs[['Matrix FoodEx2 Code', 'Study Reference']]
refs = refs.groupby(['Matrix FoodEx2 Code']).agg(
    lambda column: ", ".join(column))
efsa = efsa.merge(
    # Left join with processing type sheet,
    refs,
    left_on='Matrix FoodEx2 Code', right_on='Matrix FoodEx2 Code',
    how='left').assign()
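The "Combine with references" step above collapses all Study References that share a Matrix FoodEx2 Code into one comma-separated string before the left join. A toy illustration with made-up codes:

import pandas as pd

procstudies = pd.DataFrame({
    'Matrix FoodEx2 Code': ['A000L', 'A000L', 'A001C'],
    'Study Reference': ['Study 1', 'Study 2', 'Study 3']})
refs = procstudies.groupby(
    ['Matrix FoodEx2 Code', 'Study Reference']
    ).size().reset_index().sort_values(by=['Study Reference'])
refs = refs[['Matrix FoodEx2 Code', 'Study Reference']]
refs = refs.groupby(['Matrix FoodEx2 Code']).agg(
    lambda column: ", ".join(column))
print(refs.loc['A000L', 'Study Reference'])  # Study 1, Study 2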