Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Biometris
MCRA.DataConversionTools
Commits
e8219d11
Commit
e8219d11
authored
Aug 17, 2021
by
Hans van den Heuvel
Browse files
Merged two scripts into one. Issue 1087.
parent
8d082b20
Changes
4
Hide whitespace changes
Inline
Side-by-side
Convert-EUProcessingFactorsDB/Compile-EUProcessingFactorsDB_MCRA.py
View file @
e8219d11
#!/usr/bin/python
__version_info__
=
(
'1'
,
'
0
'
,
'
4
'
)
__version_info__
=
(
'1'
,
'
1
'
,
'
0
'
)
__version__
=
'.'
.
join
(
__version_info__
)
#############################################################################
...
...
@@ -8,6 +8,7 @@ __version__ = '.'.join(__version_info__)
# Doing stuff like parsing arguments, and reading the files.
#
from
dataconversion
import
DataSet
,
PY_INDENT
,
thisyear
from
datetime
import
datetime
import
pandas
as
pd
import
textwrap
...
...
@@ -53,6 +54,46 @@ dataset.add(
inzip
=
True
,
# Copy this file into the zip
direction
=
'Input'
)
#
dataset
.
add
(
name
=
'processing_type'
,
short_argument
=
'-t'
,
help
=
'The (input) processing type file - '
+
'format: csv (Comma Seperated).'
,
default_name
=
None
,
necessary
=
False
,
default_dir
=
'Input'
,
direction
=
'Input'
)
#
dataset
.
add
(
name
=
'processing_translation'
,
short_argument
=
'-q'
,
help
=
'The (input) processing translation file - '
+
'format: csv (Comma Seperated).'
,
necessary
=
False
,
default_name
=
None
,
default_dir
=
'Input'
,
direction
=
'Input'
)
#
dataset
.
add
(
name
=
'food_translation'
,
short_argument
=
'-f'
,
help
=
'The (input) food translation file - '
+
'format: csv (Comma Seperated).'
,
default_name
=
None
,
necessary
=
False
,
default_dir
=
'Input'
,
direction
=
'Input'
)
#
dataset
.
add
(
name
=
'food_composition'
,
short_argument
=
'-g'
,
help
=
'The (input) food composition file - '
+
'format: xlsx (Excel), file not required.'
,
default_name
=
None
,
necessary
=
False
,
default_dir
=
'Input'
,
direction
=
'Input'
)
#
# The output files
# The (main) processing factors table
...
...
@@ -63,11 +104,18 @@ dataset.add(
+
'format: csv (Comma Seperated).'
,
default_name
=
'ProcessingFactors.csv'
,
default_dir
=
'Output'
)
#
dataset
.
add
(
name
=
'mismatches'
,
short_argument
=
'-m'
,
default_name
=
None
,
necessary
=
False
,
inzip
=
True
,
default_dir
=
'Output'
)
# References
dataset
.
add
(
name
=
'references'
,
short_argument
=
'-
f
'
,
short_argument
=
'-
b
'
,
help
=
'The (output) references file - '
+
'format: csv (Comma Seperated).'
,
default_name
=
'References.csv'
,
...
...
@@ -87,10 +135,7 @@ efsa_version = pd.read_excel(
nrows
=
1
,
header
=
None
).
iloc
[
0
,
0
]
dataset
.
efsa
.
load
(
sheet_name
=
efsa_sheet
,
header
=
4
)
dataset
.
verbose
(
1
,
'Input file : {file}; {version}; {props}'
.
format
(
file
=
dataset
.
efsa
.
file
.
path
,
props
=
dataset
.
efsa
.
properties
,
version
=
efsa_version
))
dataset
.
verbose
(
1
,
f
'Input file :
{
dataset
.
efsa
.
file
.
path
}
;
{
efsa_version
}
;
{
dataset
.
efsa
.
properties
}
'
)
#
# Also reading the ProcStudies Evaluation; using pandas directly
# Ok here, because it comes from same file, although not preferred
...
...
@@ -108,7 +153,6 @@ efsa = dataset.efsa.sheet
efsa
.
mcra
.
copycolumn
({
'Matrix Code'
:
'idFoodUnProcessed'
,
'Raw Primary Commodity'
:
'FoodUnprocessedName'
,
'KeyFacets Code'
:
'idProcessingType'
,
'KeyFacets Interpreted'
:
'ProcessingName'
,
'Matrix FoodEx2 Code'
:
'idFoodProcessed'
,
'Matrix Code Interpreted'
:
'FoodProcessedName'
,
...
...
@@ -117,12 +161,43 @@ efsa.mcra.copycolumn({
'Median PF'
:
'Nominal'
})
# See whether we can do something with Substance translation
if
not
dataset
.
exists
(
'substance_translation'
):
# No substance translation? Just copy column
efsa
.
mcra
.
copycolumn
(
{
'ParamCode Active Substance'
:
'idSubstance'
})
#
if
dataset
.
exists
(
'processing_translation'
)
and
dataset
.
exists
(
'food_translation'
):
dataset
.
verbose
(
3
,
f
'Using sheet: Processing Translation.'
)
dataset
.
verbose
(
3
,
f
'Using sheet: Food Translation.'
)
efsa
=
dataset
.
efsa
.
sheet
.
merge
(
# Left join on all the rows from the EFSA sheet
# that have a Keyfacets Code in dataset.processing_translation.sheet
dataset
.
processing_translation
.
sheet
,
left_on
=
'KeyFacets Code'
,
right_on
=
'FromFC'
,
how
=
'left'
).
merge
(
# Left join with both FoodEx2 and Matrix code
# on the food_translation file
dataset
.
food_translation
.
sheet
,
left_on
=
[
'Matrix FoodEx2 Code'
,
'Matrix Code'
],
right_on
=
[
'FromFX'
,
'FXToRpc'
],
how
=
'left'
).
assign
(
)
# idProcessingType
#
# If 'FCToProcType' contains a value, then make a new field
# 'idProcessingType', with the value from 'FCToProcType'
efsa
.
loc
[
(
efsa
[
'FCToProcType'
].
notna
()),
'idProcessingType'
]
=
efsa
[
'FCToProcType'
]
# If 'FCToProcType' does not contain a value and 'FXToProcType' does
# then make a new field, 'idProcessingType'
# with the value from 'FXToProcType'
efsa
.
loc
[
(
efsa
[
'FCToProcType'
].
isna
()
&
efsa
[
'FXToProcType'
].
notna
()),
'idProcessingType'
]
=
efsa
[
'FXToProcType'
]
#
else
:
# Just copy the column
efsa
.
mcra
.
copycolumn
({
'KeyFacets Code'
:
'idProcessingType'
})
# See whether we can do something with Substance translation
if
dataset
.
exists
(
'substance_translation'
):
dataset
.
verbose
(
3
,
f
'Using sheet: Substance Translation.'
)
if
'CASNumber'
in
dataset
.
substance_translation
.
sheet
.
columns
:
# This is the "old" situation
# Strip dash (-) from the CASNumber column
...
...
@@ -146,7 +221,56 @@ else:
how
=
'left'
).
assign
()
# Copy ToCode to idSubstance column
efsa
.
mcra
.
copycolumn
({
'ToCode'
:
'idSubstance'
})
else
:
# No substance translation? Just copy column
efsa
.
mcra
.
copycolumn
(
{
'ParamCode Active Substance'
:
'idSubstance'
})
#
# Use the description of Processing Type
if
dataset
.
exists
(
'processing_type'
):
dataset
.
verbose
(
3
,
f
'Using sheet: Processing Type.'
)
efsa
=
efsa
.
merge
(
# Left join with processing type sheet,
dataset
.
processing_type
.
sheet
,
left_on
=
'idProcessingType'
,
right_on
=
'idProcessingType'
,
how
=
'left'
).
assign
()
# Copy column
efsa
.
mcra
.
copycolumn
(
{
'Description'
:
'MCRA_ProcessingType_Description'
})
# idFoodProcessed
# Just concat idFoodUnProcessed with idProcessingType with a dash
efsa
.
loc
[
(
efsa
[
'idProcessingType'
].
notna
()),
'idFoodProcessed'
]
=
efsa
[
'idFoodUnProcessed'
].
astype
(
str
)
\
+
'-'
+
efsa
[
'idProcessingType'
].
astype
(
str
)
else
:
efsa
.
mcra
.
addcolumn
({
'Description'
})
efsa
.
mcra
.
copycolumn
({
'Matrix FoodEx2 Code'
:
'idFoodProcessed'
})
if
dataset
.
exists
(
'food_composition'
):
dataset
.
verbose
(
3
,
f
'Using sheet: Food Composition.'
)
# We also have to do the food_composition translation
# First keep only rows whose idToFood starts with 'P' and whose idFromFood contains a dash
# Also use shorter name:
fcs
=
dataset
.
food_composition
.
sheet
[(
dataset
.
food_composition
.
sheet
[
'idToFood'
].
str
.
startswith
(
'P'
)
&
dataset
.
food_composition
.
sheet
[
'idFromFood'
].
str
.
contains
(
'-'
))]
fcs
=
fcs
.
mcra
.
splitjoin
(
name
=
'idToFood-PC'
,
split
=
'idFromFood'
,
join
=
'idToFood'
)
# Then a left join to combine
efsa
=
efsa
.
merge
(
# Left join with the filtered food composition sheet (fcs),
fcs
,
left_on
=
'idFoodProcessed'
,
right_on
=
'idToFood-PC'
,
how
=
'left'
).
assign
()
efsa
.
loc
[
(
efsa
[
'idToFood-PC'
].
notna
()
&
efsa
[
'idFoodProcessed'
].
str
.
contains
(
'-'
)),
'idFoodProcessed'
]
=
efsa
[
'idFromFood'
]
# Then let's add columns which will be empty
# so to be able to create a proper output file
efsa
.
mcra
.
addcolumn
({
'Upper'
,
...
...
@@ -177,16 +301,171 @@ header = [
'Nominal'
,
'Upper'
,
'NominalUncertaintyUpper'
,
'UpperUncertaintyUpper'
,
'Study Reference'
]
dataset
.
processing_factor
.
sheet
=
efsa
[
(
efsa
[
"idProcessingType"
]
!=
"-"
)
&
efsa
[
'idSubstance'
].
notna
()
][
header
]
if
dataset
.
exists
(
'processing_translation'
)
and
dataset
.
exists
(
'food_translation'
):
header
=
[
'idProcessingType'
,
'idSubstance'
,
'SubstanceName'
,
'idFoodProcessed'
,
'idFoodUnProcessed'
,
'FoodUnprocessedName'
,
'Nominal'
,
'Upper'
,
'NominalUncertaintyUpper'
,
'UpperUncertaintyUpper'
,
'KeyFacets Interpreted'
,
'Matrix Code Interpreted'
,
'MCRA_ProcessingType_Description'
,
'Study Reference'
]
dataset
.
processing_factor
.
sheet
=
efsa
[
(
efsa
[
'FCToProcType'
].
notna
()
|
efsa
[
'FXToProcType'
].
notna
())
&
efsa
[
'idSubstance'
].
notna
()][
header
]
else
:
header
=
[
'idFoodProcessed'
,
'Matrix Code Interpreted'
,
'idFoodUnProcessed'
,
'Matrix Code'
,
'RPC Code'
,
'FoodUnprocessedName'
,
'idProcessingType'
,
'KeyFacets Code'
,
'KeyFacets Interpreted'
,
'idSubstance'
,
'ParamCode Active Substance'
,
'SubstanceName'
,
'Nominal'
,
'Upper'
,
'NominalUncertaintyUpper'
,
'UpperUncertaintyUpper'
,
'Study Reference'
]
dataset
.
processing_factor
.
sheet
=
efsa
[
(
efsa
[
"idProcessingType"
]
!=
"-"
)
&
efsa
[
'idSubstance'
].
notna
()
][
header
]
#############################################################################
# Phase 3. Report about the data.
# This is the way to go if you want to know if a sheet exists...
# if dataset.exists('substance_translation'):
# print('Yahoo')
# Here's an auto generated report
if
dataset
.
exists
(
'mismatches'
,
sheet_too
=
False
):
dataset
.
verbose
(
3
,
f
'Build sheet: Mismatches.'
)
# Let's create a sheet with an overview over the codes.
# This is the sheet, only the ones lacking 'idProcessingType' (not exported)
mismatch_table
=
efsa
.
loc
[(
efsa
[
'idProcessingType'
].
isna
())]
report_sheet
=
mismatch_table
.
groupby
(
[
'KeyFacets Code'
,
'Matrix FoodEx2 Code'
]).
size
().
reset_index
(
name
=
'Size'
).
merge
(
mismatch_table
[[
'KeyFacets Code'
,
'Matrix FoodEx2 Code'
]].
groupby
(
'KeyFacets Code'
).
size
().
reset_index
(
name
=
'Size'
),
left_on
=
'KeyFacets Code'
,
right_on
=
'KeyFacets Code'
,
how
=
'left'
).
assign
()
report_sheet
.
rename
(
columns
=
{
'Size_y'
:
'Number of KeyFacets Codes'
,
'Size_x'
:
'Number of Matrix FoodEx2 Codes'
},
inplace
=
True
)
report_sheet
.
sort_values
(
by
=
[
'Number of KeyFacets Codes'
,
'KeyFacets Code'
,
'Number of Matrix FoodEx2 Codes'
,
'Matrix FoodEx2 Code'
],
ascending
=
False
,
inplace
=
True
)
report_sheet
=
report_sheet
.
merge
(
mismatch_table
[
[
'KeyFacets Code'
,
'KeyFacets Interpreted'
]].
drop_duplicates
(),
left_on
=
'KeyFacets Code'
,
right_on
=
'KeyFacets Code'
,
how
=
'left'
).
assign
()
report_sheet
=
report_sheet
.
merge
(
mismatch_table
[
[
'Matrix FoodEx2 Code'
,
'Matrix Code Interpreted'
,
'Matrix Code'
]].
drop_duplicates
(),
left_on
=
'Matrix FoodEx2 Code'
,
right_on
=
'Matrix FoodEx2 Code'
,
how
=
'left'
).
assign
()
# print(report_sheet)
min_nr_of_mismatches
=
int
(
5
)
mismatch_table_string
=
report_sheet
[
(
report_sheet
[
'Number of KeyFacets Codes'
]
>=
min_nr_of_mismatches
)
&
(
report_sheet
[
'Number of Matrix FoodEx2 Codes'
]
>=
min_nr_of_mismatches
)
].
to_markdown
(
index
=
False
)
# Dump the mismatch file
# We want a specific order in the columns:
header
=
[
'Matrix FoodEx2 Code'
,
'Matrix Code Interpreted'
,
'Matrix Code'
,
'KeyFacets Code'
,
'KeyFacets Interpreted'
,
'Number of Matrix FoodEx2 Codes'
,
'Number of KeyFacets Codes'
]
dataset
.
mismatches
.
sheet
=
report_sheet
[
header
]
# We also need some further text reporting:
# Let's make a new column of the combination
# of 'idSubstance' and 'idFoodUnProcessed'
# Rows for which a processing type translation was found (via either the
# KeyFacets code or the FoodEx2 code) and that have a substance id.
# NOTE: deliberately NOT stored in `mismatch_table`; that name still holds
# the rows lacking 'idProcessingType' and is used further down for the
# "Translations not matched" / "Unique ... codes" figures in the report.
translated_rows = efsa[
    (efsa['FCToProcType'].notna() | efsa['FXToProcType'].notna())
    & efsa['idSubstance'].notna()]
# Add the helper column 'idSubstanceFoodProc', the combination of
# 'idSubstance' and 'idFoodUnProcessed' (built by the project's mcra accessor).
translated_rows = translated_rows.mcra.join(
    name='idSubstanceFoodProc',
    join_left='idSubstance',
    join_right='idFoodUnProcessed')
# One row per (processing type, substance/food combination), keeping the
# first value of each descriptive column; the helper column is dropped again.
double_types = translated_rows.groupby(
    ['idProcessingType', 'idSubstanceFoodProc'],
    as_index=False).agg(
        {'idSubstance': 'first',
         'idFoodUnProcessed': 'first',
         'FoodUnprocessedName': 'first',
         'KeyFacets Interpreted': 'first',
         'Matrix Code Interpreted': 'first',
         'MCRA_ProcessingType_Description': 'first'}
    ).drop('idSubstanceFoodProc', axis=1)
#
#
dataset
.
mismatches
.
report
=
r
'''
CONVERSION REPORT FOR EFSA system FoodEx 2 EXCEL SHEET
------------------------------------------------------
Conversion run details
======================
* Date: '''
+
datetime
.
now
().
strftime
(
'%H:%M:%S, %d %b %Y'
)
+
r
'''
* Files:
'''
for
data
in
dataset
:
if
data
.
direction
==
'Input'
:
dataset
.
mismatches
.
report
=
textwrap
.
indent
(
data
.
report
,
PY_INDENT
)
for
datasetname
in
dataset
.
list
:
# Bit of a hack, figure out later how this can be properly done.
if
getattr
(
dataset
,
datasetname
).
direction
==
'Output'
\
and
datasetname
!=
'report'
:
dataset
.
mismatches
.
report
=
textwrap
.
indent
(
getattr
(
dataset
,
datasetname
).
report
,
PY_INDENT
)
dataset
.
mismatches
.
report
=
r
'''
EFSA Excel input details
========================
* Excel input: ['''
+
dataset
.
efsa
.
file
.
path
+
r
''']('''
+
dataset
.
efsa
.
file
.
path
+
r
''')
* '''
+
efsa_version
+
r
'''
* '''
+
dataset
.
efsa
.
properties
+
r
'''
* Modified: '''
+
dataset
.
efsa
.
file
.
modified
+
r
'''
* Processing type translation
* Number of rows matched '''
+
str
(
len
(
efsa
.
loc
[
efsa
[
'FCToProcType'
].
notna
()].
index
))
+
r
'''
* Food Translation
* Number of rows matched '''
+
str
(
len
(
efsa
.
loc
[
efsa
[
'FXToProcType'
].
notna
()].
index
))
+
r
'''
* All translations
* Number of rows matched '''
+
str
(
len
(
efsa
.
loc
[
efsa
[
'idProcessingType'
].
notna
()
].
index
))
+
r
'''
EFSA Excel Merge failures
=========================
* Number input rows '''
+
str
(
len
(
efsa
.
index
))
+
r
'''
* Translations matched '''
+
str
(
len
(
efsa
.
loc
[
efsa
[
'idProcessingType'
].
notna
()
].
index
))
+
r
'''
* Translations not matched '''
+
str
(
len
(
mismatch_table
.
index
))
+
r
'''
* Unique KeyFacets codes '''
+
str
(
mismatch_table
[
'KeyFacets Code'
].
nunique
())
+
r
'''
* Unique FoodEx2 codes '''
+
str
(
mismatch_table
[
'Matrix FoodEx2 Code'
].
nunique
())
+
r
'''
Below a list with the most (more than '''
+
str
(
min_nr_of_mismatches
)
+
r
''') mismatches.
'''
+
mismatch_table_string
+
r
'''
Substance conversion duplicates
===============================
'''
+
double_types
.
to_markdown
(
index
=
False
)
+
r
'''
'''
dataset
.
close
()
Convert-EUProcessingFactorsDB/Convert-EUProcessingFactorsDB.py
deleted
100644 → 0
View file @
8d082b20
#!/usr/bin/python
__version_info__
=
(
'1'
,
'0'
,
'0'
)
__version__
=
'.'
.
join
(
__version_info__
)
#############################################################################
# Phase 0. Initialization
# Doing stuff like parsing arguments, and reading the files.
#
from
dataconversion
import
DataSet
,
PY_INDENT
,
thisyear
import
pandas
as
pd
from
datetime
import
datetime
import
textwrap
import
os
# Small utility to render a text (e.g. a URL) as a Markdown link pointing to itself
def print_as_link(text):
    """Return ``text`` as a Markdown link whose label and target are both ``text``."""
    link = f'[{text}]({text})'
    return link
# These are the files we work with
# Create list
dataset
=
DataSet
(
opening
=
'(c) '
+
thisyear
+
' Biometris, Wageningen University and Research.'
,
description
=
'Converts the EFSA Zendono Excel sheet into an MCRA '
+
'conforming format, using some external translation files.'
,
epilog
=
'For example: use %(prog)s -v -x for a verbose example.'
,
version
=
__version__
)
#
#
efsa_url
=
'https://zenodo.org/record/1488653/files/'
\
+
'EU_Processing_Factors_db_P.xlsx.xlsx?download=1'
#
# The input files
dataset
.
add
(
name
=
'efsa'
,
short_argument
=
'-e'
,
help
=
'The EFSA Zendono Excel sheet (.xlsx); either file or URL. '
,
checksum
=
'f816bf3928431d54f9d15fb134cc9106'
,
default_name
=
efsa_url
,
default_dir
=
'Input'
,
direction
=
'Input'
,
autoload
=
False
)
#
dataset
.
add
(
name
=
'processing_type'
,
short_argument
=
'-t'
,
help
=
'The (input) processing type file - '
+
'format: csv (Comma Seperated).'
,
default_name
=
'ProcessingTypes.csv'
,
default_dir
=
'Input'
,
direction
=
'Input'
)
#
dataset
.
add
(
name
=
'processing_translation'
,
short_argument
=
'-p'
,
help
=
'The (input) processing translation file - '
+
'format: csv (Comma Seperated).'
,
default_name
=
'ProcTypeTranslations.csv'
,
default_dir
=
'Input'
,
direction
=
'Input'
)
#
dataset
.
add
(
name
=
'food_translation'
,
short_argument
=
'-f'
,
help
=
'The (input) food translation file - '
+
'format: csv (Comma Seperated).'
,
default_name
=
'FoodTranslations.csv'
,
default_dir
=
'Input'
,
direction
=
'Input'
)
#
dataset
.
add
(
name
=
'substance_translation'
,
short_argument
=
'-s'
,
help
=
'The (input) substance translation file - '
+
'format: tsv (Tab Seperated), file not required.'
,
default_name
=
'SubstanceTranslations.csv'
,
necessary
=
False
,
default_dir
=
'Input'
,
direction
=
'Input'
)
#
dataset
.
add
(
name
=
'food_composition'
,
short_argument
=
'-g'
,
help
=
'The (input) food composition file - '
+
'format: xlsx (Excel), file not required.'
,
default_name
=
'FoodCompositions.xlsx'
,
necessary
=
False
,
default_dir
=
'Input'
,
direction
=
'Input'
)
#
# The output files
dataset
.
add
(
name
=
'processing_factor'
,
short_argument
=
'-o'
,
help
=
'The (output) processing factor file - '
+
'format: csv (Comma Seperated).'
,
default_name
=
'ProcessingFactors.csv'
,
default_dir
=
'Output'
)
#
dataset
.
add
(
name
=
'mismatches'
,
default_name
=
'Mismatches.xlsx'
,
default_dir
=
'Output'
)
#
dataset
.
add
(
name
=
'references'
,
default_name
=
'References.csv'
,
default_dir
=
'Output'
)
#
#############################################################################
dataset
.
init
()
# Manually load the EFSA sheet, because the data is in a non-trivial place
efsa_sheet
=
2
efsa_version
=
pd
.
read_excel
(
dataset
.
efsa
.
file
.
path
,
sheet_name
=
efsa_sheet
,
nrows
=
1
,
header
=
None
).
iloc
[
0
,
0
]
dataset
.
efsa
.
load
(
sheet_name
=
efsa_sheet
,
header
=
4
)
dataset
.
verbose
(
1
,
'Input file : {file}; {version}; {props}'
.
format
(
file
=
dataset
.
efsa
.
file
.
path
,
props
=
dataset
.
efsa
.
properties
,
version
=
efsa_version
))
#
# Also reading the ProcStudies Evaluation
efsa_procstudies
=
pd
.
read_excel
(
dataset
.
efsa
.
file
.
path
,
sheet_name
=
1
)
# ... and the References
dataset
.
references
.
sheet
=
pd
.
read_excel
(
dataset
.
efsa
.
file
.
path
,
sheet_name
=
3
)
#############################################################################
# Phase 2. Processing the data.
# Try to think SQL-wise or vector-wise about the data,
# but object-wise about the code
# We use the input Excel sheet as an SQL table, and supplement it
# until we have additional columns with all necessary data.
#
#
# Here we'll left join with both tables to supplement the original sheet.
# Then we have all data in one single dataframe (table).
efsa_combined
=
dataset
.
efsa
.
sheet
.
merge
(
# Left join on all the rows from the EFSA sheet
# that have a Keyfacets Code in dataset.processing_translation.sheet
dataset
.
processing_translation
.
sheet
,
left_on
=
'KeyFacets Code'
,
right_on
=
'FromFC'
,
how
=
'left'
).
merge
(
# Left join with both FoodEx2 and Matrix code
# on the food_translation file
dataset
.
food_translation
.
sheet
,
left_on
=
[
'Matrix FoodEx2 Code'
,
'Matrix Code'
],
right_on
=
[
'FromFX'
,
'FXToRpc'
],
how
=
'left'
).
assign
(
)
# First let's copy the columns which we want in the output unaltered so far
efsa_combined
.
mcra
.
copycolumn
({
'ParamName Active Substance'
:
'SubstanceName'
,
'Matrix Code'
:
'idFoodUnProcessed'
,
'Raw Primary Commodity'
:
'FoodUnprocessedName'
,
'Median PF'
:
'Nominal'
})
#
# Then let's add columns which will be empty
# so to be able to create a proper output file
efsa_combined
.
mcra
.
addcolumn
({
'Upper'
,
'NominalUncertaintyUpper'
,
'UpperUncertaintyUpper'
})
#
# Now let's work on creating the rest of the fields (the hard labour)
#
# idProcessingType
#
# If 'FCToProcType' contains a value, then make a new field
# 'idProcessingType', with the value from 'FCToProcType'
efsa_combined
.
loc
[
(
efsa_combined
[
'FCToProcType'
].
notna
()),
'idProcessingType'
]
=
efsa_combined
[
'FCToProcType'
]
# If 'FCToProcType' does not contain a value and 'FXToProcType' does
# then make a new field, 'idProcessingType'
# with the value from 'FXToProcType'
efsa_combined
.
loc
[
(
efsa_combined
[
'FCToProcType'
].
isna
()
&
efsa_combined
[
'FXToProcType'
].
notna
()),
'idProcessingType'
]
=
efsa_combined
[
'FXToProcType'
]
#
# See whether we can do something with substance translation
if
dataset
.
substance_translation
.
sheet
is
None
:
# No substance translation? Just copy column
efsa_combined
.
mcra
.
copycolumn
(
{
'ParamCode Active Substance'
:
'idSubstance'
})
else
:
# Strip dash (-) from the CASNumber column
dataset
.
substance_translation
.
sheet
[
'CASNumber'
].
replace
(
'-'
,
''
,
regex
=
True
,
inplace
=
True
)
# Do a left join
efsa_combined
=
efsa_combined
.
merge
(
# Left join with the substance translation sheet,
dataset
.
substance_translation
.
sheet
,