Commit 85b930a8 authored by Hans van den Heuvel's avatar Hans van den Heuvel
Browse files

Convertscript for DTUCAG created.

parent e0e11c39
## Ignore default output files generated by the script.
# User-specific files
# Dirs
Input/
Output/
__pycache__/
\ No newline at end of file
#!/usr/bin/python
#############################################################################
# Phase 0. Initialization
# Doing stuff like parsing arguments, and reading the files.
#
import mcra
import pandas as pd
from datetime import datetime
import textwrap
import os
# Small utility to create hyperlink to hyperlink :-)
def print_as_link(text):
    """Return *text* rendered as a Markdown link that points at itself.

    E.g. ``print_as_link('Report.md')`` gives ``[Report.md](Report.md)``.
    """
    return '[{0}]({0})'.format(text)
# These are the files we work with.
# Create the dataset container; each .add() below registers one input or
# output file together with its command-line switch and default location.
# (mcra.DataSet is a project-local helper; exact semantics live in mcra.py.)
dataset = mcra.DataSet(
    opening='(c) ' + datetime.now().strftime('%Y')
    + ' Biometris, Wageningen University and Research.',
    description='Converts the EFSA CAPEG database Excel sheet into MCRA '
    + 'effects and assessment groups.',
    epilog='For example: use %(prog)s -v -x for a verbose example.')
#
# The single input file: the EFSA CAPEG Excel workbook.
dataset.add(
    name='capeg',
    short_argument='-c',
    help='The (input) EFSA CAPEG file - '
    + 'format: xls (Excel).',
    default_name='capeg_20210706_13492613.xls',
    default_dir='Input',
    direction='Input')
#
# The output files (direction defaults to output).
# MCRA Effects table.
dataset.add(
    name='effects',
    short_argument='-e',
    help='The (output) effects file - '
    + 'format: csv (Comma Seperated).',
    default_name='Effects.csv',
    default_dir='Output')
#
# MCRA AssessmentGroupMembershipModels table.
dataset.add(
    name='agmm',
    short_argument='-m',
    help='The (output) assessment group membership models file - '
    + 'format: csv (Comma Seperated).',
    default_name='AssessmentGroupMembershipModels.csv',
    default_dir='Output')
#
# MCRA AssessmentGroupMemberships table.
dataset.add(
    name='agm',
    short_argument='-a',
    help='The (output) assessment group membership file - '
    + 'format: csv (Comma Seperated).',
    default_name='AssessmentGroupMemberships.csv',
    default_dir='Output')
#
# MCRA Compounds (substances) table; written zipped by default.
dataset.add(
    name='compounds',
    short_argument='-s',
    help='The (output) substances file - '
    + 'format: csv (Comma Seperated).',
    default_name='Compounds.zip',
    # default_name='Compounds.csv',
    default_dir='Output')
#
#############################################################################
# Parse command-line arguments and load the registered input files.
dataset.init()
# To abbreviate: the CAPEG Excel sheet as a DataFrame.
capeg = dataset.capeg.sheet
# FIRST The effects table.
# Add the (empty) fields for the effects table.
capeg.mcra.addcolumn(
    {'idEffect', 'Name', 'Description', 'Reference'})
# Temporary deep copy: one frame for acute, one for chronic effects.
capeg2 = capeg.copy(deep=True)
# Fill idEffect from the first word of the target organ,
# e.g. 'L1-Liver-Acute' / 'L1-Liver-Chronic'.
capeg['idEffect'] = 'L1-' + \
    capeg['target_CAG1'].str.split().str[0].str.strip() + '-Acute'
capeg2['idEffect'] = 'L1-' + \
    capeg2['target_CAG1'].str.split().str[0].str.strip() + '-Chronic'
# Description
capeg['Description'] = 'Acute adverse effects on ' + \
    capeg['target_CAG1'].str.lower() + '.'
capeg2['Description'] = 'Chronic adverse effects on ' + \
    capeg2['target_CAG1'].str.lower() + '.'
# Combine the sheets, append the second after the first.
# NOTE: DataFrame.append() was deprecated in pandas 1.4 and removed in
# pandas 2.0; pd.concat is the supported equivalent.
capeg = pd.concat([capeg, capeg2], ignore_index=True)
# Set the name
capeg['Name'] = capeg['target_CAG1']
# Set the reference (no reference available for the effects themselves).
capeg['Reference'] = ''
# Done, now wrap this table up; the header lists extra (empty) columns
# that the MCRA Effects format expects.
effects_header = ['idEffect', 'CodeSystem', 'Name', 'Description',
                  'BiologicalOrganisation', 'KeyEventProcess',
                  'KeyEventObject', 'KeyEventAction', 'KeyEventOrgan',
                  'KeyEventCell', 'AOPwikiKE', 'Reference']
# One row per unique idEffect.
dataset.effects.sheet = capeg.drop_duplicates(
    subset=['idEffect'], ignore_index=True)[
    ['idEffect', 'Name', 'Description', 'Reference']]
dataset.effects.close(header=effects_header)
# SECOND The Assessment group membership models table.
# Remove and re-add the reused columns to clear them.
capeg.drop(['Name', 'Description', 'Reference'], axis=1, inplace=True)
capeg.mcra.addcolumn(['id', 'Name', 'Description', 'Reference'])
# ID: derive the group id from the effect id,
# e.g. 'L1-Liver-Acute' -> 'AG1-Liver-Acute'.
capeg['id'] = 'AG1-' + \
    capeg['idEffect'].str.split('-').str[1:].str.join('-')
# Name
capeg['Name'] = 'CAG ' + capeg['target_CAG1'].str.lower()
# Description (spelling of 'Cumulative assessment' fixed).
capeg['Description'] = \
    'Cumulative assessment group for adverse effects on ' + \
    capeg['target_CAG1'].str.lower() + '.'
# Reference: the EFSA CAPEG report DOI.
capeg['Reference'] = 'https://doi.org/10.2903/sp.efsa.2012.EN-269'
# Done, now wrap this table up; extra header columns stay empty.
agmm_header = ['id', 'Name', 'Description', 'idEffect', 'Accuracy',
               'Sensitivity', 'Specificity', 'Reference']
# One row per unique group id (PEP 8: no spaces around keyword '=').
dataset.agmm.sheet = capeg.drop_duplicates(
    subset=['id'], ignore_index=True)[
    ['id', 'idEffect', 'Name', 'Description', 'Reference']]
dataset.agmm.close(header=agmm_header)
# THIRD The Substances table.
# Remove and re-add the reused columns to clear them.
capeg.drop(['Name', 'Description', 'Reference'], axis=1, inplace=True)
capeg.mcra.addcolumn(['idSubstance', 'Name', 'Description'])
# ID: the CAS registry number identifies the substance.
capeg['idSubstance'] = capeg['casNumber']
# Name
capeg['Name'] = capeg['chemicalName']
# Done, now wrap this table up; extra header columns stay empty.
compounds_header = ['idSubstance', 'Name', 'Description',
                    'ConcentrationUnit', 'CramerClass', 'MolecularMass']
# One row per unique substance (PEP 8: no spaces around keyword '=').
dataset.compounds.sheet = capeg.drop_duplicates(
    subset=['idSubstance'], ignore_index=True)[
    ['idSubstance', 'Name']]
dataset.compounds.close(header=compounds_header)
# FOURTH The Assessment group memberships table: links every substance
# (CAS number) to the assessment group of its target organ.
# Remove and re-add the reused columns to clear them.
capeg.drop(['Name', 'Description'], axis=1, inplace=True)
capeg.mcra.addcolumn(['idGroupMembershipModel', 'GroupMembership'])
# ID: reuse the group id ('AG1-...') computed for the membership models.
capeg['idGroupMembershipModel'] = capeg['id']
# Every substance is a full member (1) of its group.
capeg['GroupMembership'] = '1'
# Done, now wrap this table up.
agm_header = ['idGroupMembershipModel', 'idSubstance', 'GroupMembership']
dataset.agm.sheet = capeg[agm_header].drop_duplicates()
dataset.agm.close(header=agm_header)
# DONE
# Flush all output files; presumably also writes the report -
# see mcra.DataSet.close for details.
dataset.close()
# How to use the convert script
## Quick start
### Solve all dependencies
```
pip install pandas xlrd tabulate openpyxl requests
```
### Run the script in trial (-x) and verbose (-v) mode
```
python.exe Convert-EUProcessingFactorsDB.py -x -v
```
Again in trial mode, now using all input files from the Example directory
```
python.exe Convert-EUProcessingFactorsDB.py -x -v -s -g
```
### Run the script with the default names
In the example here also a substance conversion (-s) and FoodTranslation (-g) is done.
```
python.exe Convert-EUProcessingFactorsDB.py -s -g -v
```
### Run the script with specific input files
```
python.exe Convert-EUProcessingFactorsDB.py -v -t ProcessingTypes.csv -p ProcTypeTranslations.csv -f FoodTranslations.csv
```
### Questions?
```
python.exe Convert-EUProcessingFactorsDB.py -h
```
## Introduction
This script takes data from the [EU Processing Factors file](https://zenodo.org/record/1488653/files/EU_Processing_Factors_db_P.xlsx.xlsx?download=1) and combines this with two (user supplied) files (a food translations file and a processing translations file) to get to an MCRA processing factors file with food codes and processing type codes in the desired coding system. In this way, data from the EU processing factors file can be used in MCRA analyses.
These are the input and output files of the script. All names are defaults, and can be changed by the user on the command line.
* Input files:
* The [EU Processing Factors file](https://zenodo.org/record/1488653/files/EU_Processing_Factors_db_P.xlsx.xlsx?download=1)
* The processing translation input file, [ProcTypeTranslations.csv](ProcTypeTranslations.csv)
* The food translation input file, [FoodTranslations.csv](FoodTranslations.csv)
* The processing types input file to augment info in the report, [ProcessingTypes.csv](ProcessingTypes.csv) (not used in any data processing)
* An optional substances sheet (-s), to augment the output with the ``CASNumber``.
* An optional FoodComposition file (-g), to augment the output with A-codes.
* Output files:
* The goal of this script, the file [ProcessingTypes.csv](ProcessingTypes.csv) with the new MCRA ProcessingTypes. By default this file will be contained in a zip file [ProcessingFactors.zip](ProcessingFactors.zip)
* A small markdown report is also created, usually called [Report.md](Report.md); within the zip file it is called Readme.md.
* A csv file with a summary (and counts) of *the remaining data* of the EU sheet, called [Mismatches.csv](Mismatches.csv).
The following is happening in the script, essentially
* The script will try to match the first column (``FromFC``) of [ProcTypeTranslations.csv](ProcTypeTranslations.csv) to the column ``KeyFacets Code`` of the EU sheet. If a match is found, then the second column (``FCToProcType``) of [ProcTypeTranslations.csv](ProcTypeTranslations.csv) will become the field ``idProcessingType``.
* Then the script will try to match both the ``FromFX`` and ``FXToRpc`` column of [FoodTranslations.csv](FoodTranslations.csv) with the columns ``Matrix FoodEx2 Code`` and ``Matrix Code`` from the EU sheet, *for all rows that didn't already match in the previous step*. If a match was found, then the value of ``FXToProcType`` will be copied to ``idProcessingType``.
* If no substance file was given, then just copy the field ``ParamCode Active Substance`` to ``idSubstance``. But if a substance was given, then strip the dash from the ``CASNumber`` column in the substance file, and match the column ``ParamCode Active Substance`` in the EFSA sheet to ``code`` in the substances sheet. If a match was found then copy the modified (without dash) ``CASNumber`` to ``idSubstance``.
* If a foodcompositions file was given, then an additional translation is done. This table needs to have the layout of the MCRA FoodComposition.
* Only records of ``idToFood`` starting with ``P`` and ``idFromFood`` which contain a dash (-) will be used
* The ``idFromFood`` column is split on the dash (-)
* A new column is temporarily added combining ``idToFood`` and the right part of the split on ``idFromFood``
* For all matches of the new column with the field ``idFoodProcessed`` in ``ProcessingFactors``, the field ``idFoodProcessed`` will be replaced by the field ``idFromFood`` from the FoodComposition table, and duplicates will also be added
* Finally the output file [ProcessingFactors.csv](ProcessingFactors.csv) (contained within [ProcessingFactors.zip](ProcessingFactors.zip)) will be written, together with some reports.
## Prerequisites
In order to use the python script, the following libraries are necessary
* [pandas](https://pandas.pydata.org/)
* [xlrd](https://pypi.org/project/xlrd/)
* [tabulate](https://pypi.org/project/tabulate/)
* [openpyxl](https://pypi.org/project/openpyxl/)
* [requests](https://pypi.org/project/requests/)
Install all the libraries at once with
```
pip install pandas xlrd tabulate openpyxl requests
```
## Usage
The script will assume defaults for all filenames. The ``-h`` option (help) will display info about which defaults. So the following would produce help information:
```
python.exe convert-script -h
```
These are the command line options that are supported.
```
usage: Convert-EUProcessingFactorsDB.py [-h] [-v] [-x] [-e [EFSA_FILE]]
[-t [PROCESSING_TYPE_FILE]]
[-p [PROCESSING_TRANSLATION_FILE]]
[-f [FOOD_TRANSLATION_FILE]]
[-s [SUBSTANCE_TRANSLATION_FILE]]
[-g [FOOD_COMPOSITION_FILE]]
[-o [PROCESSING_FACTOR_FILE]]
Converts the EFSA Zendono Excel sheet into an MCRA conforming format, using
some external translation files.
optional arguments:
-h, --help show this help message and exit
-v, --verbosity Show verbose output
-x, --example Uses input files from the Example subdir.
-e [EFSA_FILE], --efsa_file [EFSA_FILE]
The EFSA Zendono Excel sheet (.xlsx); either file or
URL. (default: https://zenodo.org/record/1488653/files
/EU_Processing_Factors_db_P.xlsx.xlsx?download=1)
-t [PROCESSING_TYPE_FILE], --processing_type_file [PROCESSING_TYPE_FILE]
The (input) processing type file - format: csv (Comma
Seperated). (default: ProcessingTypes.csv)
-p [PROCESSING_TRANSLATION_FILE], --processing_translation_file [PROCESSING_TRANSLATION_FILE]
The (input) processing translation file - format: csv
(Comma Seperated). (default: ProcTypeTranslations.csv)
-f [FOOD_TRANSLATION_FILE], --food_translation_file [FOOD_TRANSLATION_FILE]
The (input) food translation file - format: csv (Comma
Seperated). (default: FoodTranslations.csv)
-s [SUBSTANCE_TRANSLATION_FILE], --substance_translation_file [SUBSTANCE_TRANSLATION_FILE]
The (input) substance translation file - format: tsv
(Tab Seperated), file not required. (default:
SubstanceTranslations.tsv)
-g [FOOD_COMPOSITION_FILE], --food_composition_file [FOOD_COMPOSITION_FILE]
The (input) food composition file - format: xlsx
(Excel), file not required. (default:
FoodCompositions.xlsx)
-o [PROCESSING_FACTOR_FILE], --processing_factor_file [PROCESSING_FACTOR_FILE]
The (output) processing factor file - format: csv
(Comma Seperated). (default: ProcessingFactors.zip)
For example: use Convert-EUProcessingFactorsDB.py -v -x for a verbose example.
```
## Coding
If you would like to add code, please try and stick to the [Python Code Guidelines](https://www.python.org/dev/peps/pep-0008/).
Check your changes using ``pycodestyle`` for example.
```
pip install pycodestyle # To install the program
pycodestyle .\Convert-EUProcessingFactorsDB.py # To check whether the code complies.
```
This diff is collapsed.
pandas
xlrd
tabulate
openpyxl
requests
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment