Commit 74a7ea2f authored by Hans van den Heuvel
Browse files

Replace rows holding multiple casNumbers with multiple rows holding a single casNumber each.

parent 10018fc0
#!/usr/bin/python
# Version components, kept as strings so they can be joined directly below.
# NOTE(review): __version_info__ is assigned twice in a row; only the second
# assignment ('1', '1', '0') is in effect when __version__ is built.
__version_info__ = ('1', '0', '0')
__version_info__ = ('1', '1', '0')
# Dotted version string built from the components above.
__version__ = '.'.join(__version_info__)
#############################################################################
......@@ -88,6 +88,19 @@ capeg['casNumber'].replace('na', '7440-50-8', inplace=True)
# capeg.drop(capeg.loc[capeg['casNumber']
# .str.contains('[\(\)/]', regex=True)].index, inplace=True)
# Handle every obscure case separately, just to be explicit.
# The patterns are raw strings so that regex escapes such as \s, \- and \+
# reach the regex engine untouched instead of being parsed as (invalid)
# Python string escapes.
# First case: 824-39-5/26498-36-2
# becomes 824-39-5 and 26498-36-2
capeg = capeg.mcra.dup_reggroups('casNumber', r'([0-9\-]*)\s?/\s?([0-9\-]*)')
# Second case: 468-44-0 + 510-75-8 (mixture 8030-53-3)
# becomes 468-44-0 and 510-75-8
capeg = capeg.mcra.dup_reggroups('casNumber', r'([0-9\-]*)\s?\+\s?([0-9\-]*)')
# Third case: 71751-41-2 (65195-55-3 B1a, 65195-56-4 B1b)
# becomes 65195-55-3, 65195-56-4
capeg = capeg.mcra.dup_reggroups('casNumber', r'\(([0-9\-]*).*\,\s+([0-9\-]*)')
# Fourth case: 8018-01-7 (formerly 8065-67-6)
# becomes 8018-01-7
capeg = capeg.mcra.dup_reggroups('casNumber', r'^([0-9\-]*)')
# Max length of strings (second argument)
max_len = slice(0, 99)
......
......@@ -17,7 +17,7 @@ import textwrap
import getpass
import re
# Version components, kept as strings so they can be joined directly below.
# NOTE(review): __version_info__ is assigned twice in a row; only the second
# assignment ('0', '9', '1') is in effect when __version__ is built.
__version_info__ = ('0', '9', '0')
__version_info__ = ('0', '9', '1')
# Dotted version string built from the components above.
__version__ = '.'.join(__version_info__)
# For debugging purposes
......@@ -118,6 +118,16 @@ class McraAccessor:
elif ext == '.xlsx':
self._obj.to_excel(filename, sheet_name='Dump', index=False)
def dup_reggroups(self, column, regex):
    '''
    Split rows whose ``column`` matches ``regex`` into one row per
    captured group.

    For every match of ``regex`` in ``column`` the row is repeated once
    per capture group, and ``column`` of each repeated row is set to the
    text captured by that group. Rows without a match are kept as they
    are. All other columns are copied to the repeated rows.

    :param column: name of the string column to search and rewrite.
    :param regex: pattern with at least one capture group, as accepted
        by ``Series.str.extractall``.
    :return: the resulting frame, with a fresh 0..n-1 index. The frame
        held by this accessor is rebound to the same result.
    '''
    temp_col = column + '__temp__'
    # The join below aligns on index labels, so start from a unique
    # 0..n-1 index; with duplicated labels one row's matches would be
    # handed to every row sharing its label. The result is renumbered
    # at the end anyway, so uniquely indexed input is unaffected.
    frame = self._obj.reset_index(drop=True)
    dups = frame[column].str.extractall(regex)
    # One list per match holding the text of all capture groups.
    dups[temp_col] = dups.values.tolist()
    # Drop the 'match' level so the index lines up with the row labels.
    dups = dups.reset_index(level=[1])
    frame = (
        frame.join(dups[temp_col])
        .explode(temp_col)
        .reset_index(drop=True)
    )
    # Only rows that received a captured value get their column replaced.
    has_value = frame[temp_col].notna()
    frame.loc[has_value, column] = frame[temp_col]
    frame.drop(columns=temp_col, inplace=True)
    # Keep rebinding the accessor's frame, as before, for callers that
    # rely on it.
    self._obj = frame
    return frame
class DataFile:
'''
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment