Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Biometris
MCRA.DataConversionTools
Commits
74a7ea2f
Commit
74a7ea2f
authored
Jul 12, 2021
by
Hans van den Heuvel
Browse files
Replace multiple casNumbers in one row into single casNumber in multiple rows.
parent
10018fc0
Changes
2
Hide whitespace changes
Inline
Side-by-side
Convert-DTUCAG/Convert-DTUCAG.py
View file @
74a7ea2f
#!/usr/bin/python
__version_info__
=
(
'1'
,
'
0
'
,
'0'
)
__version_info__
=
(
'1'
,
'
1
'
,
'0'
)
__version__
=
'.'
.
join
(
__version_info__
)
#############################################################################
...
...
@@ -88,6 +88,19 @@ capeg['casNumber'].replace('na', '7440-50-8', inplace=True)
# capeg.drop(capeg.loc[capeg['casNumber']
# .str.contains('[\(\)/]', regex=True)].index, inplace=True)
# Handle every obscure case separate, just to be explicit.
# First case: 824-39-5/26498-36-2
# becomes 824-39-5 and 26498-36-2
# The patterns below are raw strings: sequences such as \s, \- or \+ are
# not valid string escapes, so a plain literal triggers an "invalid escape
# sequence" warning. The pattern text handed to the regex engine is the same.
capeg = capeg.mcra.dup_reggroups(
    'casNumber', r'([0-9\-]*)\s?/\s?([0-9\-]*)')
# Second case: 468-44-0 + 510-75-8 (mixture 8030-53-3)
# becomes 468-44-0 and 510-75-8
capeg = capeg.mcra.dup_reggroups(
    'casNumber', r'([0-9\-]*)\s?\+\s?([0-9\-]*)')
# Third case: 71751-41-2 (65195-55-3 B1a, 65195-56-4 B1b)
# becomes 65195-55-3, 65195-56-4
capeg = capeg.mcra.dup_reggroups(
    'casNumber', r'\(([0-9\-]*).*\,\s+([0-9\-]*)')
# Fourth case: 8018-01-7 (formerly 8065-67-6)
# becomes 8018-01-7
capeg = capeg.mcra.dup_reggroups(
    'casNumber', r'^([0-9\-]*)')
# Max length of strings (second argument)
max_len = slice(0, 99)
...
...
Convert-DTUCAG/dataconversion.py
View file @
74a7ea2f
...
...
@@ -17,7 +17,7 @@ import textwrap
import
getpass
import
re
__version_info__
=
(
'0'
,
'9'
,
'
0
'
)
__version_info__
=
(
'0'
,
'9'
,
'
1
'
)
__version__
=
'.'
.
join
(
__version_info__
)
# For debugging purposes
...
...
@@ -118,6 +118,16 @@ class McraAccessor:
elif
ext
==
'.xlsx'
:
self
.
_obj
.
to_excel
(
filename
,
sheet_name
=
'Dump'
,
index
=
False
)
def dup_reggroups(self, column, regex):
    '''
    Split rows holding several values in one cell into one row per value.

    Every capture group of every match of ``regex`` in ``column`` yields
    its own row; the other columns of that row are repeated. Rows without
    a match, and groups that captured nothing, keep the original value
    of ``column``.

    column -- name of the string column to split.
    regex  -- pattern with at least one capture group, passed to
              ``Series.str.extractall``.

    Returns the resulting DataFrame, renumbered with a fresh 0..n-1 index.
    The accessor's ``_obj`` is rebound to that same DataFrame.
    '''
    temp_col = column + '__temp__'
    # Renumber first: the join below is keyed on the index, so duplicate
    # index labels would pair every match with every row sharing the label.
    # The result is renumbered anyway, so nothing observable is lost.
    frame = self._obj.reset_index(drop=True)
    dups = frame[column].str.extractall(regex)
    # Collect the captured groups of each match into a single list cell.
    dups[temp_col] = dups.values.tolist()
    # Drop the 'match' level so the index lines up with the frame again.
    dups = dups.reset_index(level=[1])
    frame = (
        frame.join(dups[temp_col])
        .explode(temp_col)
        .reset_index(drop=True)
    )
    # Only overwrite where a group really captured something.
    matched = frame[temp_col].notna()
    frame.loc[matched, column] = frame[temp_col]
    self._obj = frame.drop(columns=temp_col)
    return self._obj
class
DataFile
:
'''
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment