Biometris / MCRA.DataConversionTools / Commits

Commit f0f08a46, authored Jul 12, 2021 by Hans van den Heuvel

Fixed ProcessingFactors dir

parent 74a7ea2f
13 changed files
Convert-EUProcessingFactorsDB/.gitignore (view file @ f0f08a46)
## Ignore default output files generated by the script.
# User-specific files
ProcessingFactors.zip
ProcessingFactors.csv
ProcessingFactors.xlsx
Mismatches.csv
Report.md
debug_dump_file.xlsx
EU_Processing_Factors_db_P.xlsx.xlsx
# Dirs
Input/
Output/
Build/
__pycache__/
\ No newline at end of file
Convert-EUProcessingFactorsDB/Convert-EUProcessingFactorsDB.py (view file @ f0f08a46)
#!/usr/bin/python
__version_info__ = ('1', '0', '0')
__version__ = '.'.join(__version_info__)
#############################################################################
# Phase 0. Initialization
# Doing stuff like parsing arguments, and reading the files.
#
-import mcra
+from dataconversion import DataSet, PY_INDENT, thisyear
import pandas as pd
from datetime import datetime
import textwrap
import os
# Small utility to create hyperlink to hyperlink :-)
def print_as_link(text):
    return '[{text}]({text})'.format(text=text)
# These are the files we work with
# Create list
-dataset = mcra.DataSet(
-    opening='(c) ' + datetime.now().strftime('%Y')
+dataset = DataSet(
+    opening='(c) ' + thisyear
    + ' Biometris, Wageningen University and Research.',
    description='Converts the EFSA Zendono Excel sheet into an MCRA '
    + 'conforming format, using some external translation files.',
-    epilog='For example: use %(prog)s -v -x for a verbose example.')
+    epilog='For example: use %(prog)s -v -x for a verbose example.',
+    version=__version__)
#
#
efsa_url = 'https://zenodo.org/record/1488653/files/' \
...
...
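The print_as_link helper above simply wraps its argument in Markdown link syntax; a standalone check (illustrative only, not part of the commit):

def print_as_link(text):
    return '[{text}]({text})'.format(text=text)

print(print_as_link('Output/Report.md'))
# prints: [Output/Report.md](Output/Report.md)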
@@ -92,13 +95,12 @@ dataset.add(
    short_argument='-o',
    help='The (output) processing factor file - '
    + 'format: csv (Comma Seperated).',
-    default_name='ProcessingFactors.zip',
-    # default_name='ProcessingFactors.csv',
+    default_name='ProcessingFactors.csv',
    default_dir='Output')
#
dataset.add(
-    name='report',
-    default_name='Report.xlsx',
+    name='mismatches',
+    default_name='Mismatches.xlsx',
    default_dir='Output')
#
dataset.add(
...
...
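The rename from report to mismatches above matters because each sheet added through dataset.add() is exposed as an attribute of the DataSet (dataset.mismatches, dataset.processing_factor, and so on, as the later hunks show). A minimal sketch of that registration pattern, using hypothetical SimpleDataSet/SimpleSheet stand-ins rather than the project's real classes:

class SimpleSheet:
    def __init__(self, default_name, default_dir):
        self.default_name = default_name
        self.default_dir = default_dir

class SimpleDataSet:
    def __init__(self):
        self.list = []

    def add(self, name, default_name, default_dir):
        # Register the sheet both as an attribute and in the iteration list.
        setattr(self, name, SimpleSheet(default_name, default_dir))
        self.list.append(name)

ds = SimpleDataSet()
ds.add(name='mismatches', default_name='Mismatches.xlsx', default_dir='Output')
print(ds.mismatches.default_name)  # Mismatches.xlsx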
@@ -273,13 +275,11 @@ dataset.processing_factor.sheet = efsa_combined[
        efsa_combined['FXToProcType'].notna())
        & efsa_combined['idSubstance'].notna()][header]
#
# Writing output file
dataset.processing_factor.close()
# In case of debugging, just dump the sheet we've been working on.
if dataset.args.verbosity > 3:
    efsa_combined.mcra.dump(os.path.join(
-        dataset.report.file.directory, 'dump.xlsx'))
+        dataset.mismatches.file.directory, 'dump.xlsx'))
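The selection above is the standard pandas pattern of combining boolean notna() masks with & before projecting a column list; self-contained with toy data:

import pandas as pd

df = pd.DataFrame({'FXToProcType': ['PT01', None, 'PT02'],
                   'idSubstance': ['S1', 'S2', None]})
header = ['FXToProcType', 'idSubstance']
# Keep only rows where both columns are filled, then project the columns.
subset = df[(df['FXToProcType'].notna()) & (df['idSubstance'].notna())][header]
print(subset)  # only the first row survives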
#############################################################################
# Phase 4. Report about the data.
...
...
@@ -328,7 +328,7 @@ mismatch_table_string = report_sheet[
header = [
    'Matrix FoodEx2 Code', 'Matrix Code Interpreted', 'Matrix Code',
    'KeyFacets Code', 'KeyFacets Interpreted',
    'Number of Matrix FoodEx2 Codes', 'Number of KeyFacets Codes']
-dataset.report.sheet = report_sheet[header]
+dataset.mismatches.sheet = report_sheet[header]
#
# We also need some further text reporting:
# Let's make a new column of the combination
...
...
@@ -356,7 +356,7 @@ double_types = mismatch_table.groupby(
# report_sheet = mismatch_table.groupby(
# ['KeyFacets Code', 'Matrix FoodEx2 Code']).size()
# #
-dataset.report.report = r'''
+dataset.mismatches.report = r'''
CONVERSION REPORT FOR EFSA system FoodEx 2 EXCEL SHEET
------------------------------------------------------
...
...
@@ -369,16 +369,16 @@ Conversion run details
for data in dataset:
    if data.direction == 'Input':
-        dataset.report.report = textwrap.indent(data.report, mcra.PY_INDENT)
+        dataset.mismatches.report = textwrap.indent(data.report, PY_INDENT)
for datasetname in dataset.list:
    # Bit of a hack, figure out later how this can be properly done.
    if getattr(dataset, datasetname).direction == 'Output' \
            and datasetname != 'report':
-        dataset.report.report = textwrap.indent(
-            getattr(dataset, datasetname).report, mcra.PY_INDENT)
+        dataset.mismatches.report = textwrap.indent(
+            getattr(dataset, datasetname).report, PY_INDENT)
-dataset.report.report = r'''
+dataset.mismatches.report = r'''
EFSA Excel input details
========================
...
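Both branches rely on textwrap.indent to nest a sub-report one level deeper; the stdlib call in isolation (PY_INDENT's value is assumed here, the real constant lives in dataconversion.py):

import textwrap

PY_INDENT = '  '  # assumed two-space indent
sub_report = '* Input file: ProcessingTypes.csv\n* 42 rows\n'
print(textwrap.indent(sub_report, PY_INDENT))
# every line of sub_report now starts with the indent string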
@@ -425,15 +425,5 @@ Substance conversion duplicates
''' + double_types.to_markdown(index=False) + r'''
'''
-dataset.references.close()
-dataset.report.save()
-# Save this also to the dataset sheet.
-with pd.ExcelWriter(dataset.report.file.path, mode='a') as writer:
-    double_types.to_excel(writer, index=False, sheet_name='Substances')
-dataset.report.close(auto_report=False, also_save=False)
-dataset.close()
+dataset.close(file_report=True)
Convert-EUProcessingFactorsDB/Convert-Simpler.py (view file @ f0f08a46)
#!/usr/bin/python
__version_info__ = ('1', '0', '0')
__version__ = '.'.join(__version_info__)
#############################################################################
# Phase 0. Initialization
# Doing stuff like parsing arguments, and reading the files.
...
...
@@ -14,7 +18,8 @@ dataset = DataSet(
    ' Biometris, Wageningen University and Research.',
    description='Creates an MCRA dataset from the '
    + 'Processing Factors database on EFSA Zendono.',
-    epilog='For example: use %(prog)s -v -x for a verbose example.')
+    epilog='For example: use %(prog)s -v -x for a verbose example.',
+    version=__version__)
#
# URL source file
...
...
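The %(prog)s placeholder in the epilog is expanded by argparse itself when help text is rendered; a minimal demonstration:

from argparse import ArgumentParser

parser = ArgumentParser(
    description='Demo parser.',
    epilog='For example: use %(prog)s -v -x for a verbose example.')
parser.print_help()  # %(prog)s is replaced by the script name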
@@ -118,11 +123,11 @@ efsa = efsa.merge(
dataset.processing_factor.sheet = efsa[efsa["idProcessingType"] != "-"]
# Before we can use data from the output files (e.g. hash), we first save it
dataset.save()
#
#############################################################################
# Phase 3. Report about the data.
-dataset.report = r'''CONVERSION REPORT FOR EFSA system FoodEx 2 EXCEL SHEET
+report = r'''CONVERSION REPORT FOR EFSA system FoodEx 2 EXCEL SHEET
------------------------------------------------------
* Script: ''' + dataset.scriptname + r'''
...
...
@@ -133,9 +138,9 @@ dataset.report = r'''CONVERSION REPORT FOR EFSA system FoodEx 2 EXCEL SHEET
'''
for data in dataset:
    if data.direction == 'Output':
-        dataset.report += textwrap.indent(data.get_report(), PY_INDENT)
+        report += textwrap.indent(data.get_report(), PY_INDENT)
-dataset.report += r'''
+report += r'''
EFSA Excel input details
========================
...
...
@@ -143,7 +148,13 @@ EFSA Excel input details
'''
for data in dataset:
    if data.direction == 'Input':
-        dataset.report += data.get_report()
+        report += data.get_report()
#
# Writing everthing that's left now
+# Here's a self generated report
+# dataset.report = report
+# dataset.close()
+# Here's an auto generated report
+# Uncomment lines 121 and 149, 150 and comment line below
+# to get original report back.
-dataset.close()
+dataset.close(file_report=True)
Convert-EUProcessingFactorsDB/Example/.gitkeep deleted (100644 → 0, view file @ 74a7ea2f)
Convert-EUProcessingFactorsDB/Input/.gitignore new (0 → 100644, view file @ f0f08a46)
# Ignore everything in this directory
*
# Except this file
!.gitignore
# The default files
!FoodCompositions.xlsx
!FoodTranslations.csv
!ProcessingTypes.csv
!ProcTypeTranslations.csv
!SubstanceTranslations.csv
\ No newline at end of file
Convert-EUProcessingFactorsDB/Input/.gitkeep deleted (100644 → 0, view file @ 74a7ea2f)
Convert-EUProcessingFactorsDB/Example/FoodCompositions.xlsx → Convert-EUProcessingFactorsDB/Input/FoodCompositions.xlsx (view file @ f0f08a46)
File moved

Convert-EUProcessingFactorsDB/Example/FoodTranslations.csv → Convert-EUProcessingFactorsDB/Input/FoodTranslations.csv (view file @ f0f08a46)
File moved

Convert-EUProcessingFactorsDB/Example/ProcTypeTranslations.csv → Convert-EUProcessingFactorsDB/Input/ProcTypeTranslations.csv (view file @ f0f08a46)
File moved

Convert-EUProcessingFactorsDB/Example/ProcessingTypes.csv → Convert-EUProcessingFactorsDB/Input/ProcessingTypes.csv (view file @ f0f08a46)
File moved

Convert-EUProcessingFactorsDB/Example/SubstanceTranslations.csv → Convert-EUProcessingFactorsDB/Input/SubstanceTranslations.csv (view file @ f0f08a46)
File moved
Convert-EUProcessingFactorsDB/dataconversion.py (view file @ f0f08a46)
...
...
@@ -15,6 +15,10 @@ import math
import sys
import textwrap
+import getpass
+import re
+__version_info__ = ('0', '9', '2')
+__version__ = '.'.join(__version_info__)
# For debugging purposes
# from objbrowser import browse
...
...
@@ -114,6 +118,16 @@ class McraAccessor:
        elif ext == '.xlsx':
            self._obj.to_excel(filename, sheet_name='Dump', index=False)
+    def dup_reggroups(self, column, regex):
+        temp_col = column + '__temp__'
+        dups = self._obj[column].str.extractall(regex)
+        dups[temp_col] = dups.values.tolist()
+        dups = dups.reset_index(level=[1])
+        self._obj = self._obj.join(
+            dups[temp_col]).explode(temp_col).reset_index(drop=True)
+        self._obj.loc[(self._obj[temp_col].notna()), column] = \
+            self._obj[temp_col]
+        self._obj.drop(columns=temp_col, inplace=True)
+        return self._obj
class DataFile:
    '''
...
...
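The new dup_reggroups helper builds on str.extractall, which yields one row per regex match under a (row, match) MultiIndex, then joins the matches back onto the frame; the underlying pandas pattern with toy data:

import pandas as pd

df = pd.DataFrame({'code': ['A1 B2', 'C3']})
matches = df['code'].str.extractall(r'(?P<part>[A-Z]\d)')
# matches is indexed by (original row, match number):
#   (0, 0) -> A1, (0, 1) -> B2, (1, 0) -> C3
widened = df.join(matches.reset_index(level=1)['part'])
# joining back on the original index duplicates row 0, one copy per match
print(widened)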
@@ -251,10 +265,8 @@ class DataSheet:
        report = ''
        if self.direction == 'Input' or \
                (self.direction == 'Output' and self.closed):
-            report += '* {dir} file: {file}\n'.format(
-                dir=self.direction, file=os.path.split(self.file.path)[1])
-            report += textwrap.indent(
-                '* [{path}]({path})\n'.format(path=self.file.path), PY_INDENT)
+            filename = os.path.split(self.file.path)[1]
+            report += f'* {self.direction} file: [{filename}]({filename})\n'
            report += textwrap.indent(
                '* {props}\n'.format(props=self.properties), PY_INDENT)
            report += textwrap.indent(
...
...
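The get_report refactor swaps str.format for an f-string and folds the separate link line into the file line; side by side with illustrative values:

import os

path = 'Output/ProcessingFactors.csv'
direction = 'Output'
filename = os.path.split(path)[1]
old_line = '* {dir} file: {file}\n'.format(dir=direction, file=filename)
new_line = f'* {direction} file: [{filename}]({filename})\n'
# the new form also wraps the name in a Markdown link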
@@ -360,44 +372,64 @@ class DataSheet:
        # # Save report
        # with open(self.file.reportpath, 'w+') as f:
        #     f.write(self.report)
-        print('Output file: {file}; {props}'.format(
-            file=self.file.path, props=self.properties))
+        if '-v' in sys.argv or '--verbosity' in sys.argv:
+            print(f'Output file: {self.file.path}; {self.properties}')
class DataSet:
-    def __init__(self, opening=None, description=None, epilog=None):
+    def __init__(self, opening=None, description=None, epilog=None,
+                 version=False):
        self.args = None
+        self.list = []
+        # Whether or not to create a zip file
+        self.zip = None
+        # The report for the entire dataset
+        self.report = ''
+        self.runtime = datetime.now().strftime('%H:%M:%S, %d %b %Y')
+        self.runcommand = ' '.join(sys.argv)
+        self.runarguments = ' '.join(sys.argv[1:])
+        self.usedarguments = None
+        self.scriptname = os.path.split(sys.argv[0])[1]
+        md5_hash = hashlib.md5()
+        with open(sys.argv[0], "rb") as f:
+            # Read and update hash in chunks of 4K
+            for byte_block in iter(lambda: f.read(4096), b""):
+                md5_hash.update(byte_block)
+        self.scripthash = md5_hash.hexdigest()
+        m = re.match('(.*)\-(?P<noun>.*)\.py', self.scriptname)
+        if m:
+            self.scriptnoun = m.group('noun')
+        else:
+            self.scriptnoun = self.scriptname.replace('.py', '')
+        self.runuser = getpass.getuser()
        self.parser = ArgumentParser(description=description, epilog=epilog)
+        report = 'Output\\Report.md'
        self.parser.add_argument(
            '-r', '--report', nargs='?',
-            const='Output\\Report.md',
+            const=report, default=report,
            help='Creates a report file (default: %(const)s).')
        # The verbosity argument will accept: -v, or -vv, -vvv etc.
+        # Set default to 1, so that basic output will always appear.
        self.parser.add_argument(
            '-v', '--verbosity', help="Show verbose output",
-            action="count", default=0)
+            action="count", default=1)
-        self.parser.add_argument(
-            '-x', '--example', action='store_const', const='Example',
-            help='Uses input files from the %(const)s subdir.')
+        # self.parser.add_argument(
+        #     '-x', '--example', action='store_const', const='Example',
+        #     help='Uses input files from the %(const)s subdir.')
+        if version:
+            self.version = version
+        else:
+            self.version = __version__
+        zip = f'Build\\{self.scriptnoun}.{version}.zip'
        self.parser.add_argument(
-            '-z', '--zip', nargs='?', const='Output\\Output.zip',
+            '-z', '--zip', nargs='?', const=zip, default=zip,
            help='Creates a zip file %(const)s containing all output.'
            + ' (default: %(const)s).')
        if '-v' in sys.argv or '--verbosity' in sys.argv:
            print(opening)
-        self.list = []
-        # Whether or not to create a zip file
-        self.zip = None
-        # The report for the entire dataset
-        self.report = ''
-        self.runtime = datetime.now().strftime('%H:%M:%S, %d %b %Y')
-        self.runcommand = ' '.join(sys.argv)
-        self.runarguments = ' '.join(sys.argv[1:])
-        self.scriptname = os.path.split(sys.argv[0])[1]
-        self.runuser = getpass.getuser()
        # It is usefull to be able to iterate over all the datasheets.
        # Basically, avoid using .list. in all DataSet references.
...
...
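The constructor fingerprints the running script with a chunked MD5 read, so the whole file never has to sit in memory; the same stdlib pattern in isolation:

import hashlib
import sys

md5_hash = hashlib.md5()
with open(sys.argv[0], 'rb') as f:
    # Read and update hash in chunks of 4K
    for byte_block in iter(lambda: f.read(4096), b''):
        md5_hash.update(byte_block)
print(md5_hash.hexdigest())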
@@ -466,6 +498,7 @@ class DataSet:
    def init(self):
        # Initializes the command line parameters
        self.args = self.parser.parse_args()
+        self.usedarguments = self.args.__dict__
        for datasetname in self.list:
            dataset = getattr(self, datasetname)
...
...
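Recording self.args.__dict__ works because argparse stores every parsed option as an attribute on its Namespace; combined with the commit's new count-style verbosity default this behaves as follows:

from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument('-v', '--verbosity', action='count', default=1)
args = parser.parse_args(['-vv'])
print(args.__dict__)  # {'verbosity': 3}: default 1 plus one per -v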
@@ -473,15 +506,11 @@ class DataSet:
                datasetfilename = dataset.file.default_name
            else:
                datasetfilename = getattr(self.args, datasetname + '_file')
+            self.usedarguments[datasetname + '_file'] = datasetfilename
            if dataset.direction == 'Input':
-                if self.args.example:
-                    dataset.file.suggest(datasetfilename,
-                                         force_dir=self.args.example)
-                else:
-                    dataset.file.suggest(datasetfilename)
+                dataset.file.suggest(datasetfilename)
                if urlparse(dataset.file.suggested).netloc:
                    if (not dataset.file.exist) \
                            or ((dataset.checksum is not None)
...
...
@@ -522,11 +551,22 @@ class DataSet:
                base, ext = os.path.splitext(datasetfilename)
                dataset.file.suggest(datasetfilename)
            dataset.update_properties()
-        if self.args.zip:
-            # Create a zip file containing everything
-            basezip, extzip = os.path.splitext(self.args.zip)
-            # Setting self.zip indicates creating a zip file
-            self.zip = basezip + '.zip'
+            os.makedirs(os.path.dirname(os.path.abspath(dataset.file.path)),
+                        exist_ok=True)
+        # Make sure we can create the report
+        os.makedirs(os.path.dirname(os.path.abspath(self.args.report)),
+                    exist_ok=True)
+        # Always create a zip file containing everything
+        # First make sure the directory exists
+        zippath = os.path.dirname(os.path.abspath(self.args.zip))
+        os.makedirs(zippath, exist_ok=True)
+        basezip, extzip = os.path.splitext(self.args.zip)
+        # Setting self.zip indicates creating a zip file
+        self.zip = basezip + '.zip'

    def save(self):
        for data in self:
...
...
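The new directory bootstrapping relies on os.makedirs(..., exist_ok=True), which creates intermediate directories and is a no-op when they already exist:

import os

zippath = os.path.dirname(os.path.abspath('Build/Output.zip'))
os.makedirs(zippath, exist_ok=True)  # safe to call repeatedly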
@@ -545,11 +585,30 @@ class DataSet:
        report_content += textwrap.indent(
            f'* Command line: {self.runcommand}\n', PY_INDENT)
-        report_content += textwrap.indent(
-            f'* Arguments: {self.runarguments}\n', PY_INDENT)
+        report_content += textwrap.indent(
+            f'* Filename: {self.scriptname}\n', PY_INDENT)
+        report_content += textwrap.indent(
+            f'* Command line Arguments: {self.runarguments}\n', PY_INDENT)
+        report_content += textwrap.indent(
+            '* Arguments executed:\n', PY_INDENT)
+        for key, value in self.usedarguments.items():
+            if value is None:
+                report_content += textwrap.indent(
+                    f'* --{key}\n', 2 * PY_INDENT)
+            else:
+                report_content += textwrap.indent(
+                    f'* --{key} {value}\n', 2 * PY_INDENT)
+        report_content += textwrap.indent(
+            f'* Hash: {self.scripthash}\n', PY_INDENT)
        report_content += textwrap.indent(
            f'* Executed at: {self.runtime}\n', PY_INDENT)
        report_content += textwrap.indent(
            f'* Executed by: {self.runuser}\n', PY_INDENT)
+        report_content += textwrap.indent(
+            f'* Version: {self.version}\n', PY_INDENT)
        report_content += textwrap.indent(
            f'* Depends upon module: {__name__}\n', PY_INDENT)
        report_content += textwrap.indent(
            f'* With version: {__version__}\n', 2 * PY_INDENT)
        for data in self:
            if data.direction == 'Input':
                report_content += data.get_report()
...
...
Convert-EUProcessingFactorsDB/mcra.py deleted (100644 → 0, view file @ 74a7ea2f)
from argparse import ArgumentParser, SUPPRESS
import pandas as pd
from datetime import datetime
from urllib.parse import urlparse
import os  # path, mkdir, walk
import time  # ctime
import types
import uuid
import zipfile
import requests
import hashlib
import numpy as np
import math
import sys
import textwrap
# For debugging purposes
# from objbrowser import browse

PY_INDENT = '  '

@pd.api.extensions.register_dataframe_accessor('mcra')
class McraAccessor:
    '''
    This is an extension of the panda object model.
    Some often used functions are added here.
    '''
    def __init__(self, pandas_obj):
        self._obj = pandas_obj

        def here_concat(*args):
            '''
            To easily join two columns
            '''
            strs = [str(arg) for arg in args if not pd.isnull(arg)]
            return '-'.join(strs) if strs else np.nan
        self.concat = np.vectorize(here_concat)

    def copycolumn(self, columnnames):
        '''
        To easily copy a bunch of columns
        '''
        for fromcol, tocol in columnnames.items():
            self._obj[tocol] = self._obj[fromcol]

    def addcolumn(self, columnnames):
        '''
        To easily add a bunch of empty columns
        '''
        for col in columnnames:
            self._obj[col] = ''

    def splitjoin(self, name, split, join, split_sep='-',
                  right_split=True, join_sep='-'):
        '''
        Splits a column, and then joins the result with another column
        '''
        # Due to the SettingWithCopyWarning we do it a bit cumbersome
        df = pd.DataFrame()
        df[join] = self._obj[join]
        if right_split:
            df[split] = self._obj[split].str.rsplit(split_sep, n=1).str[1]
        else:
            df[split] = self._obj[split].str.rsplit(split_sep, n=1).str[0]
        df[name] = df.loc[:, (join, split)].apply(
            lambda x: '-'.join(x.dropna()), axis=1)
        df = df.drop([join, split], axis=1)
        # Not ideal yet, but slightly better than it used to be....
        self._obj = self._obj.merge(df, left_index=True, right_index=True)
        return self._obj

    def join(self, name, join_left, join_right, sep='-'):
        '''
        joins with another column
        '''
        # Due to the SettingWithCopyWarning we do it a bit cumbersome
        df = pd.DataFrame()
        df[[join_left, join_right]] = self._obj[[join_left, join_right]]
        df[name] = df.loc[:, (join_left, join_right)].apply(
            lambda x: sep.join(x.dropna()), axis=1)
        df = df.drop([join_left, join_right], axis=1)
        # Not ideal yet, but slightly better than it used to be....
        self._obj = self._obj.merge(df, left_index=True, right_index=True)
        return self._obj

    def dump(self, filename):
        '''
        For debugging purposes, to dump a file from memory a bit more easily
        '''
        base, ext = os.path.splitext(filename)
        print('Dump file : {file}.'.format(file=filename))
        if ext == '.csv':
            self._obj.to_csv(path_or_buf=filename, index=False)
        elif ext == '.tsv':
            self._obj.to_csv(path_or_buf=filename, index=False, sep='\t')
        elif ext == '.xlsx':
            self._obj.to_excel(filename, sheet_name='Dump', index=False)
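The deleted module's central trick, carried over into dataconversion.py, is pandas' accessor extension point: register a class under a name and its methods become available on every DataFrame. A trimmed, runnable version with a hypothetical 'demo' accessor:

import pandas as pd

@pd.api.extensions.register_dataframe_accessor('demo')
class DemoAccessor:
    def __init__(self, pandas_obj):
        self._obj = pandas_obj

    def addcolumn(self, columnnames):
        # Same idea as McraAccessor.addcolumn: add empty columns in bulk.
        for col in columnnames:
            self._obj[col] = ''

df = pd.DataFrame({'a': [1, 2]})
df.demo.addcolumn(['b', 'c'])
print(df.columns.tolist())  # ['a', 'b', 'c']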
class DataFile:
    '''
    A class to work with the files more streamlined.
    Contains technical details just to use the files in a simple manner.
    :param default_name: The default name for the file, can also
        determine the output name/sheet.
    :param default_dir: The default directory in which to place the file
    :param checksum: If given, a file can be checked/reused
    :param necessary: Whether the file is necessary or not.
    '''
    def __init__(self, default_name, default_dir, necessary=True):
        self.default_name = default_name
        self.default_base = os.path.splitext(self.default_name)[0]
        self.default_dir = default_dir
        self.path = None
        self.directory = None
        self.reportpath = None
        self.zippath = None
        self.suggested = None
        self.exist = False
        self.modified = ''
        self.extension = None
        self.size = 0
        self.size_string = ''
        self.hash = ''
        self.hash_short = ''
        self.checksum = None
        self.necessary = necessary

    def update(self):
        '''
        Updates file properties, e.g. for output files.
        '''
        if os.path.exists(self.path) and os.path.isfile(self.path):
            self.exist = True
            self.modified = time.ctime(os.path.getmtime(self.path))
            self.size = os.path.getsize(self.path)
            self.size_string = self.__converttoprefix(self.size)
            self.hash = str(self.__md5_hash())
            self.hash_short = self.hash[0:8]

    def __converttoprefix(self, bytes):
        '''
        Private function to have some nice formatting of filesizes
        '''
        if bytes <= 1024:
            return '{0:.0f} B'.format(bytes)
        else:
            power = math.floor(math.log(bytes, 1024))
            factor = math.pow(1024, power)