Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
PPS
PPS Data Management
Commits
30ebdaa3
Commit
30ebdaa3
authored
Jul 02, 2020
by
Languillaume, Antoine
Browse files
Delete useless scripts
Functions and previous metadata extractrion.
parent
94be7951
Changes
2
Hide whitespace changes
Inline
Side-by-side
scripts/extract_metadata_prev.R
deleted
100644 → 0
View file @
94be7951
library
(
openxlsx
)
path.data.raw
<-
"./data/raw"
data.list.raw
<-
list.files
(
path.data.raw
)
path.data.proc
<-
"./data/processed/"
data.list.proc
<-
list.files
(
path.data.proc
)
# wb = data.list.raw[2]
for
(
wb
in
data.list.raw
){
if
(
grepl
(
".xlsx"
,
wb
)){
wb.name
=
gsub
(
".xlsx"
,
""
,
wb
)
file
=
paste
(
path.data.raw
,
wb
,
sep
=
"/"
)
sheets
=
getSheetNames
(
file
)
}
if
(
grepl
(
".csv"
,
wb
)){
wb.name
=
gsub
(
".csv"
,
""
,
wb
)
file
=
paste
(
path.data.raw
,
wb
,
sep
=
"/"
)
sheets
=
"data"
}
##create workbook to store data and metadata
meta.wb.name
=
paste
(
wb.name
,
"_metadata"
,
sep
=
""
)
meta.wb
<-
createWorkbook
(
meta.wb.name
)
##store variable names
var.names
=
c
()
unit.vec
=
c
()
sheet.names
=
c
()
# for(st in sheets){
st
=
sheets
[
1
]
if
(
grepl
(
".xlsx"
,
wb
)){
t1
<-
read.xlsx
(
file
,
detectDates
=
T
,
sheet
=
st
,
colNames
=
F
)
}
if
(
grepl
(
".csv"
,
wb
)){
t1
<-
read.csv
(
file
,
header
=
F
,
as.is
=
T
)
}
##skip empty worksheets
if
(
is.null
(
t1
))
next
##count NA to detect header in excel sheet
#na count function that sets "" to NA (for csv)
na.count.fun
=
function
(
x
){
x
[
which
(
x
==
""
)]
=
NA
out
=
sum
(
!
is.na
(
x
))
return
(
out
)
}
na.count
=
apply
(
t1
,
1
,
na.count.fun
)
ncol
=
max
(
na.count
)
head.row
=
which
(
na.count
==
ncol
)[
1
]
##make dataframe from sheet with colum names
colnames
=
as.character
(
t1
[
head.row
,])
###remove NA from names
miss.name
=
which
(
is.na
(
colnames
)
|
colnames
==
"NA"
)
colnames
[
miss.name
]
=
paste
(
"X"
,
1
:
length
(
miss.name
),
sep
=
""
)
###fish units from names using $ ##when units flag that specification of methods and defenitions of measurements (e.g. reported vs measured, dr vs fresh weight.)
colnames
=
lapply
(
colnames
,
function
(
x
)
unlist
(
strsplit
(
x
,
split
=
"\\$"
)))
contains.unit
=
unlist
(
lapply
(
colnames
,
function
(
x
)
length
(
x
)
>
1
))
units
=
rep
(
""
,
length
(
colnames
))
units
[
contains.unit
]
=
unlist
(
lapply
(
colnames
[
contains.unit
],
function
(
x
)
x
[
2
]))
colnames
=
unlist
(
lapply
(
colnames
,
function
(
x
)
x
[
1
]))
new.frame
=
t1
[(
head.row
+1
)
:
nrow
(
t1
),
1
:
length
(
colnames
)]
colnames
(
new.frame
)
=
colnames
var.names
=
c
(
var.names
,
colnames
)
unit.vec
=
c
(
unit.vec
,
units
)
sheet.names
=
c
(
sheet.names
,
rep
(
st
,
length
(
colnames
)))
addWorksheet
(
meta.wb
,
st
)
writeData
(
meta.wb
,
st
,
new.frame
)
# }
##make data frame with workbook name, sheet names and variables and add column for definitions
var.frame
=
data.frame
(
"workbook"
=
wb.name
,
"sheet"
=
sheet.names
,
"variable"
=
var.names
,
"unit"
=
unit.vec
,
"definition"
=
NA
,
"unique identifier"
=
0
,
"personal information"
=
0
,
stringsAsFactors
=
F
)
#set up metadata
#mid.names=c("ID","Country","Name region","Name site","Minimum latitude","Maximum latitude", "Minimum longitude","Maximum longitude","Experiment/survey","Type of experiment","Type of survey","On-farm/on-station","Crops","Animals","Soil type")
#meta.data.frame=read.xlsx(paste(path.data.op,"Required_Metadata.xlsx",sep="/"))
field.vec
=
c
(
"Data ID"
,
"Official title of the dataset"
,
"Project name"
,
"Description of project"
,
"Author"
,
"Author ID(ORCID)"
,
"Contributor(s)"
,
"Subject matter of research/Vocabulary"
,
"Data origin"
,
"Funder(s) or sponsor(s) of project"
,
"creation date (m/d/yyyy)"
,
"Embargo end date"
,
"Citation"
,
"keywords (AGROVOC)"
,
"Country(ies) covered"
,
"Point longitude coord. in Dec. Degrees "
,
"Agro-Ecological Zone(s)(FAO) covered"
,
"Years covered by data"
,
"Crops covered by data"
,
"Animals covered by data"
,
"Start date of data collection "
,
"End date of data collection "
,
"License (default=CC-BY)"
,
"Permission given by email"
,
"Rights"
,
"Contact email"
)
field_name.vec
=
c
(
"data.id"
,
"data.title"
,
"project.name"
,
"project.description"
,
"author"
,
"orcid"
,
"contributors"
,
"subject.research"
,
"data.origin"
,
"donor"
,
"date.creation"
,
"date.embargo"
,
"citation"
,
"keywords.agrovoc"
,
"countries"
,
"longitude"
,
"aez"
,
"years"
,
"crops"
,
"animals"
,
"date.collect.start"
,
"date.collect.end"
,
"licence"
,
"permission"
,
"rights"
,
"contact.mail"
)
meta.data.frame
=
data.frame
(
field
=
field.vec
,
field_name
=
field_name.vec
,
values
=
NA
,
stringsAsFactors
=
F
)
###check if workbook with variable definitions and metadata already present in processed and make sure not overwritten
exist.wb
=
data.list.proc
[
which
(
data.list.proc
==
paste
(
wb.name
,
"_metadata.xlsx"
,
sep
=
""
))]
if
(
length
(
exist.wb
)
>
0
){
##read.data and extract
exist.file
=
paste
(
path.data.proc
,
exist.wb
,
sep
=
"/"
)
existing.varframe
=
read.xlsx
(
exist.file
,
detectDates
=
T
,
sheet
=
"variable definitions"
)
existing.meta.data.frame
=
read.xlsx
(
exist.file
,
detectDates
=
T
,
sheet
=
"meta data"
)
existing.meta.data.frame.id
=
apply
(
existing.meta.data.frame
[,
1
:
2
],
1
,
paste
,
collapse
=
";"
)
meta.data.frame.id
=
apply
(
meta.data.frame
[,
1
:
2
],
1
,
paste
,
collapse
=
";"
)
existing.varframe.id
=
apply
(
existing.varframe
[,
1
:
3
],
1
,
paste
,
collapse
=
";"
)
var.frame.id
=
apply
(
var.frame
[,
1
:
3
],
1
,
paste
,
collapse
=
";"
)
##overwrite new frames with existing values
meta.data.frame
[
na.omit
(
match
(
existing.meta.data.frame.id
,
meta.data.frame.id
)),]
<-
existing.meta.data.frame
[
na.omit
(
match
(
meta.data.frame.id
,
existing.meta.data.frame.id
)),]
var.frame
[
na.omit
(
match
(
existing.varframe.id
,
var.frame.id
)),]
<-
existing.varframe
[
na.omit
(
match
(
var.frame.id
,
existing.varframe.id
)),]
##rbind additional rows
meta.data.frame
=
rbind
(
meta.data.frame
,
existing.meta.data.frame
[
which
(
!
existing.meta.data.frame.id
%in%
meta.data.frame.id
),])
var.frame
=
rbind
(
var.frame
,
existing.varframe
[
which
(
!
existing.varframe.id
%in%
var.frame.id
),])
}
##now add metadata to metadata workbook
addWorksheet
(
meta.wb
,
"meta data"
)
addWorksheet
(
meta.wb
,
"variable definitions"
)
writeData
(
meta.wb
,
"meta data"
,
meta.data.frame
)
writeData
(
meta.wb
,
"variable definitions"
,
var.frame
)
##now save workbook
saveWorkbook
(
meta.wb
,
file
=
paste
(
path.data.proc
,
meta.wb.name
,
".xlsx"
,
sep
=
""
),
overwrite
=
T
)
##later do not overrwrite but make sure merged/aggregated
}
scripts/funcs.R
deleted
100644 → 0
View file @
94be7951
##### Minimal Reproducible Analysis Example
##### Functions
#' Detect unique identifier columns
#'
#' Given a data.frame properly setup with units appended to measured variables,
#' this function will detect unique identifiers in the data.frame and return
#' a vector of 0 and 1s:
#' - 0: column is NOT an unique identifier.
#' - 1: column is an unique identifier.
#'
#' @param df A data.frame with with units appended to measured variables.
#' variable name and unit are separated by a dollar sign: variable$unit.
#'
#' @return An integer vector of 0 and 1 indicating if a given column
#' is a unique identifier or not.
detect_unique_ID
<-
function
(
df
)
{
facts
<-
df
[
grep
(
".+\\$.+"
,
names
(
df
),
invert
=
TRUE
)]
quants_names
<-
names
(
d
)[
!
(
names
(
d
)
%in%
names
(
facts
))]
quants
<-
rep
(
0
,
length
(
quants_names
))
names
(
quants
)
<-
quants_names
facts_is_ID
<-
purrr
::
map_int
(
facts
,
~
dplyr
::
n_distinct
(
.x
)
==
length
(
.x
))
int_ID
<-
c
(
facts_is_ID
,
quants
)
int_ID
<-
int_ID
[
names
(
df
)]
return
(
int_ID
)
}
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment