Commit 5eedcbfb authored by bob's avatar bob
Browse files

Refactor metadata extraction script

- Break the logic into adequately named functions
- Add documentation using Roxygen syntax
parent a5ba53e9
### -------------------------------------------------------------------------------------------- ###
# PPS Data Management Plan companion script to create excel workbooks including proper meta data #
### -------------------------------------------------------------------------------------------- ###
### -------------------------------------------------------------------------- ###
# PPS Data Management Plan companion script to create excel workbooks
# including proper meta data
### -------------------------------------------------------------------------- ###
##### Required package(s) --------------------------------------------------------------------------
# To write data to an excel workbook
library(openxlsx)
### Functions ------------------------------------------------------------------ #
##### Meta data template ---------------------------------------------------------------------------
#' Create template of metadata sheet
#'
#' Once the data process the user can manually update the metadata sheet
#' in the resulting excel workbook.
#'
#' @return A data.frame with columns field, field_name and values.
#' This last is to be filled manually.
instantiate_meta_data_template <- function() {
field <- c("Data ID", "Official title of the dataset", "Project name",
field <- c("Data ID", "Official title of the dataset", "Project name",
"Description of project", "Author", "Author ID(ORCID)",
"Contributor(s)", "Subject matter of research/Vocabulary", "Data origin", "Funder(s) or sponsor(s) of project",
"creation date (m/d/yyyy)", "Embargo end date", "Citation",
......@@ -20,7 +26,7 @@ field <- c("Data ID", "Official title of the dataset", "Project name",
"Animals covered by data", "Start date of data collection", "End date of data collection",
"License (default=CC-BY)", "Permission given by email", "Rights", "Contact email")
field_name <- c("data.id", "data.title", "project.name", "project.description",
field_name <- c("data.id", "data.title", "project.name", "project.description",
"author", "orcid", "contributors", "subject.research",
"data.origin", "donor", "date.creation", "date.embargo",
"citation", "keywords.agrovoc", "countries", "longitude",
......@@ -28,61 +34,137 @@ field_name <- c("data.id", "data.title", "project.name", "project.description",
"date.collect.start", "date.collect.end", "licence", "permission",
"rights", "contact.mail")
meta_data <- data.frame(field = field,
meta_data <- data.frame(field = field,
field_name = field_name,
values = NA,
stringsAsFactors = FALSE)
return(meta_data)
}
##### Raw file processing --------------------------------------------------------------------------
#' Extract file extension
#'
#' @param file A file object yielded by `read_raw_data_file()`.
#' @return A character indicating the file extension.
extract_file_extension <- function(file) {
regmatches(file, regexpr("(?<=\\.)[a-z]+$", file, perl = TRUE))
}
## Make a list of all files in ./data/raw
raw_data_files <- list.files("./data/raw", full.names = TRUE)
### Iterate through the file list
for (file in raw_data_files) {
#' Make a list of raw data files in projects
#'
#' @param path A character string indicating the relative path
#' to the folder where raw data is stored.
#' By default raw data is assumed to be in "data/raw/".
#' @return A list of data.frame. Each data.frame as two columns: path and extension.
#' Respectively holding the relative path to the file and the file extension.
#' Later this list of data.frame is referred to as a *file object*.
list_raw_data_files <- function(path = "data/raw/") {
paths <- list.files(path, full.names = TRUE)
extensions <- vapply(paths, extract_file_extension,
FUN.VALUE = character(1), USE.NAMES = FALSE)
raw_data_files <- data.frame(path = paths,
extension = extensions)
raw_data_files <- split(raw_data_files, 1:nrow(raw_data_files))
return(raw_data_files)
}
print(sprintf("Processing: %s", file))
## Extract the file extension
file_extension <- regmatches(file, regexpr("(?<=\\.)[a-z]+$", file, perl = TRUE))
## Extract file name and eventually worksheet names
if (file_extension == "csv") {
file_name <- gsub("\\.csv$", "", basename(file))
sheets <- "data"
} else if (file_extension == "xlsx") {
file_name <- gsub("\\.xlsx$", "", basename(file))
sheets <- getSheetNames(file)
} else {
warning(sprintf("File extension not supported: %s, file skipped.", file_extension))
next
}
## Create the final name of the workbook which will hold the file data and metadata
meta_wb_name <- paste0(file_name, "_metadata")
# ... and a workbook template
meta_wb <- createWorkbook(meta_wb_name)
#' Extract file name
#' file name <=> string before file extension
#' @param file A file object yielded by read_raw_data_file().
#' @param supported_extensions A character vector of supported extensions.
#' @return A character corresponding to the string before file extension.
extract_file_name <- function(file, supported_extensions = c("csv", "xlsx")) {
extension_pattern <- paste0(supported_extensions, collapse = "|")
pattern <- sprintf("\\.(%s)$", extension_pattern)
gsub(pattern, "", basename(file$path))
}
## Create variables to hold relevant values to extract from the file
variable_col <- c() # variable names
unit_col <- c() # units
sheet_col <- c() # worksheet name
#' Given a file extract the data sheets
#'
#' @param file A file object yielded by read_raw_data_file().
#' @return Depending on the file$extension:
#' * if file$extension is "csv", a character string "data"
#' * if file$extension is "xlsx", a character vector with all sheet names.
extract_data_sheets <- function(file) {
switch(file$extension,
"csv" = "data",
"xlsx" = openxlsx::getSheetNames(file$path)
)
}
### Iterate through the worksheets
for (sheet in sheets) {
#' Instantiate a workbook object
#'
#' This is the most important function of the script. It defines the one major
#' object which is passed from function to function and encapsulate all required data.
#' The workbook object if the R skeleton of the future excel workbook that will contain
#' data, meta data and variable definitions.
#'
#' @param file A file object yielded by read_raw_data_file().
#' @param output_path A character string. Path where final workbook will written.
#' Default to "data/processed/"
#'
#' @return A list later referred to as *workbook object*, containing the
#' following fields:
#' * original_name [character]: name of the original file without extension.
#' * extension [character]: original file extension.
#' * path [charcater]: entire path including file name where workbook should be saved.
#' * workbook [openxlsx Workbook Object]
#' * variables [named list]: holding 3 fields: name, unit and sheet used to construct
#' variable definitions.
#' * sheets [character vector]: name of the workbook sheets to be processed.
instantiate_workbook <- function(file, output_path = "data/processed/") {
file_name <- extract_file_name(file)
workbook_name <- paste0(file_name, "_metadata")
full_name <- paste0(workbook_name, ".xlsx")
variables <- list(name = vector("character"),
unit = vector("character"),
sheet = vector("character"))
workbook <- list(original_name = file_name,
extension = file$extension,
path = file.path(output_path, full_name),
wb = createWorkbook(workbook_name),
variables = variables,
sheets = extract_data_sheets(file))
return(workbook)
}
#' Read data from a given sheet
#' @param file A file object yielded by read_raw_data_file().
#' @param sheet A character indicating sheet name.
#' @return A data.frame
read_data_sheet <- function(file, sheet) {
## Properly read in the file depending on its extension.
# The first line is skipped. Otherwise the special place holder '$'
# separating variable name and unit might get coerced to a '.'
if (file_extension == "csv") {
#dat <- read.csv(file, header = FALSE, fileEncoding = "UTF-8-BOM")
dat <- read.csv(file, skip = 1)
}
if (file_extension == "xlsx") {
dat <- read.xlsx(file, sheet = sheet, colNames = FALSE)
}
if (is.null(dat)) next
switch(file$extension,
"csv" = read.csv(file$path, header = FALSE, fileEncoding = "UTF-8-BOM"),
"xlsx" = read.xlsx(file$path, sheet = sheet, colNames = FALSE)
)
}
#' Process a given data sheet
#'
#' @param file A file object yielded by read_raw_data_file().
#' @param sheet A character indicating sheet name.
#'
#' @return A list later referred to as a *processed sheet object* containing
#' the following fields:
#' * name [character]: sheet name.
#' * variables [character vector]: variable names.
#' * units [character vector]: variables units.
#' * data [data.frame]: holding sheet data.
process_sheet <- function(file, sheet) {
dat <- read_data_sheet(file, sheet)
## From the first row of the file, extract...
header <- strsplit(as.character(dat[1, ]), "\\$")
......@@ -94,65 +176,142 @@ for (file in raw_data_files) {
# and delete the first row
dat <- dat[-1, ]
processed_sheet <- list(
name = sheet,
variables = variables,
units = units,
data = dat
)
return(processed_sheet)
}
#' Update workbook object with a given processed sheet
#' @param workbook A work book object.
#' @param processed_sheet A processed sheet object.
#' @return A workbook object updated with data from `processed_sheet`.
update_workbook_sheet_data <- function(workbook, processed_sheet) {
## When several sheets are present in your workbook
# append the relevant values one after the other
variable_col <- c(variable_col, variables)
unit_col <- c(unit_col, units)
sheet_col <- c(sheet_col, rep(sheet, length(variables)))
workbook$variables$name <- c(workbook$variables$name, processed_sheet$variables)
workbook$variables$unit <- c(workbook$variables$unit, processed_sheet$units)
sheet_vector <- rep(processed_sheet$name, length(processed_sheet$variables))
workbook$variables$sheet <- c(workbook$variables$sheet, sheet_vector)
## Add data sheet to the workbook template
addWorksheet(meta_wb, sheet)
# and write the data
writeData(meta_wb, sheet, dat)
addWorksheet(workbook$wb, processed_sheet$name)
writeData(workbook$wb, processed_sheet$name, processed_sheet$data)
return(workbook)
}
}
## Create the variable definition data.frame for that file
var_definitions <- data.frame("workbook" = file_name,
"sheet" = sheet_col,
"variable" = variable_col,
"unit" = unit_col,
#' Create variable definition data.frame
#' @param workbook A work book object.
#' @return A data.frame holding variable and units to be updated manually later on.
create_variable_definition_df <- function(workbook) {
var_definitions <- data.frame("workbook" = workbook$original_name,
"sheet" = workbook$variables$sheet,
"variable" = workbook$variables$name,
"unit" = workbook$variables$unit,
"definition" = NA,
"unique identifier" = 0,
"personal information" = 0,
stringsAsFactors = FALSE)
return(var_definitions)
}
## Create the full name (relative path included)
# of the final file holding both data and metadata
file_meta_data <- file.path("./data/processed/", paste0(meta_wb_name, ".xlsx"))
## Check if the final file already exist
if (file.exists(file_meta_data)) {
## If it does...
# Read in the current metadata and variable sheets
current_var_definitions <- read.xlsx(file_meta_data, sheet = "variable definitions")
current_meta_data <- read.xlsx(file_meta_data, sheet = "meta data")
# Update them with the potential new values
#' Update existing metadata and variable definition.
#'
#' In practice this function is called only if the final workbook already exists.
#' Its main goal is to prevent the necessary manual updates in metadata and variable
#' definition sheet from being discarded when the script is run again.
#'
#' @param workbook A work book object.
#' @param meta_data A data.frame containing metadata template.
#' @param var_definitions A data.frame containing variable definitions template.
#'
#' @return A list with the following fields:
#' * meta_data [data.frame]
#' * var_definitions [data.frame]
update_meta_data <- function(workbook, meta_data, var_definitions) {
current_var_definitions <- read.xlsx(workbook$path,
sheet = "variable definitions")
current_meta_data <- read.xlsx(workbook$path,
sheet = "meta data")
# Update current meta_data and var_definitions with the potential new values
var_definitions <- merge(current_var_definitions, var_definitions, all.x = TRUE)
meta_data <- merge(current_meta_data, meta_data, all.x = TRUE)
# Unfortunately merge() does not preserve the original order
# We need to set that back manually
var_definitions <- var_definitions[match(variable_col, var_definitions$variable), ]
meta_data <- meta_data[match(field, meta_data$field), ]
order_var_definitions <- match(current_var_definitions$variable,
var_definitions$variable)
var_definitions <- var_definitions[order_var_definitions, ]
order_meta_data <- match(current_meta_data$field, meta_data$field)
meta_data <- meta_data[order_meta_data, ]
updated_meta_data <- list(var_definitions = var_definitions,
meta_data = meta_data)
return(updated_meta_data)
}
#' Save workbook as a xlsx file
#'
#' Workbook is saved at the path specified in `workbook$path`.
#' By default "data/processed/". To modify the location where the workbook
#' is saved, you must modified the `output_path` argument when instantiating
#' the workbook with `instantiate_workbook()`.
#'
#' @param workbook A work book object.
#' @param meta_data A data.frame containing metadata template.
#' @param var_definitions A data.frame containing variable definitions template.
save_excel_file <- function(workbook, meta_data, var_definitions) {
addWorksheet(workbook$wb, "meta data")
addWorksheet(workbook$wb, "variable definitions")
writeData(workbook$wb, "meta data", meta_data)
writeData(workbook$wb, "variable definitions", var_definitions)
saveWorkbook(wb = workbook$wb,
file = workbook$path,
overwrite = TRUE)
}
### Raw file processing -------------------------------------------------------- #
raw_data_files <- list_raw_data_files()
for (file in raw_data_files) {
print(sprintf("Processing: %s", file$path))
if (!(file$extension %in% c("csv", "xlsx"))) {
warning(sprintf("File extension not supported: %s, file skipped.", file$extension))
}
workbook <- instantiate_workbook(file)
## Add meta data and variable definition sheets to the workbook template
addWorksheet(meta_wb, "meta data")
addWorksheet(meta_wb, "variable definitions")
writeData(meta_wb, "meta data", meta_data)
writeData(meta_wb, "variable definitions", var_definitions)
for (sheet in workbook$sheets) {
processed_sheet <- process_sheet(file, sheet)
workbook <- update_workbook_sheet_data(workbook, processed_sheet)
}
## Finally save the newly created workbook
saveWorkbook(wb = meta_wb,
file = file_meta_data,
overwrite = TRUE)
var_definitions <- create_variable_definition_df(workbook)
meta_data <- instantiate_meta_data_template()
if (file.exists(workbook$path)) {
updated_meta_data <- update_meta_data(workbook, meta_data, var_definitions)
meta_data <- updated_meta_data$meta_data
var_definitions <- updated_meta_data$var_definitions
}
save_excel_file(workbook = workbook,
meta_data = meta_data,
var_definitions = var_definitions)
}
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment