From 5eedcbfb694feac8c50e30b1e80ea50d190f6b0e Mon Sep 17 00:00:00 2001
From: bob <antoinelanguillaume@gmail.com>
Date: Tue, 2 Mar 2021 08:26:29 +0100
Subject: [PATCH] Refactor metadata extraction script

- Break the logic into adequately named functions
- Add documentation using Roxygen syntax
---
 scripts/1_extract_metadata.R | 405 ++++++++++++++++++++++++-----------
 1 file changed, 282 insertions(+), 123 deletions(-)

diff --git a/scripts/1_extract_metadata.R b/scripts/1_extract_metadata.R
index c46759b..87c31ea 100644
--- a/scripts/1_extract_metadata.R
+++ b/scripts/1_extract_metadata.R
@@ -1,158 +1,317 @@
-### -------------------------------------------------------------------------------------------- ###
-# PPS Data Management Plan companion script to create excel workbooks including proper meta data   #
-### -------------------------------------------------------------------------------------------- ###
+### -------------------------------------------------------------------------- ###
+# PPS Data Management Plan companion script to create excel workbooks
+# including proper meta data
+### -------------------------------------------------------------------------- ###
 
+library(openxlsx)
 
-##### Required package(s) --------------------------------------------------------------------------
+### Functions ------------------------------------------------------------------ #
+
+
+#' Create template of metadata sheet
+#'
+#' Once the data process the user can manually update the metadata sheet
+#' in the resulting excel workbook.
+#'
+#' @return A data.frame with columns field, field_name and values.
+#' This last is to be filled manually.
+instantiate_meta_data_template <- function() {
+
+  field <- c("Data ID", "Official title of the dataset", "Project name",
+             "Description of project", "Author", "Author ID(ORCID)",
+             "Contributor(s)", "Subject matter of research/Vocabulary", "Data origin", "Funder(s) or sponsor(s) of project",
+             "creation date (m/d/yyyy)", "Embargo end date", "Citation",
+             "keywords (AGROVOC)", "Country(ies) covered", "Point longitude coord. in Dec. Degrees",
+             "Agro-Ecological Zone(s)(FAO) covered", "Years covered by data", "Crops covered by data",
+             "Animals covered by data", "Start date of data collection", "End date of data collection",
+             "License (default=CC-BY)", "Permission given by email", "Rights", "Contact email")
+
+  field_name <- c("data.id", "data.title", "project.name", "project.description",
+                  "author", "orcid", "contributors", "subject.research",
+                  "data.origin", "donor", "date.creation", "date.embargo",
+                  "citation", "keywords.agrovoc", "countries", "longitude",
+                  "aez", "years", "crops", "animals",
+                  "date.collect.start", "date.collect.end", "licence", "permission",
+                  "rights", "contact.mail")
+
+  meta_data <- data.frame(field = field,
+                          field_name = field_name,
+                          values = NA,
+                          stringsAsFactors = FALSE)
+  return(meta_data)
+}
 
-# To write data to an excel workbook
-library(openxlsx)
 
+#' Extract file extension
+#'
+#' @param file A file object yielded by `read_raw_data_file()`.
+#' @return A character indicating the file extension.
+extract_file_extension <- function(file) {
+  regmatches(file, regexpr("(?<=\\.)[a-z]+$", file, perl = TRUE))
+}
 
-##### Meta data template ---------------------------------------------------------------------------
 
-field <- c("Data ID", "Official title of the dataset", "Project name",
-           "Description of project", "Author", "Author ID(ORCID)",
-           "Contributor(s)", "Subject matter of research/Vocabulary", "Data origin", "Funder(s) or sponsor(s) of project",
-           "creation date (m/d/yyyy)", "Embargo end date", "Citation",
-           "keywords (AGROVOC)", "Country(ies) covered", "Point longitude coord. in Dec. Degrees",
-           "Agro-Ecological Zone(s)(FAO) covered", "Years covered by data", "Crops covered by data",
-           "Animals covered by data", "Start date of data collection", "End date of data collection",
-           "License (default=CC-BY)", "Permission given by email", "Rights", "Contact email")
+#' Make a list of raw data files in projects
+#'
+#' @param path A character string indicating the relative path
+#' to the folder where raw data is stored.
+#' By default raw data is assumed to be in "data/raw/".
+#' @return A list of data.frame. Each data.frame as two columns: path and extension.
+#' Respectively holding the relative path to the file and the file extension.
+#' Later this list of data.frame is referred to as a *file object*.
+list_raw_data_files <- function(path = "data/raw/") {
+  paths <- list.files(path, full.names = TRUE)
+  extensions <- vapply(paths, extract_file_extension,
+                       FUN.VALUE = character(1), USE.NAMES = FALSE)
+  raw_data_files <- data.frame(path = paths,
+                               extension = extensions)
+  raw_data_files <- split(raw_data_files, 1:nrow(raw_data_files))
+  return(raw_data_files)
+}
 
-field_name <- c("data.id", "data.title", "project.name", "project.description",
-                "author", "orcid", "contributors", "subject.research",
-                "data.origin", "donor", "date.creation", "date.embargo",
-                "citation", "keywords.agrovoc", "countries", "longitude",
-                "aez", "years", "crops", "animals",
-                "date.collect.start", "date.collect.end", "licence", "permission",
-                "rights", "contact.mail")
 
-meta_data <- data.frame(field = field,
-                        field_name = field_name,
-                        values = NA,
-                        stringsAsFactors = FALSE)
+#' Extract file name
+#' file name <=> string before file extension
+#' @param file A file object yielded by read_raw_data_file().
+#' @param supported_extensions A character vector of supported extensions.
+#' @return A character corresponding to the string before file extension.
+extract_file_name <- function(file, supported_extensions = c("csv", "xlsx")) {
+  extension_pattern <- paste0(supported_extensions, collapse = "|")
+  pattern <- sprintf("\\.(%s)$", extension_pattern)
+  gsub(pattern, "", basename(file$path))
+}
 
 
-##### Raw file processing --------------------------------------------------------------------------
+#' Given a file extract the data sheets
+#'
+#' @param file A file object yielded by read_raw_data_file().
+#' @return Depending on the file$extension:
+#' * if file$extension is "csv", a character string "data"
+#' * if file$extension is "xlsx", a character vector with all sheet names.
+extract_data_sheets <- function(file) {
+  switch(file$extension,
+         "csv" = "data",
+         "xlsx" = openxlsx::getSheetNames(file$path)
+  )
+}
 
-## Make a list of all files in ./data/raw
-raw_data_files <- list.files("./data/raw", full.names = TRUE)
 
-### Iterate through the file list
-for (file in raw_data_files) {
+#' Instantiate a workbook object
+#'
+#' This is the most important function of the script. It defines the one major
+#' object which is passed from function to function and encapsulate all required data.
+#' The workbook object if the R skeleton of the future excel workbook that will contain
+#' data, meta data and variable definitions.
+#'
+#' @param file A file object yielded by read_raw_data_file().
+#' @param output_path A character string. Path where final workbook will written.
+#' Default to "data/processed/"
+#'
+#' @return A list later referred to as *workbook object*, containing the
+#' following fields:
+#' * original_name [character]: name of the original file without extension.
+#' * extension [character]: original file extension.
+#' * path [charcater]: entire path including file name where workbook should be saved.
+#' * workbook [openxlsx Workbook Object]
+#' * variables [named list]: holding 3 fields: name, unit and sheet used to construct
+#'  variable definitions.
+#' * sheets [character vector]: name of the workbook sheets to be processed.
+instantiate_workbook <- function(file, output_path = "data/processed/") {
+
+  file_name <- extract_file_name(file)
+  workbook_name <- paste0(file_name, "_metadata")
+  full_name <- paste0(workbook_name, ".xlsx")
+
+  variables <- list(name =  vector("character"),
+                    unit = vector("character"),
+                    sheet = vector("character"))
+
+  workbook <- list(original_name = file_name,
+                  extension = file$extension,
+                  path = file.path(output_path, full_name),
+                  wb = createWorkbook(workbook_name),
+                  variables = variables,
+                  sheets =  extract_data_sheets(file))
+  return(workbook)
+}
 
-  print(sprintf("Processing: %s", file))
-
-  ## Extract the file extension
-  file_extension <- regmatches(file, regexpr("(?<=\\.)[a-z]+$", file, perl = TRUE))
-
-  ## Extract file name and eventually worksheet names
-  if (file_extension == "csv") {
-    file_name <- gsub("\\.csv$", "", basename(file))
-    sheets <- "data"
-  }  else if (file_extension == "xlsx") {
-    file_name <- gsub("\\.xlsx$", "", basename(file))
-    sheets <- getSheetNames(file)
-  } else {
-    warning(sprintf("File extension not supported: %s, file skipped.", file_extension))
-    next
-  }
 
-  ## Create the final name of the workbook which will hold the file data and metadata
-  meta_wb_name <- paste0(file_name, "_metadata")
-  # ... and a workbook template
-  meta_wb <- createWorkbook(meta_wb_name)
-
-  ## Create variables to hold relevant values to extract from the file
-  variable_col <- c() # variable names
-  unit_col <- c() # units
-  sheet_col <- c() # worksheet name
-
-  ### Iterate through the worksheets
-  for (sheet in sheets) {
-
-    ## Properly read in the file depending on its extension.
-    # The first line is skipped. Otherwise the special place holder '$'
-    # separating variable name and unit might get coerced to a '.'
-    if (file_extension == "csv") {
-      #dat <- read.csv(file, header = FALSE, fileEncoding = "UTF-8-BOM")
-      dat <- read.csv(file, skip = 1)
-    }
-    if (file_extension == "xlsx") {
-      dat <- read.xlsx(file, sheet = sheet, colNames = FALSE)
-    }
-    if (is.null(dat)) next
-
-    ## From the first row of the file, extract...
-    header <- strsplit(as.character(dat[1, ]), "\\$")
-    variables <- sapply(header, `[`, 1) # variable names
-    units <- sapply(header, `[`, 2) # units
-
-    ## Assign variable names as column names
-    names(dat) <- variables
-    # and delete the first row
-    dat <- dat[-1, ]
-
-    ## When several sheets are present in your workbook
-    # append the relevant values one after the other
-    variable_col <- c(variable_col, variables)
-    unit_col <- c(unit_col, units)
-    sheet_col <- c(sheet_col, rep(sheet, length(variables)))
-
-    ## Add data sheet to the workbook template
-    addWorksheet(meta_wb, sheet)
-    # and write the data
-    writeData(meta_wb, sheet, dat)
+#' Read data from a given sheet
+#' @param file A file object yielded by read_raw_data_file().
+#' @param sheet A character indicating sheet name.
+#' @return A data.frame
+read_data_sheet <- function(file, sheet) {
+  ## Properly read in the file depending on its extension.
+  # The first line is skipped. Otherwise the special place holder '$'
+  # separating variable name and unit might get coerced to a '.'
+  switch(file$extension,
+         "csv" = read.csv(file$path, header = FALSE, fileEncoding = "UTF-8-BOM"),
+         "xlsx" = read.xlsx(file$path, sheet = sheet, colNames = FALSE)
+  )
+}
 
-  }
 
-  ## Create the variable definition data.frame for that file
-  var_definitions <- data.frame("workbook" = file_name,
-                                "sheet" = sheet_col,
-                                "variable" = variable_col,
-                                "unit" = unit_col,
+#' Process a given data sheet
+#'
+#' @param file A file object yielded by read_raw_data_file().
+#' @param sheet A character indicating sheet name.
+#'
+#' @return A list later referred to as a *processed sheet object* containing
+#' the following fields:
+#' * name [character]: sheet name.
+#' * variables [character vector]: variable names.
+#' * units [character vector]: variables units.
+#' * data [data.frame]: holding sheet data.
+process_sheet <- function(file, sheet) {
+
+  dat <- read_data_sheet(file, sheet)
+
+  ## From the first row of the file, extract...
+  header <- strsplit(as.character(dat[1, ]), "\\$")
+  variables <- sapply(header, `[`, 1) # variable names
+  units <- sapply(header, `[`, 2) # units
+
+  ## Assign variable names as column names
+  names(dat) <- variables
+  # and delete the first row
+  dat <- dat[-1, ]
+
+  processed_sheet <- list(
+    name = sheet,
+    variables = variables,
+    units = units,
+    data = dat
+  )
+  return(processed_sheet)
+}
+
+
+#' Update workbook object with a given processed sheet
+#' @param workbook A work book object.
+#' @param processed_sheet A processed sheet object.
+#' @return A workbook object updated with data from `processed_sheet`.
+update_workbook_sheet_data <- function(workbook, processed_sheet) {
+  ## When several sheets are present in your workbook
+  # append the relevant values one after the other
+  workbook$variables$name <- c(workbook$variables$name, processed_sheet$variables)
+  workbook$variables$unit <- c(workbook$variables$unit, processed_sheet$units)
+  sheet_vector <- rep(processed_sheet$name, length(processed_sheet$variables))
+  workbook$variables$sheet <- c(workbook$variables$sheet, sheet_vector)
+
+  addWorksheet(workbook$wb, processed_sheet$name)
+  writeData(workbook$wb, processed_sheet$name, processed_sheet$data)
+
+  return(workbook)
+}
+
+
+#' Create variable definition data.frame
+#' @param workbook A work book object.
+#' @return A data.frame holding variable and units to be updated manually later on.
+create_variable_definition_df <- function(workbook) {
+  var_definitions <- data.frame("workbook" = workbook$original_name,
+                                "sheet" = workbook$variables$sheet,
+                                "variable" = workbook$variables$name,
+                                "unit" = workbook$variables$unit,
                                 "definition" = NA,
                                 "unique identifier" = 0,
                                 "personal information" = 0,
                                 stringsAsFactors = FALSE)
+  return(var_definitions)
+}
 
-  ## Create the full name (relative path included)
-  # of the final file holding both data and metadata
-  file_meta_data <- file.path("./data/processed/", paste0(meta_wb_name, ".xlsx"))
+#' Update existing metadata and variable definition.
+#'
+#' In practice this function is called only if the final workbook already exists.
+#' Its main goal is to prevent the necessary manual updates in metadata and variable
+#' definition sheet from being discarded when the script is run again.
+#'
+#' @param workbook A work book object.
+#' @param meta_data A data.frame containing metadata template.
+#' @param var_definitions A data.frame containing variable definitions template.
+#'
+#' @return A list with the following fields:
+#'  * meta_data [data.frame]
+#'  * var_definitions [data.frame]
+update_meta_data <- function(workbook, meta_data, var_definitions) {
+
+  current_var_definitions <- read.xlsx(workbook$path,
+                                       sheet = "variable definitions")
+  current_meta_data <- read.xlsx(workbook$path,
+                                 sheet = "meta data")
+
+  # Update current meta_data and var_definitions with the potential new values
+  var_definitions <- merge(current_var_definitions, var_definitions, all.x = TRUE)
+  meta_data <- merge(current_meta_data, meta_data, all.x = TRUE)
+
+  # Unfortunately merge() does not preserve the original order
+  # We need to set that back manually
+  order_var_definitions <- match(current_var_definitions$variable,
+                                 var_definitions$variable)
+  var_definitions <- var_definitions[order_var_definitions, ]
+  order_meta_data <- match(current_meta_data$field, meta_data$field)
+  meta_data <- meta_data[order_meta_data, ]
+
+  updated_meta_data <- list(var_definitions = var_definitions,
+                            meta_data = meta_data)
+  return(updated_meta_data)
+}
 
-  ## Check if the final file already exist
-  if (file.exists(file_meta_data)) {
+#' Save workbook as a xlsx file
+#'
+#' Workbook is saved at the path specified in `workbook$path`.
+#' By default "data/processed/". To modify the location where the workbook
+#' is saved, you must modified the `output_path` argument when instantiating
+#' the workbook with `instantiate_workbook()`.
+#'
+#' @param workbook A work book object.
+#' @param meta_data A data.frame containing metadata template.
+#' @param var_definitions A data.frame containing variable definitions template.
+save_excel_file <- function(workbook, meta_data, var_definitions) {
+
+  addWorksheet(workbook$wb, "meta data")
+  addWorksheet(workbook$wb, "variable definitions")
+  writeData(workbook$wb, "meta data", meta_data)
+  writeData(workbook$wb, "variable definitions", var_definitions)
+
+  saveWorkbook(wb = workbook$wb,
+               file = workbook$path,
+               overwrite = TRUE)
+}
 
-    ## If it does...
 
-    # Read in the current metadata and variable sheets
-    current_var_definitions <- read.xlsx(file_meta_data, sheet = "variable definitions")
-    current_meta_data <- read.xlsx(file_meta_data, sheet = "meta data")
 
-    # Update them with the potential new values
-    var_definitions <- merge(current_var_definitions, var_definitions, all.x = TRUE)
-    meta_data <- merge(current_meta_data, meta_data, all.x = TRUE)
+### Raw file processing -------------------------------------------------------- #
 
-    # Unfortunately merge() does not preserve the original order
-    # We need to set that back manually
-    var_definitions <- var_definitions[match(variable_col, var_definitions$variable), ]
-    meta_data <- meta_data[match(field, meta_data$field), ]
+raw_data_files <- list_raw_data_files()
 
+for (file in raw_data_files) {
+
+  print(sprintf("Processing: %s", file$path))
+
+  if (!(file$extension %in% c("csv", "xlsx"))) {
+    warning(sprintf("File extension not supported: %s, file skipped.", file$extension))
   }
 
+  workbook <- instantiate_workbook(file)
 
-  ## Add meta data and variable definition sheets to the workbook template
-  addWorksheet(meta_wb, "meta data")
-  addWorksheet(meta_wb, "variable definitions")
-  writeData(meta_wb, "meta data", meta_data)
-  writeData(meta_wb, "variable definitions", var_definitions)
+  for (sheet in workbook$sheets) {
+    processed_sheet <- process_sheet(file, sheet)
+    workbook <- update_workbook_sheet_data(workbook, processed_sheet)
+  }
 
-  ## Finally save the newly created workbook
-  saveWorkbook(wb = meta_wb,
-               file = file_meta_data,
-               overwrite = TRUE)
+  var_definitions <- create_variable_definition_df(workbook)
+  meta_data <- instantiate_meta_data_template()
+
+  if (file.exists(workbook$path)) {
+    updated_meta_data <- update_meta_data(workbook, meta_data, var_definitions)
+    meta_data <- updated_meta_data$meta_data
+    var_definitions <- updated_meta_data$var_definitions
+  }
 
+  save_excel_file(workbook = workbook,
+                  meta_data = meta_data,
+                  var_definitions = var_definitions)
 }
 
 
-- 
GitLab