Commit fdc5eaa7 authored by bob
Browse files

Small syntactic improvements

parent 711180cb
......@@ -6,7 +6,7 @@
required_packages <- c("ggplot2", "openxlsx", "emmeans")
for (package in required_packages) {
if(package %in% rownames(installed.packages()) == FALSE){
if (package %in% rownames(installed.packages()) == FALSE) {
install.packages(package)
}
}
......@@ -16,5 +16,5 @@ for (package in required_packages) {
all_scripts <- list.files("./scripts", full.names = TRUE)
target_scripts <- all_scripts[grep("[1-9]", all_scripts)]
for(script in target_scripts) source(script, local = TRUE)
for (script in target_scripts) source(script, local = TRUE)
......@@ -11,27 +11,27 @@ library(openxlsx)
##### Meta data template ---------------------------------------------------------------------------
field <- c("Data ID", "Official title of the dataset", "Project name",
"Description of project", "Author", "Author ID(ORCID)",
"Contributor(s)", "Subject matter of research/Vocabulary", "Data origin", "Funder(s) or sponsor(s) of project",
"creation date (m/d/yyyy)", "Embargo end date", "Citation",
"keywords (AGROVOC)", "Country(ies) covered", "Point longitude coord. in Dec. Degrees",
field <- c("Data ID", "Official title of the dataset", "Project name",
"Description of project", "Author", "Author ID(ORCID)",
"Contributor(s)", "Subject matter of research/Vocabulary", "Data origin", "Funder(s) or sponsor(s) of project",
"creation date (m/d/yyyy)", "Embargo end date", "Citation",
"keywords (AGROVOC)", "Country(ies) covered", "Point longitude coord. in Dec. Degrees",
"Agro-Ecological Zone(s)(FAO) covered", "Years covered by data", "Crops covered by data",
"Animals covered by data", "Start date of data collection", "End date of data collection",
"Animals covered by data", "Start date of data collection", "End date of data collection",
"License (default=CC-BY)", "Permission given by email", "Rights", "Contact email")
field_name <- c("data.id", "data.title", "project.name", "project.description",
"author", "orcid", "contributors", "subject.research",
"data.origin", "donor", "date.creation", "date.embargo",
"citation", "keywords.agrovoc", "countries", "longitude",
"aez", "years", "crops", "animals",
"date.collect.start", "date.collect.end", "licence", "permission",
field_name <- c("data.id", "data.title", "project.name", "project.description",
"author", "orcid", "contributors", "subject.research",
"data.origin", "donor", "date.creation", "date.embargo",
"citation", "keywords.agrovoc", "countries", "longitude",
"aez", "years", "crops", "animals",
"date.collect.start", "date.collect.end", "licence", "permission",
"rights", "contact.mail")
meta_data <- data.frame(field = field,
field_name = field_name,
values = NA,
stringsAsFactors = FALSE)
values = NA,
stringsAsFactors = FALSE)
##### Raw file processing --------------------------------------------------------------------------
......@@ -41,117 +41,117 @@ raw_data_files <- list.files("./data/raw", full.names = TRUE)
### Iterate through the file list
for (file in raw_data_files) {
print(sprintf("Processing: %s", file))
## Extract the file extension
file_extension <- regmatches(file, regexpr("(?<=\\.)[a-z]+$", file, perl = TRUE))
## Extract the file name and, where applicable, the worksheet names
if(file_extension == "csv") {
if (file_extension == "csv") {
file_name <- gsub("\\.csv$", "", basename(file))
sheets <- "data"
} else if(file_extension == "xlsx") {
} else if (file_extension == "xlsx") {
file_name <- gsub("\\.xlsx$", "", basename(file))
sheets <- getSheetNames(file)
} else {
warning(sprintf("File extension not supported: %s, file skipped.", file_extension))
next
}
## Create the final name of the workbook which will hold the file data and metadata
meta_wb_name <- paste0(file_name, "_metadata")
# ... and a workbook template
meta_wb <- createWorkbook(meta_wb_name)
## Create variables to hold relevant values to extract from the file
variable_col <- c() # variable names
unit_col <- c() # units
sheet_col <- c() # worksheet name
### Iterate through the worksheets
for (sheet in sheets) {
## Properly read in the file depending on its extension.
# The first line is read as data, not as a header. Otherwise the special placeholder '$'
# The first line is read as data, not as a header. Otherwise the special placeholder '$'
# separating variable name and unit might get coerced to a '.'
if(file_extension == "csv") {
if (file_extension == "csv") {
dat <- read.csv(file, header = FALSE, fileEncoding = "UTF-8-BOM")
}
if(file_extension == "xlsx") {
}
if (file_extension == "xlsx") {
dat <- read.xlsx(file, sheet = sheet, colNames = FALSE)
}
if(is.null(dat)) next
if (is.null(dat)) next
## From the first row of the file, extract...
header <- strsplit(as.character(dat[1, ]), "\\$")
variables <- sapply(header, `[`, 1) # variable names
units <- sapply(header, `[`, 2) # units
## Assign variable names as column names
names(dat) <- variables
# and delete the first row
dat <- dat[-1, ]
## When several sheets are present in your workbook
# append the relevant values one after the other
variable_col <- c(variable_col, variables)
unit_col <- c(unit_col, units)
sheet_col <- c(sheet_col, rep(sheet, length(variables)))
## Add data sheet to the workbook template
addWorksheet(meta_wb, sheet)
# and write the data
# and write the data
writeData(meta_wb, sheet, dat)
}
## Create the variable definition data.frame for that file
var_definitions <- data.frame("workbook" = file_name,
"sheet" = sheet_col,
"variable"= variable_col,
"unit"= unit_col,
"variable" = variable_col,
"unit" = unit_col,
"definition" = NA,
"unique identifier" = 0,
"personal information" = 0,
"personal information" = 0,
stringsAsFactors = FALSE)
## Create the full name (relative path included)
## Create the full name (relative path included)
# of the final file holding both data and metadata
file_meta_data <- file.path("./data/processed/", paste0(meta_wb_name, ".xlsx"))
## Check if the final file already exists
if(file.exists(file_meta_data)) {
if (file.exists(file_meta_data)) {
## If it does...
# Read in the current metadata and variable sheets
current_var_definitions <- read.xlsx(file_meta_data, sheet = "variable definitions")
current_meta_data <- read.xlsx(file_meta_data, sheet = "meta data")
# Update them with the potential new values
var_definitions <- merge(current_var_definitions, var_definitions, all.x = TRUE)
meta_data <- merge(current_meta_data, meta_data, all.x = TRUE)
# Unfortunately merge() does not preserve the original order
# We need to set that back manually
var_definitions <- var_definitions[match(variable_col, var_definitions$variable), ]
meta_data <- meta_data[match(field, meta_data$field), ]
}
## Add meta data and variable definition sheets to the workbook template
addWorksheet(meta_wb, "meta data")
addWorksheet(meta_wb, "variable definitions")
writeData(meta_wb, "meta data", meta_data)
writeData(meta_wb, "variable definitions", var_definitions)
## Finally save the newly created workbook
saveWorkbook(wb = meta_wb,
file = file_meta_data,
file = file_meta_data,
overwrite = TRUE)
}
......
......@@ -12,9 +12,10 @@ output_sheet_name <- "data"
##### Load data ------------------------------------------------------------------------------------
# Raw (initial processed) data set
d <- read.xlsx("./data/processed/Meststof proef WUR_metadata.xlsx", sheet = "data")
metadata <- read.xlsx("./data/processed/Meststof proef WUR_metadata.xlsx", sheet = "meta data")
variable_definition <- read.xlsx("./data/processed/Meststof proef WUR_metadata.xlsx", sheet = "variable definitions")
path_data <- "data/processed/Meststof proef WUR_metadata.xlsx"
d <- read.xlsx(path_data, sheet = "data")
metadata <- read.xlsx(path_data, sheet = "meta data")
variable_definition <- read.xlsx(path_data, sheet = "variable definitions")
##### Clean data -----------------------------------------------------------------------------------
......@@ -51,7 +52,7 @@ wb <- createWorkbook()
sheet_names <- c("data","meta data","variable definitions")
sheet_data <- list(d, metadata, variable_definition)
# Add the sheet to the workbook template
# Add the sheet to the workbook template
lapply(sheet_names, function(x) addWorksheet(wb, sheetName = x))
# Write data to the proper sheets
mapply(function(name, data) writeData(wb, name, data), sheet_names, sheet_data)
......
......@@ -14,12 +14,12 @@ d <- read.xlsx("./data/processed/fertilizer_trial_WUR_cleaned.xlsx",
##### Analysis -------------------------------------------------------------------------------------
## Simple summary graph
summary_plot <- ggplot(d)+ # initialize graph
aes(x = fertilizer, y = yield, colour = farm)+ # define relation between graph properties and variables
geom_point(na.rm = TRUE)+ # define plot type, here scatter plot
facet_wrap(farm ~ .)+ # allow faceting => each farm corresponds to one pane
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))+ # rotate axis labels
ylim(0, 15)+ # set y axis limits
summary_plot <- ggplot(d) + # initialize graph
aes(x = fertilizer, y = yield, colour = farm) + # define relation between graph properties and variables
geom_point(na.rm = TRUE) + # define plot type, here scatter plot
facet_wrap(farm ~ .) + # allow faceting => each farm corresponds to one pane
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + # rotate axis labels
ylim(0, 15) + # set y axis limits
ggtitle("Summary plot") # add title
# Display the summary plot
......@@ -56,15 +56,15 @@ sink()
emms_df <- as.data.frame(emms)
emms_plot <- ggplot(d)+
aes(x = fertilizer, y = yield, colour = farm)+ # raw data
emms_plot <- ggplot(d) +
aes(x = fertilizer, y = yield, colour = farm) + # raw data
geom_point(na.rm = TRUE,
position = position_dodge(width = 0.2),
alpha = 0.2)+
alpha = 0.2) +
geom_pointrange(data = emms_df, # Estimated Marginal Means
aes(y = emmean, ymin = lower.CL, ymax = upper.CL),
position = position_dodge(width = 0.2))+
ylim(0, 15)+
position = position_dodge(width = 0.2)) +
ylim(0, 15) +
ggtitle("Estimated Marginal Means")
# Display emms plots
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment