Skip to content
Snippets Groups Projects
Commit 3d1d986c authored by Helfenstein, Anatol's avatar Helfenstein, Anatol :speech_balloon:
Browse files

exploratory analysis files of pH and SOM modelling data; not fully...

exploratory analysis files of pH and SOM modelling data; not fully reproducible at the moment because would require SOM field estimates and independent LSK and CCNL datasets; just to give idea of ways to explore data distribution before beginning model training
parent 53b3aaca
Branches
No related tags found
No related merge requests found
---
title: "Exploratory Analysis of Modelling Data"
subtitle: "SOM [%]"
author: "Anatol Helfenstein"
date: "`r Sys.Date()`"
output:
html_document:
toc: yes
toc_float: yes
'': default
---
```{r setup, include = FALSE}
knitr::opts_chunk$set(echo = FALSE)
options(width = 100) # sets width of R code output (not images)
```
```{r load required pkgs and data, include = FALSE}
# load packages
pkgs <- c("tidyverse", "raster", "rgdal", "sf", "rasterVis", "viridis",
"foreach", "ggspatial", "cowplot", "cmocean", "scales")
lapply(pkgs, library, character.only = TRUE)
# 1) Specify DSM target soil property:
TARGET = "SOM_per"
TARGET_EXP = "SOM [%]"
# 2) Specify observation quality & dynamic or static model
OBS_QUAL = "" # lab measurements & field estimates: ""; lab meas. only: "_lab"
TIME = "_dyn" # calibration 3D+T model using 2D+T & 3D+T dynamic covariates
TIME_DIR = "dynamic" # either "static" or "dynamic"; directory to save plots
# read in regression matrix of TARGET soil property
tbl_regmat_target <- read_rds(paste0("out/data/model/", TARGET, "/", TIME_DIR,
"/tbl_regmat_", TARGET, OBS_QUAL, TIME, ".Rds"))
# remane "train" and "test" to match other plots in paper and specify number
tbl_regmat_target <- tbl_regmat_target %>%
mutate(split = case_when(split %in% "train" ~ "Calibration",
split %in% "test" ~ "Validation"))
# set order of datasets so we always show calibration first, then validation
dataset_order <- c("Calibration", "Validation")
tbl_regmat_target$split <- factor(x = tbl_regmat_target$split,
levels = dataset_order)
# convert to sf
sf_regmat_target <- tbl_regmat_target %>%
st_as_sf(., coords = c("X", "Y"), crs = "EPSG:28992")
# 3) Set plotting axis min, max, range and breaks
XY_MIN_CAL = 0
XY_MAX_CAL = 100
XY_RANGE_CAL = diff(range(XY_MIN_CAL, XY_MAX_CAL))
XY_BREAKS_CAL = unique(round(seq(XY_MIN_CAL, XY_MAX_CAL, XY_RANGE_CAL/10)))
# validation/test data
XY_MIN_VAL = 0
XY_MAX_VAL = 100
XY_RANGE_VAL = diff(range(XY_MIN_VAL, XY_MAX_VAL))
XY_BREAKS_VAL = unique(round(seq(XY_MIN_VAL, XY_MAX_VAL, XY_RANGE_VAL/10)))
# 4) Read in NL border shapefile for mapping
sf_NL_borders <- st_read("data/other/NL_borders.shp")
```
## Maps
### Spatial density maps
```{r map spatial density, echo = FALSE, warning = FALSE, message = FALSE}
# number of locations (calibration and validation)
n_sites = as.character(as.expression(paste0(
"italic(n) == ",
tbl_regmat_target %>%
group_by(X,Y) %>%
tally() %>%
nrow())))
# plot spatial density plot of all observations (calibration and validation)
(m_target_density <- ggplot() +
theme_bw() +
geom_sf(data = sf_NL_borders, fill = "white") +
geom_hex(data = tbl_regmat_target,
aes(x = X, y = Y), binwidth = 3000) + # 3km binwidth
scale_fill_viridis(option = "inferno", direction = -1) +
geom_text(aes(x = Inf, y = -Inf, label = n_sites),
size = 3, hjust = 6, vjust = -32, parse = TRUE) +
theme(legend.position = c(0.1, 0.8),
plot.title = element_text(hjust = 0.5, vjust = 0.5, size = 10),
panel.border = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_line(colour = "transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
labs(fill = "Number of observations") +
annotation_scale(location = "bl", width_hint = 0.5) +
annotation_north_arrow(location = "bl", which_north = "true",
pad_x = unit(0.05, "in"), pad_y = unit(0.25, "in"),
style = north_arrow_fancy_orienteering,
height = unit(1, "cm"), width = unit(1, "cm")))
# save to disk
# ggsave(filename = paste0("m_", TARGET, "_spatial_density.pdf"),
# plot = m_target_density,
# path = "out/figs/explorative/SOM_per",
# width = 5, height = 5)
# number of locations with lab measurements (calibration and validation)
n_sites_lab = as.character(as.expression(paste0(
"italic(n) == ",
tbl_regmat_target %>%
filter(BIS_type %in% "lab") %>%
group_by(X,Y) %>%
tally() %>%
nrow())))
# plot spatial density plot of all lab measurements (calibration and validation)
(m_target_density_lab <- ggplot() +
theme_bw() +
geom_sf(data = sf_NL_borders, fill = "white") +
geom_hex(data = filter(tbl_regmat_target, BIS_type %in% "lab"),
aes(x = X, y = Y), binwidth = 3000) +
scale_fill_viridis(option = "inferno", direction = -1) +
geom_text(aes(x = Inf, y = -Inf, label = n_sites_lab),
size = 3, hjust = 8, vjust = -30, parse = TRUE) +
theme(legend.position = c(0.1, 0.8),
panel.border = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_line(colour = "transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
labs(fill = "Number of lab \n measurements") +
annotation_scale(location = "bl", width_hint = 0.5) +
annotation_north_arrow(location = "bl", which_north = "true",
pad_x = unit(0.05, "in"), pad_y = unit(0.25, "in"),
style = north_arrow_fancy_orienteering,
height = unit(1, "cm"), width = unit(1, "cm")))
# save to disk
# ggsave(filename = paste0("m_", TARGET, "_spatial_density_lab.pdf"),
# plot = m_target_density_lab,
# path = "out/figs/explorative/SOM_per",
# width = 5, height = 5)
```
### Spatial point maps
```{r map spatial points cal val, echo = FALSE, warning = FALSE, message = FALSE}
# number of locations
n_sites = as.character(as.expression(paste0(
"italic(n) == ", tbl_regmat_target %>%
group_by(X,Y) %>%
slice(1L) %>%
nrow()
)))
# convert to sf
sf_regmat_target_locations <- tbl_regmat_target %>%
# also group by type to not remove lab measurement locations where there are field
# estimates of horizons for which there are no lab measurements (see script 31 -
# removal of field estimates...)
group_by(X, Y, BIS_type) %>%
slice(1L) %>%
st_as_sf(., coords = c("X", "Y"), crs = "EPSG:28992") %>%
# change names for plot labels
mutate(BIS_type = case_when(BIS_type %in% "field" ~ "Field estimate",
BIS_type %in% "lab" ~ "Lab measurement"))
# plot all locations
(m_target <- ggplot() +
theme_bw() +
geom_sf(data = sf_NL_borders, fill = "white") +
geom_sf(data = sf_regmat_target_locations,
aes(color = BIS_type), alpha = 0.5, shape = 19, size = 0.25) +
scale_color_manual(values = c("#1b9e77", "#d95f02")) +
geom_text(aes(x = Inf, y = -Inf, label = n_sites),
size = 3, hjust = 5, vjust = -25, parse = TRUE) +
labs(col = "SOM observations") +
guides(color = guide_legend(override.aes = list(alpha = 1, size = 3))) +
theme(legend.position = c(0.1, 0.8),
plot.title = element_text(hjust = 0.5, vjust = 0.5, size = 10),
panel.border = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_line(colour = "transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
annotation_scale(location = "bl", width_hint = 0.5) +
annotation_north_arrow(location = "bl", which_north = "true",
pad_x = unit(0.05, "in"), pad_y = unit(0.25, "in"),
style = north_arrow_fancy_orienteering,
height = unit(1, "cm"), width = unit(1, "cm")))
# save to disk
# ggsave(filename = paste0("m_", TARGET, "_spatial_point.pdf"),
# plot = m_target,
# path = "out/figs/explorative/SOM_per",
# width = 5, height = 5)
# number of calibration locations
n_sites_cal = as.character(as.expression(paste0(
"italic(n) == ", tbl_regmat_target %>%
filter(split %in% "Calibration") %>%
group_by(X,Y) %>%
slice(1L) %>%
nrow()
)))
# plot calibration locations
(m_target_cal <- ggplot() +
theme_bw() +
geom_sf(data = sf_NL_borders, fill = "white") +
geom_sf(data = sf_regmat_target_locations %>%
filter(split %in% "Calibration"),
aes(color = BIS_type), alpha = 0.5, shape = 19, size = 0.25) +
scale_color_manual(values = c("#1b9e77", "#d95f02")) +
geom_text(aes(x = Inf, y = -Inf, label = n_sites_cal),
size = 3, hjust = 6, vjust = -32, parse = TRUE) +
# ggtitle("Calibration") +
theme(legend.position = c(0.1, 0.8),
plot.title = element_text(hjust = 0.5, vjust = 0.5, size = 10),
panel.border = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_line(colour = "transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
labs(col = "Calibration locations") +
annotation_scale(location = "bl", width_hint = 0.5) +
annotation_north_arrow(location = "bl", which_north = "true",
pad_x = unit(0.05, "in"), pad_y = unit(0.25, "in"),
style = north_arrow_fancy_orienteering,
height = unit(1, "cm"), width = unit(1, "cm")))
# save to disk
# ggsave(filename = paste0("m_", TARGET, "_spatial_point_cal.pdf"),
# plot = m_target_cal,
# path = "out/figs/explorative/SOM_per",
# width = 5, height = 5)
# number of lab calibration locations (so excluding field estimates)
n_sites_cal_lab = as.character(as.expression(paste0(
"italic(n) == ",
tbl_regmat_target %>%
filter(split %in% "Calibration") %>%
filter(BIS_type %in% "lab") %>%
group_by(X,Y) %>%
tally() %>%
nrow())))
# plot lab calibration locations (so excluding field estimates)
(m_target_cal_lab <- ggplot() +
theme_bw() +
geom_sf(data = sf_NL_borders, fill = "white") +
geom_sf(data = sf_regmat_target %>%
filter(split %in% "Calibration") %>%
filter(BIS_type %in% "lab"),
color = "#d95f02", alpha = 0.5, shape = 19, size = 0.25) +
geom_text(aes(x = Inf, y = -Inf, label = n_sites_cal_lab),
size = 3, hjust = 7, vjust = -32, parse = TRUE) +
ggtitle("Calibration locations (lab)") +
theme(legend.position = c(0.1, 0.8),
plot.title = element_text(hjust = 0.5, vjust = 0.5, size = 10),
panel.border = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_line(colour = "transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
annotation_scale(location = "bl", width_hint = 0.5) +
annotation_north_arrow(location = "bl", which_north = "true",
pad_x = unit(0.05, "in"), pad_y = unit(0.25, "in"),
style = north_arrow_fancy_orienteering,
height = unit(1, "cm"), width = unit(1, "cm")))
# save to disk
# ggsave(filename = paste0("m_", TARGET, "_spatial_point_cal_lab.pdf"),
# plot = m_target_cal_lab,
# path = "out/figs/explorative/SOM_per",
# width = 5, height = 5)
# number of validation locations
n_sites_val = as.character(as.expression(paste0(
"italic(n) == ",
tbl_regmat_target %>%
filter(split %in% "Validation") %>%
filter(BIS_type %in% "lab") %>%
group_by(X,Y) %>%
tally() %>%
nrow())))
# plot validation locations (only want to validate with lab measurements)
(m_target_val <- ggplot() +
theme_bw() +
geom_sf(data = sf_NL_borders, fill = "white") +
geom_sf(data = sf_regmat_target %>%
filter(split %in% "Validation") %>%
filter(BIS_type %in% "lab"),
color = "#d95f02", alpha = 0.5, shape = 25, size = 0.5) +
geom_text(aes(x = Inf, y = -Inf, label = n_sites_val),
size = 3, hjust = 7, vjust = -32, parse = TRUE) +
ggtitle("Validation locations") +
theme(legend.position = c(0.1, 0.8),
plot.title = element_text(hjust = 0.5, vjust = 0.5, size = 10),
panel.border = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_line(colour = "transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()))
# save to disk
# ggsave(filename = paste0("m_", TARGET, "_spatial_point_val.pdf"),
# plot = m_target_val,
# path = "out/figs/explorative/SOM_per",
# width = 5, height = 5)
```
## Histograms
```{r plot cal and val data histograms, echo = FALSE, warning = FALSE, message = FALSE}
# number of calibration samples
n_cal_samples = as.character(as.expression(paste0(
"italic(n) == ",
tbl_regmat_target %>%
filter(split %in% "Calibration") %>%
nrow())))
# number of validation samples
n_val_samples = as.character(as.expression(paste0(
"italic(n) == ",
tbl_regmat_target %>%
filter(split %in% "Validation") %>%
nrow())))
tbl_n_samples <- tibble(split = c("Calibration", "Validation"),
n_samples = c(n_cal_samples, n_val_samples))
# histogram of all data split by dataset (cal vs. val)
(p_hist_cal_val <- tbl_regmat_target %>%
ggplot(aes(SOM_per, color = split)) +
geom_histogram(binwidth = 0.1, fill = "white") +
scale_y_continuous() +
scale_x_continuous(breaks = XY_BREAKS_CAL,
limits = c(XY_MIN_CAL - 0.01 * XY_RANGE_CAL,
XY_MAX_CAL + 0.01 * XY_RANGE_CAL)) +
labs(x = TARGET_EXP, y = "Count") +
facet_wrap(~ split) +
scale_color_manual(values = c("black", "blue")) +
labs(col = "Split") +
geom_text(data = tbl_n_samples,
aes(x = Inf, y = -Inf, label = n_samples),
size = 3, hjust = c(5.8, 6.6), vjust = c(-13, -13), parse = TRUE) +
theme_bw() +
theme(strip.background = element_blank(),
strip.text = element_blank(),
axis.title.x = element_blank(),
legend.position = "none"))
# histogram of all lab measurements split by dataset (cal vs. val)
(p_hist_cal_val_lab <- tbl_regmat_target %>%
filter(BIS_type == "lab") %>%
ggplot(aes(SOM_per, color = split)) +
geom_histogram(binwidth = 0.1, fill = "white") +
scale_y_continuous() +
scale_x_continuous(breaks = XY_BREAKS_CAL,
limits = c(XY_MIN_CAL - 0.01 * XY_RANGE_CAL,
XY_MAX_CAL + 0.01 * XY_RANGE_CAL)) +
labs(x = TARGET_EXP, y = "Count (lab measurements)") +
facet_wrap(~ split) +
scale_color_manual(values = c("black", "blue")) +
labs(col = "Split") +
theme_bw() +
theme(strip.background = element_blank(),
strip.text = element_blank(),
axis.title.x = element_blank(),
legend.position = "none"))
```
## Boxplots
```{r plot cal and val data boxplots, echo = FALSE, warning = FALSE, message = FALSE}
tbl_regmat_target <- tbl_regmat_target %>%
filter(d_mid < 200) %>%
mutate(d_gsm = cut(d_mid,
breaks = c(0, 5, 15, 30, 60, 100, 200),
labels = c("0-5", "5-15", "15-30", "30-60", "60-100", "100-200"),
right = FALSE))
# counts per split and depth increment
tbl_calval_counts <- tbl_regmat_target %>%
group_by(split, d_gsm) %>%
mutate(count = n()) %>%
distinct(count) %>%
arrange(d_gsm, split) %>%
mutate(n = as.character(as.expression(paste0("italic(n) == ", count))))
# assign levels to depth increments so that they are in reverse order
depth_order <- c("100-200", "60-100", "30-60", "15-30", "5-15", "0-5")
tbl_regmat_target$d_gsm <- factor(x = tbl_regmat_target$d_gsm,
levels = depth_order)
# boxplots split by dataset
(p_boxplot_cal_val <- tbl_regmat_target %>%
ggplot(aes(x = pull(tbl_regmat_target, TARGET), y = d_gsm, color = split)) +
geom_boxplot(outlier.shape = 21) +
scale_color_manual(values = c("black", "blue")) +
facet_wrap(~ split) +
xlab(as.expression(paste(TARGET_EXP))) +
ylab(expression("Depth [cm]")) +
scale_x_continuous(breaks = XY_BREAKS_CAL,
limits = c(XY_MIN_CAL - 0.01 * XY_RANGE_CAL,
XY_MAX_CAL + 0.01 * XY_RANGE_CAL)) +
geom_text(data = tbl_calval_counts,
aes(x = Inf, y = -Inf, label = n),
size = 3,
hjust = rep(1.05, 12),
vjust = c(-13.25, # cal 0-5
-14, # val 0-5
-10.5, # cal 5-15
-11, # val 5-15
-8, # cal 15-30
-8, # val 15-30
-5.5, # cal 30-60
-5.5, # val 30-60
-2.75, # cal 60-100
-2.8, # val 60-100
-0.6, # cal 100-200
-0.6), # val 100-200
parse = TRUE) +
theme_bw() +
theme(strip.background = element_blank(),
strip.text = element_blank(),
legend.position = "none"))
# save to disk
# ggsave(filename = paste0("p_", TARGET, "_boxplots_cal_val_d.pdf"),
# plot = p_boxplot_cal_val,
# path = "out/figs/explorative/SOM_per",
# width = 10, height = 5)
```
```{r combine plots using cowplot, echo = FALSE, warning = FALSE, message = FALSE}
# combine descriptive plots using cowplot
p_target_descriptive <- plot_grid(plot_grid(m_target_cal, m_target_val,
align = "hv", nrow = 1, ncol = 2),
p_hist_cal_val,
p_boxplot_cal_val,
nrow = 3, ncol = 1,
align = "v", axis = "l",
rel_heights = c(0.5, 0.25, 0.25),
# rel_heights = c(3, 1, 1),
rel_widths = c(1.2, 1, 1.2))
# save to disk
# ggsave(filename = paste0("p_", TARGET, "_descriptive.pdf"),
# plot = p_target_descriptive,
# path = "out/figs/explorative/SOM_per",
# width = 8, height = 8)
```
## SOM density plots over depth
```{r density plot over depth, echo = FALSE, warning = FALSE, message = FALSE}
# plot density over depth of all observations (calibration and validation)
(p_target_density_d <- ggplot() +
theme_bw() +
geom_hex(data = tbl_regmat_target %>%
filter(d_mid <= 200),
aes(x = SOM_per, y = d_mid), bins = 50) +
scale_fill_viridis(option = "inferno", direction = -1) +
scale_y_continuous(trans = "reverse") +
xlab("SOM [%]") +
ylab("Depth [cm]") +
labs(fill = "Number of observations"))
# save to disk
# ggsave(filename = paste0("p_", TARGET, "_density_depth.pdf"),
# plot = p_target_density_d,
# path = "out/figs/explorative/SOM_per",
# width = 8, height = 5)
# plot density over depth of all observations (calibration and validation)
(p_target_density_d_lab <- ggplot() +
theme_bw() +
geom_hex(data = tbl_regmat_target %>%
filter(BIS_type %in% "lab") %>%
filter(d_mid <= 200),
aes(x = SOM_per, y = d_mid), bins = 50) +
scale_fill_viridis(option = "inferno", direction = -1) +
scale_y_continuous(trans = "reverse") +
xlab("SOM [%]") +
ylab("Depth [cm]") +
labs(fill = "Number of lab measurements"))
# save to disk
# ggsave(filename = paste0("p_", TARGET, "_density_depth_lab.pdf"),
# plot = p_target_density_d_lab,
# path = "out/figs/explorative/SOM_per",
# width = 8, height = 5)
```
## Year of measurement (age)
```{r barchart of target measurement age, echo = FALSE, warning = FALSE, message = FALSE}
# set order of datasets so we always show calibration first, then validation
dataset_order <- c("Validation", "Calibration")
tbl_regmat_target$split <- factor(x = tbl_regmat_target$split,
levels = dataset_order)
# plot barchart of ages of soil observations
(p_target_age <- tbl_regmat_target %>%
ggplot(aes(year, fill = split)) +
geom_histogram(stat = "count") +
labs(x = "Year", y = "Count") +
scale_fill_manual(values = c( "blue", "black")) +
scale_x_continuous(breaks = seq(1960, 2020, 10)) +
theme_bw() +
theme(strip.background = element_blank(),
strip.text = element_blank(),
legend.title = element_blank()))
# save to disk
# ggsave(filename = paste0("p_", TARGET, "_age.pdf"),
# plot = p_target_age,
# path = paste0("out/figs/explorative/", TARGET),
# width = 8, height = 4)
# plot barchart of ages of soil observations colored by observation type
(p_target_age_type <- tbl_regmat_target %>%
mutate(BIS_type = case_when(BIS_type %in% "field" ~ "Field estimate",
BIS_type %in% "lab" ~ "Lab measurement")) %>%
ggplot(aes(year, fill = BIS_type)) +
geom_histogram(stat = "count") +
labs(x = "Year", y = "Number of SOM observations") +
scale_fill_manual(values = c("#1b9e77", "#d95f02")) +
scale_x_continuous(breaks = seq(1960, 2020, 10)) +
theme_bw() +
theme(strip.background = element_blank(),
strip.text = element_blank(),
legend.title = element_blank()))
# save to disk
# ggsave(filename = paste0("p_", TARGET, "_age_type.pdf"),
# plot = p_target_age_type,
# path = paste0("out/figs/explorative/", TARGET),
# width = 8, height = 4)
# same as above but log axis and without legend
(p_target_age_type_log <- tbl_regmat_target %>%
mutate(BIS_type = case_when(BIS_type %in% "field" ~ "Field estimate",
BIS_type %in% "lab" ~ "Lab measurement")) %>%
ggplot(aes(year, fill = BIS_type)) +
geom_histogram(stat = "count") +
labs(x = "Year", y = "Number of SOM observations") +
scale_fill_manual(values = c("#1b9e77", "#d95f02")) +
scale_x_continuous(breaks = seq(1960, 2020, 10)) +
scale_y_log10() +
theme_bw() +
theme(strip.background = element_blank(),
strip.text = element_blank(),
legend.position = "none"))
# save to disk
# ggsave(filename = paste0("p_", TARGET, "_age_type_log.pdf"),
# plot = p_target_age_type_log,
# path = paste0("out/figs/explorative/", TARGET),
# width = 8, height = 4)
# plot barchart of ages of lab measurements only
(p_target_age_lab <- tbl_regmat_target %>%
filter(BIS_type %in% "lab") %>%
ggplot(aes(year, fill = split)) +
geom_histogram(stat = "count") +
labs(x = "Year", y = "Count") +
scale_fill_manual(values = c( "blue", "black")) +
scale_x_continuous(breaks = seq(1960, 2020, 10)) +
theme_bw() +
theme(strip.background = element_blank(),
strip.text = element_blank(),
legend.title = element_blank()))
# save to disk
# ggsave(filename = paste0("p_", TARGET, "_age_lab.pdf"),
# plot = p_target_age_lab,
# path = paste0("out/figs/explorative/", TARGET),
# width = 8, height = 4)
```
```{r cowplot point map and age histogram, include = FALSE, echo = FALSE, warning = FALSE}
# combine descriptive plots using cowplot
p_target_point_hist <- plot_grid(m_target,
p_target_age_type_log,
nrow = 1, ncol = 2,
align = "v", axis = "l",
rel_widths = c(1, 1), labels = c("a)", "b)"))
# save to disk (pdf)
# ggsave(filename = paste0("p_", TARGET, "_spatial_point_age_type.pdf"),
# plot = p_target_point_hist,
# path = "out/figs/explorative/SOM_per",
# width = 8, height = 4)
# save to disk (png)
ggsave(filename = paste0("p_", TARGET, "_spatial_point_age_type.png"),
plot = p_target_point_hist,
path = "out/figs/explorative/SOM_per",
width = 8, height = 4)
```
## Maps of observations at calibration locations: top- & subsoil
```{r soil property point data, echo = FALSE, warning = FALSE}
# Prepare sf object: upper-most sample of calibration dataset
sf_regmat_target_cal_top <- tbl_regmat_target %>%
filter(split %in% "Calibration") %>%
group_by(X, Y) %>%
slice(1L) %>% # slice by location to only get topsoil observations
ungroup %>%
st_as_sf(., coords = c("X", "Y")) %>% # convert to spatial (sf)
st_set_crs(., "EPSG:28992") # set coordinate reference system of Netherlands
# Prepare sf object: lower-most sample of calibration dataset
sf_regmat_target_cal_sub <- tbl_regmat_target %>%
filter(split %in% "Calibration") %>%
group_by(X, Y) %>%
slice(tail(row_number(), 1)) %>% # slice by lowest sample at each location
ungroup %>%
st_as_sf(., coords = c("X", "Y")) %>% # convert to spatial (sf)
st_set_crs(., "EPSG:28992") # set coordinate reference system of Netherlands
# gather number of locations for displaying on map
n <- as.character(as.expression(paste0("italic(n) == ", nrow(sf_regmat_target_cal_top))))
# map target variable sampling locations for topsoil values
(m_target_locations_cal_top <- ggplot() +
theme_bw() +
geom_sf(data = sf_NL_borders, fill = "white") +
geom_sf(data = sf_regmat_target_cal_top,
aes(color = SOM_per), alpha = 0.5, shape = 19, size = 0.25) +
scale_color_cmocean(name = "solar", direction = -1) +
geom_text(aes(x = Inf, y = -Inf, label = n), size = 3,
hjust = 13, vjust = -55, parse = TRUE) +
theme(legend.position = c(0.1, 0.8),
panel.border = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_line(colour = "transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
labs(col = "Topsoil SOM [%]") +
annotation_scale(location = "bl", width_hint = 0.5) +
annotation_north_arrow(location = "bl", which_north = "true",
pad_x = unit(0.05, "in"), pad_y = unit(0.25, "in"),
style = north_arrow_fancy_orienteering,
height = unit(1, "cm"), width = unit(1, "cm")))
# ggsave(filename = "m_SOM_per_topsoil_cal_locations.pdf",
# m_target_locations_cal_top,
# path = "out/figs/explorative/SOM_per",
# height = 8, width = 8)
# map target variable sampling locations for subsoil values
(m_target_locations_cal_sub <- ggplot() +
theme_bw() +
geom_sf(data = sf_NL_borders, fill = "white") +
geom_sf(data = sf_regmat_target_cal_sub,
aes(color = SOM_per), alpha = 0.5, shape = 19, size = 0.25) +
scale_color_cmocean(name = "solar", direction = -1) +
geom_text(aes(x = Inf, y = -Inf, label = n), size = 3,
hjust = 13, vjust = -55, parse = TRUE) +
theme(legend.position = c(0.1, 0.8),
panel.border = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_line(colour = "transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
labs(col = "Subsoil SOM [%]") +
annotation_scale(location = "bl", width_hint = 0.5) +
annotation_north_arrow(location = "bl", which_north = "true",
pad_x = unit(0.05, "in"), pad_y = unit(0.25, "in"),
style = north_arrow_fancy_orienteering,
height = unit(1, "cm"), width = unit(1, "cm")))
# ggsave(filename = "m_SOM_per_subsoil_cal_locations.pdf",
# m_target_locations_cal_sub,
# path = "out/figs/explorative/SOM_per",
# height = 8, width = 8)
```
## Geostatistical exploratory analysis
### Variogram
```{r variogram, echo = TRUE, warning = FALSE}
library(gstat)
# calculate variogram for topsoil lab observations (exclude field estimates otherwise computation too slow)
vgm_target_cal_top <- variogram(SOM_per ~ 1,
filter(sf_regmat_target_cal_top, BIS_type %in% "lab"))
# fit variogram to model
vgm_fit_target_cal_top <- fit.variogram(vgm_target_cal_top, model = vgm("Exp"))
# plot variogram model
pdf(file = paste0("out/figs/explorative/", TARGET, "/", TARGET, "_lab_cal_variogram.pdf"),
width = 8, height = 4)
plot(vgm_target_cal_top, vgm_fit_target_cal_top)
dev.off()
```
### Nearest observations & distances
```{r nearest distances, echo = TRUE, warning = FALSE}
library(meteo)
tbl_regmat_target_cal_lab <- tbl_regmat_target %>%
filter(split %in% "Calibration") %>%
filter(BIS_type %in% "lab")
# if you want to know nearest distances to sampling locations, use nabor::knn
# knn1 <- nabor::knn(tbl_regmat_target_cal[,c("X","Y")], tbl_regmat_target_cal[,c("X","Y")], k=2)
# dist_all <- as.vector(knn1$nn.dists)
# dist_all <- dist_all[dist_all!=0]
#
# summary(dist_all)
# but for 3D DSM, more applicable to also specify threshold depth to search in
# use 20cm because this was optimal tuning result for both NL and Edgeroi datasets
df_near_obs_3D <- near.obs.soil(locations = as.data.frame(tbl_regmat_target_cal_lab),
locations.x.y.md = c("X", "Y", "d_mid"),
observations = as.data.frame(tbl_regmat_target_cal_lab),
observations.x.y.md = c("X", "Y", "d_mid", "SOM_per"),
zcol = "SOM_per",
n.obs = 1,
depth.range = 20,
no.obs = "increase")
summary(df_near_obs_3D)
```
This diff is collapsed.
---
title: "Exploratory Analysis of Modelling Data"
subtitle: "Soil pH [KCl]"
author: "Anatol Helfenstein"
date: "`r Sys.Date()`"
output:
html_document:
toc: yes
toc_float: yes
'': default
---
```{r setup, include = FALSE}
knitr::opts_chunk$set(echo = FALSE)
options(width = 100) # sets width of R code output (not images)
```
```{r load required pkgs and data, include = FALSE}
# load packages
pkgs <- c("tidyverse", "raster", "rgdal", "sf", "rasterVis", "viridis",
"foreach", "ggspatial", "cowplot")
lapply(pkgs, library, character.only = TRUE)
# 1) Specify DSM target soil property (response):
TARGET = "pH_KCl"
TARGET_EXP = "pH [KCl]"
# 2) Read in regression matrix specific to target soil property
tbl_regmat_target <- read_rds(paste0("out/data/model/", TARGET, "/tbl_regmat_",
TARGET, ".Rds"))
# remane "train" and "test" to match other plots in paper and specify number
tbl_regmat_target <- tbl_regmat_target %>%
mutate(split = case_when(split %in% "train" ~ "Calibration",
split %in% "test" ~ "Validation"))
# set order of datasets so we always show calibration first, then validation
dataset_order <- c("Calibration", "Validation")
tbl_regmat_target$split <- factor(x = tbl_regmat_target$split,
levels = dataset_order)
# convert to sf
sf_regmat_target <- tbl_regmat_target %>%
st_as_sf(., coords = c("X", "Y"), crs = "EPSG:28992")
# 3) Set plotting axis min, max, range and breaks
XY_MIN = min(tbl_regmat_target[TARGET])
XY_MAX = max(tbl_regmat_target[TARGET])
XY_RANGE = diff(range(XY_MIN, XY_MAX))
XY_BREAKS = unique(round(seq(XY_MIN, XY_MAX, XY_RANGE/10)))
# 4) Read in NL border shapefile for mapping
sf_NL_borders <- st_read("data/other/NL_borders.shp")
```
## Maps
```{r map cal and val data locations, echo = FALSE, warning = FALSE, message = FALSE}
# number of calibration locations
n_cal_sites = as.character(as.expression(paste0(
"italic(n) == ",
tbl_regmat_target %>%
filter(split %in% "Calibration") %>%
group_by(X,Y) %>%
tally() %>%
nrow())))
# number of validation locations
n_val_sites = as.character(as.expression(paste0(
"italic(n) == ",
tbl_regmat_target %>%
filter(split %in% "Validation") %>%
group_by(X,Y) %>%
tally() %>%
nrow())))
# plot calibration locations
(m_target_cal <- ggplot() +
theme_bw() +
geom_sf(data = sf_NL_borders) +
geom_sf(data = sf_regmat_target %>%
filter(split %in% "Calibration"),
color = "black", shape = 21, size = 0.5) +
geom_text(aes(x = Inf, y = -Inf, label = n_cal_sites),
size = 3, hjust = 6, vjust = -32, parse = TRUE) +
ggtitle("Calibration (PFB)") +
theme(legend.position = c(0.1, 0.8),
plot.title = element_text(hjust = 0.5, vjust = 0.5, size = 10),
panel.border = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_line(colour = "transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
annotation_scale(location = "bl", width_hint = 0.5) +
annotation_north_arrow(location = "bl", which_north = "true",
pad_x = unit(0.05, "in"), pad_y = unit(0.25, "in"),
style = north_arrow_fancy_orienteering,
height = unit(1, "cm"), width = unit(1, "cm")))
# plot validation locations
(m_target_val <- ggplot() +
theme_bw() +
geom_sf(data = sf_NL_borders) +
geom_sf(data = sf_regmat_target %>%
filter(split %in% "Validation"),
color = "blue", shape = 25, size = 0.5) +
geom_text(aes(x = Inf, y = -Inf, label = n_val_sites),
size = 3, hjust = 6, vjust = -32, parse = TRUE) +
ggtitle("Validation (LSK)") +
theme(legend.position = c(0.1, 0.8),
plot.title = element_text(hjust = 0.5, vjust = 0.5, size = 10),
panel.border = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_line(colour = "transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()))
```
## Histograms
```{r plot cal and val data histograms, echo = FALSE, warning = FALSE, message = FALSE}
# number of calibration locations
n_cal_samples = as.character(as.expression(paste0(
"italic(n) == ",
tbl_regmat_target %>%
filter(split %in% "Calibration") %>%
nrow())))
# number of validation locations
n_val_samples = as.character(as.expression(paste0(
"italic(n) == ",
tbl_regmat_target %>%
filter(split %in% "Validation") %>%
nrow())))
tbl_n_samples <- tibble(split = c("Calibration", "Validation"),
n_samples = c(n_cal_samples, n_val_samples))
# histogram of all data split by dataset
(p_hist_cal_val <- tbl_regmat_target %>%
ggplot(aes(pH_KCl, color = split)) +
geom_histogram(binwidth = 0.1, fill = "white") +
scale_y_continuous() +
scale_x_continuous(breaks = XY_BREAKS,
limits = c(XY_MIN - 0.01 * XY_RANGE,
XY_MAX + 0.01 * XY_RANGE)) +
labs(x = TARGET_EXP, y = "Count") +
facet_wrap(~ split) +
scale_color_manual(values = c("black", "blue")) +
labs(col = "Split") +
geom_text(data = tbl_n_samples,
aes(x = Inf, y = -Inf, label = n_samples),
size = 3, hjust = c(5.8, 6.6), vjust = c(-13, -13), parse = TRUE) +
theme_bw() +
theme(strip.background = element_blank(),
strip.text = element_blank(),
axis.title.x = element_blank(),
legend.position = "none"))
```
## Boxplots
```{r plot cal and val data boxplots, echo = FALSE, warning = FALSE, message = FALSE}
tbl_regmat_target <- tbl_regmat_target %>%
filter(d_mid < 200) %>%
mutate(d_gsm = cut(d_mid,
breaks = c(0, 5, 15, 30, 60, 100, 200),
labels = c("0-5", "5-15", "15-30", "30-60", "60-100", "100-200"),
right = FALSE))
# counts per split and depth increment
tbl_calval_counts <- tbl_regmat_target %>%
group_by(split, d_gsm) %>%
mutate(count = n()) %>%
distinct(count) %>%
arrange(d_gsm, split) %>%
mutate(n = as.character(as.expression(paste0("italic(n) == ", count))))
# assign levels to depth increments so that they are in reverse order
depth_order <- c("100-200", "60-100", "30-60", "15-30", "5-15", "0-5")
tbl_regmat_target$d_gsm <- factor(x = tbl_regmat_target$d_gsm,
levels = depth_order)
# boxplots split by dataset
(p_boxplot_cal_val <- tbl_regmat_target %>%
ggplot(aes(x = pull(tbl_regmat_target, TARGET), y = d_gsm, color = split)) +
geom_boxplot(outlier.shape = 21) +
scale_color_manual(values = c("black", "blue")) +
facet_wrap(~ split) +
xlab(as.expression(paste(TARGET_EXP))) +
ylab(expression("Depth [cm]")) +
scale_x_continuous(breaks = XY_BREAKS,
limits = c(XY_MIN - 0.01 * XY_RANGE,
XY_MAX + 0.01 * XY_RANGE)) +
geom_text(data = tbl_calval_counts,
aes(x = Inf, y = -Inf, label = n),
size = 3,
hjust = rep(1.05, 12),
vjust = c(-13.25, # cal 0-5
-14, # val 0-5
-10.5, # cal 5-15
-11, # val 5-15
-8, # cal 15-30
-8, # val 15-30
-5.5, # cal 30-60
-5.5, # val 30-60
-2.75, # cal 60-100
-2.8, # val 60-100
-0.6, # cal 100-200
-0.6), # val 100-200
parse = TRUE) +
theme_bw() +
theme(strip.background = element_blank(),
strip.text = element_blank(),
legend.position = "none"))
# save to disk
# ggsave(filename = paste0("p_", TARGET, "_boxplots_cal_val_d.pdf"),
# plot = p_boxplot_cal_val,
# path = paste0("out/figs/explorative/", TARGET),
# width = 10, height = 5)
```
```{r combine plots using cowplot, echo = FALSE, warning = FALSE, message = FALSE}
# combine descriptive plots using cowplot
p_target_descriptive <- plot_grid(plot_grid(m_target_cal, m_target_val,
align = "hv", nrow = 1, ncol = 2),
p_hist_cal_val,
p_boxplot_cal_val,
nrow = 3, ncol = 1,
align = "v", axis = "l",
rel_heights = c(0.5, 0.25, 0.25),
# rel_heights = c(3, 1, 1),
rel_widths = c(1.2, 1, 1.2))
# save to disk
# ggsave(filename = paste0("p_", TARGET, "_descriptive.pdf"),
# plot = p_target_descriptive,
# path = paste0("out/figs/explorative/", TARGET),
# width = 8, height = 8)
```
## Year of measurement (age)
```{r barchart of target measurement age, echo = FALSE, warning = FALSE, message = FALSE}
# set order of datasets so we always show calibration first, then validation
dataset_order <- c("Validation", "Calibration")
tbl_regmat_target$split <- factor(x = tbl_regmat_target$split,
levels = dataset_order)
# plot barchart of ages of soil measurements
(p_target_age <- tbl_regmat_target %>%
ggplot(aes(year, fill = split)) +
geom_histogram(stat = "count") +
labs(x = "Year", y = "Count") +
scale_fill_manual(values = c( "blue", "black")) +
scale_x_continuous(breaks = seq(1960, 2010, 10)) +
theme_bw() +
theme(strip.background = element_blank(),
strip.text = element_blank(),
legend.title = element_blank()))
# save to disk
# ggsave(filename = paste0("p_", TARGET, "_age.pdf"),
# plot = p_target_age,
# path = paste0("out/figs/explorative/", TARGET),
# width = 8, height = 4)
```
## Maps of observations at calibration locations: top- & subsoil
```{r soil property point data, echo = FALSE, warning = FALSE}
# Prepare sf object: upper-most sample of calibration dataset
sf_regmat_target_cal_top <- tbl_regmat_target %>%
filter(split %in% "Calibration") %>%
group_by(X, Y) %>%
slice(1L) %>% # slice by location to only get topsoil observations
ungroup %>%
st_as_sf(., coords = c("X", "Y")) %>% # convert to spatial (sf)
st_set_crs(., "EPSG:28992") # set coordinate reference system of Netherlands
# Prepare sf object: lower-most sample of calibration dataset
sf_regmat_target_cal_sub <- tbl_regmat_target %>%
filter(split %in% "Calibration") %>%
group_by(X, Y) %>%
slice(tail(row_number(), 1)) %>% # slice by lowest sample at each location
ungroup %>%
st_as_sf(., coords = c("X", "Y")) %>% # convert to spatial (sf)
st_set_crs(., "EPSG:28992") # set coordinate reference system of Netherlands
# gather number of locations for displaying on map
n <- as.character(as.expression(paste0("italic(n) == ", nrow(sf_regmat_target_cal_top))))
# define range in order to use identical color scheme for top- and subsoil
min <- if (min(sf_regmat_target_cal_top$pH_KCl) < min(sf_regmat_target_cal_sub$pH_KCl)) {
min(sf_regmat_target_cal_top$pH_KCl)} else {min(sf_regmat_target_cal_sub$pH_KCl)}
max <- if (max(sf_regmat_target_cal_top$pH_KCl) > max(sf_regmat_target_cal_sub$pH_KCl)) {
max(sf_regmat_target_cal_top$pH_KCl)} else {max(sf_regmat_target_cal_sub$pH_KCl)}
# map target variable sampling locations for topsoil values
(m_target_locations_cal_top <- ggplot() +
theme_bw() +
geom_sf(data = sf_NL_borders) +
geom_sf(data = sf_regmat_target_cal_top, aes(color = pH_KCl)) +
scale_fill_viridis_c(aesthetics = "color", option = "inferno",
limits = c(min, max)) + # or plasma
geom_text(aes(x = Inf, y = -Inf, label = n), size = 3,
hjust = 13, vjust = -55, parse = TRUE) +
theme(legend.position = c(0.1, 0.8),
panel.border = element_blank(),
panel.background = element_blank(),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
labs(col = "Topsoil pH [KCl]") +
annotation_scale(location = "bl", width_hint = 0.5) +
annotation_north_arrow(location = "bl", which_north = "true",
pad_x = unit(0.75, "in"), pad_y = unit(0.5, "in"),
style = north_arrow_fancy_orienteering))
# ggsave("out/maps/explorative/pH_KCl/m_pH_KCl_topsoil_locations.pdf",
# m_target_locations_top,
# height = 8,
# width = 8)
# map target variable sampling locations for subsoil values
(m_target_locations_cal_sub <- ggplot() +
theme_bw() +
geom_sf(data = sf_NL_borders) +
geom_sf(data = sf_regmat_target_cal_sub, aes(color = pH_KCl)) +
scale_fill_viridis_c(aesthetics = "color", option = "inferno",
limits = c(min, max)) + # or plasma
geom_text(aes(x = Inf, y = -Inf, label = n), size = 3,
hjust = 13, vjust = -55, parse = TRUE) +
theme(legend.position = c(0.1, 0.8),
panel.border = element_blank(),
panel.background = element_blank(),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
labs(col = "Subsoil pH [KCl]") +
annotation_scale(location = "bl", width_hint = 0.5) +
annotation_north_arrow(location = "bl", which_north = "true",
pad_x = unit(0.75, "in"), pad_y = unit(0.5, "in"),
style = north_arrow_fancy_orienteering))
# ggsave("out/maps/explorative/pH_KCl/m_pH_KCl_subsoil_locations.pdf",
# m_target_locations_sub,
# height = 8,
# width = 8)
```
## Geostatistical exploratory analysis
### Variogram
```{r variogram, echo = TRUE, warning = FALSE}
library(gstat)
# calculate variogram for topsoil observations
vgm_target_cal_top <- variogram(pH_KCl ~ 1, sf_regmat_target_cal_top)
# fit variogram to model
vgm_fit_target_cal_top <- fit.variogram(vgm_target_cal_top, model = vgm("Exp"))
# plot variogram model
# pdf(file = paste0("out/figs/explorative/", TARGET, "/", TARGET, "_cal_variogram.pdf"), width = 8, height = 4)
plot(vgm_target_cal_top, vgm_fit_target_cal_top)
# dev.off()
```
### Nearest observations & distances
```{r nearest distances, echo = TRUE, warning = FALSE}
library(meteo)
tbl_regmat_target_cal <- filter(tbl_regmat_target, split %in% "Calibration")
# if you want to know nearest distances to sampling locations, use nabor::knn
# knn1 <- nabor::knn(tbl_regmat_target_cal[,c("X","Y")], tbl_regmat_target_cal[,c("X","Y")], k=2)
# dist_all <- as.vector(knn1$nn.dists)
# dist_all <- dist_all[dist_all!=0]
#
# summary(dist_all)
# but for 3D DSM, more applicable to also specify threshold depth to search in
# use 20cm because this was optimal tuning result for both NL and Edgeroi datasets
df_near_obs_3D <- near.obs.soil(locations = as.data.frame(tbl_regmat_target_cal),
locations.x.y.md = c("X", "Y", "d_mid"),
observations = as.data.frame(tbl_regmat_target_cal),
observations.x.y.md = c("X", "Y", "d_mid", "pH_KCl"),
zcol = "pH_KCl",
n.obs = 1,
depth.range = 20,
no.obs = "increase")
summary(df_near_obs_3D)
```
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment