exploratory analysis files of pH and SOM modelling data; not fully...

exploratory analysis files of pH and SOM modelling data; not fully reproducible at the moment because would require SOM field estimates and independent LSK and CCNL datasets; just to give idea of ways to explore data distribution before beginning model training

exploratory analysis files of pH and SOM modelling data; not fully...
3d1d986c · Helfenstein, Anatol · 53b3aaca · 3d1d986c · 3d1d986c · 3d1d986c
Commit 3d1d986c authored 1 year ago by Helfenstein, Anatol
--- a/35_model_data_expl_analysis_SOM.Rmd
+++ b/35_model_data_expl_analysis_SOM.Rmd
+---
+title: "Exploratory Analysis of Modelling Data"
+subtitle: "SOM [%]"
+author: "Anatol Helfenstein"
+date: "`r Sys.Date()`"
+output:
+  html_document:
+    toc: yes
+    toc_float: yes
+  '': default
+---
+```{r setup, include = FALSE}
+knitr::opts_chunk$set(echo = FALSE)
+options(width = 100) # sets width of R code output (not images)
+```
+```{r load required pkgs and data, include = FALSE}
+# load packages
+pkgs <- c("tidyverse", "raster", "rgdal", "sf", "rasterVis", "viridis",
+          "foreach", "ggspatial", "cowplot", "cmocean", "scales")
+lapply(pkgs, library, character.only = TRUE)
+# 1) Specify DSM target soil property:
+TARGET = "SOM_per"
+TARGET_EXP = "SOM [%]"
+# 2) Specify observation quality & dynamic or static model
+OBS_QUAL = "" # lab measurements & field estimates: ""; lab meas. only: "_lab"
+TIME = "_dyn" # calibration 3D+T model using 2D+T & 3D+T dynamic covariates
+TIME_DIR = "dynamic" # either "static" or "dynamic"; directory to save plots
+# read in regression matrix of TARGET soil property
+tbl_regmat_target <- read_rds(paste0("out/data/model/", TARGET, "/", TIME_DIR,
+                                     "/tbl_regmat_", TARGET, OBS_QUAL, TIME, ".Rds"))
+# remane "train" and "test" to match other plots in paper and specify number
+tbl_regmat_target <- tbl_regmat_target %>%
+  mutate(split = case_when(split %in% "train" ~ "Calibration",
+                           split %in% "test" ~ "Validation"))
+# set order of datasets so we always show calibration first, then validation
+dataset_order <- c("Calibration", "Validation")
+tbl_regmat_target$split <- factor(x = tbl_regmat_target$split,
+                                 levels = dataset_order)
+# convert to sf
+sf_regmat_target <- tbl_regmat_target %>% 
+  st_as_sf(., coords = c("X", "Y"), crs = "EPSG:28992")
+# 3) Set plotting axis min, max, range and breaks
+XY_MIN_CAL = 0
+XY_MAX_CAL = 100
+XY_RANGE_CAL = diff(range(XY_MIN_CAL, XY_MAX_CAL))
+XY_BREAKS_CAL = unique(round(seq(XY_MIN_CAL, XY_MAX_CAL, XY_RANGE_CAL/10)))
+# validation/test data
+XY_MIN_VAL = 0
+XY_MAX_VAL = 100
+XY_RANGE_VAL = diff(range(XY_MIN_VAL, XY_MAX_VAL))
+XY_BREAKS_VAL = unique(round(seq(XY_MIN_VAL, XY_MAX_VAL, XY_RANGE_VAL/10)))
+# 4) Read in NL border shapefile for mapping
+sf_NL_borders <- st_read("data/other/NL_borders.shp")
+```
+## Maps
+### Spatial density maps
+```{r map spatial density, echo = FALSE, warning = FALSE, message = FALSE}
+# number of locations (calibration and validation)
+n_sites = as.character(as.expression(paste0(
+  "italic(n) == ",
+  tbl_regmat_target %>%
+    group_by(X,Y) %>%
+    tally() %>%
+    nrow())))
+# plot spatial density plot of all observations (calibration and validation)
+(m_target_density <- ggplot() +
+    theme_bw() +
+    geom_sf(data = sf_NL_borders, fill = "white") +
+    geom_hex(data = tbl_regmat_target,
+             aes(x = X, y = Y), binwidth = 3000) + # 3km binwidth
+    scale_fill_viridis(option = "inferno", direction = -1) +
+    geom_text(aes(x = Inf, y = -Inf, label = n_sites),
+              size = 3, hjust = 6, vjust = -32, parse = TRUE) +
+    theme(legend.position = c(0.1, 0.8),
+          plot.title = element_text(hjust = 0.5, vjust = 0.5, size = 10),
+          panel.border = element_blank(),
+          panel.background = element_blank(),
+          panel.grid.major = element_line(colour = "transparent"),
+          axis.title.x = element_blank(),
+          axis.text.x = element_blank(),
+          axis.ticks.x = element_blank(),
+          axis.title.y = element_blank(),
+          axis.text.y = element_blank(),
+          axis.ticks.y = element_blank()) +
+    labs(fill = "Number of observations") +
+    annotation_scale(location = "bl", width_hint = 0.5) +
+    annotation_north_arrow(location = "bl", which_north = "true", 
+                           pad_x = unit(0.05, "in"), pad_y = unit(0.25, "in"),
+                           style = north_arrow_fancy_orienteering,
+                           height = unit(1, "cm"), width = unit(1, "cm")))
+# save to disk
+# ggsave(filename = paste0("m_", TARGET, "_spatial_density.pdf"),
+#        plot = m_target_density,
+#        path = "out/figs/explorative/SOM_per",
+#        width = 5, height = 5)
+# number of locations with lab measurements (calibration and validation)
+n_sites_lab = as.character(as.expression(paste0(
+  "italic(n) == ",
+  tbl_regmat_target %>%
+    filter(BIS_type %in% "lab") %>% 
+    group_by(X,Y) %>%
+    tally() %>%
+    nrow())))
+# plot spatial density plot of all lab measurements (calibration and validation)
+(m_target_density_lab <- ggplot() +
+    theme_bw() +
+    geom_sf(data = sf_NL_borders, fill = "white") +
+    geom_hex(data = filter(tbl_regmat_target, BIS_type %in% "lab"),
+             aes(x = X, y = Y), binwidth = 3000) +
+    scale_fill_viridis(option = "inferno", direction = -1) +
+    geom_text(aes(x = Inf, y = -Inf, label = n_sites_lab),
+              size = 3, hjust = 8, vjust = -30, parse = TRUE) +
+    theme(legend.position = c(0.1, 0.8),
+          panel.border = element_blank(),
+          panel.background = element_blank(),
+          panel.grid.major = element_line(colour = "transparent"),
+          axis.title.x = element_blank(),
+          axis.text.x = element_blank(),
+          axis.ticks.x = element_blank(),
+          axis.title.y = element_blank(),
+          axis.text.y = element_blank(),
+          axis.ticks.y = element_blank()) +
+    labs(fill = "Number of lab \n measurements") +
+    annotation_scale(location = "bl", width_hint = 0.5) +
+    annotation_north_arrow(location = "bl", which_north = "true", 
+                           pad_x = unit(0.05, "in"), pad_y = unit(0.25, "in"),
+                           style = north_arrow_fancy_orienteering,
+                           height = unit(1, "cm"), width = unit(1, "cm")))
+# save to disk
+# ggsave(filename = paste0("m_", TARGET, "_spatial_density_lab.pdf"),
+#        plot = m_target_density_lab,
+#        path = "out/figs/explorative/SOM_per",
+#        width = 5, height = 5)
+```
+### Spatial point maps
+```{r map spatial points cal val, echo = FALSE, warning = FALSE, message = FALSE}
+# number of locations
+n_sites = as.character(as.expression(paste0(
+  "italic(n) == ", tbl_regmat_target %>%
+    group_by(X,Y) %>% 
+    slice(1L) %>% 
+    nrow()
+  )))
+# convert to sf
+sf_regmat_target_locations <- tbl_regmat_target %>%
+  # also group by type to not remove lab measurement locations where there are field
+  # estimates of horizons for which there are no lab measurements (see script 31 -
+  # removal of field estimates...)
+  group_by(X, Y, BIS_type) %>% 
+  slice(1L) %>% 
+  st_as_sf(., coords = c("X", "Y"), crs = "EPSG:28992") %>% 
+  # change names for plot labels
+  mutate(BIS_type = case_when(BIS_type %in% "field" ~ "Field estimate",
+                              BIS_type %in% "lab" ~ "Lab measurement"))
+# plot all locations
+(m_target <- ggplot() +
+    theme_bw() +
+    geom_sf(data = sf_NL_borders, fill = "white") +
+    geom_sf(data = sf_regmat_target_locations,
+            aes(color = BIS_type), alpha = 0.5, shape = 19, size = 0.25) +
+    scale_color_manual(values = c("#1b9e77", "#d95f02")) +
+    geom_text(aes(x = Inf, y = -Inf, label = n_sites),
+              size = 3, hjust = 5, vjust = -25, parse = TRUE) +
+    labs(col = "SOM observations") +
+    guides(color = guide_legend(override.aes = list(alpha = 1, size = 3))) +
+    theme(legend.position = c(0.1, 0.8),
+          plot.title = element_text(hjust = 0.5, vjust = 0.5, size = 10),
+          panel.border = element_blank(),
+          panel.background = element_blank(),
+          panel.grid.major = element_line(colour = "transparent"),
+          axis.title.x = element_blank(),
+          axis.text.x = element_blank(),
+          axis.ticks.x = element_blank(),
+          axis.title.y = element_blank(),
+          axis.text.y = element_blank(),
+          axis.ticks.y = element_blank()) +
+    annotation_scale(location = "bl", width_hint = 0.5) +
+    annotation_north_arrow(location = "bl", which_north = "true", 
+                           pad_x = unit(0.05, "in"), pad_y = unit(0.25, "in"),
+                           style = north_arrow_fancy_orienteering,
+                           height = unit(1, "cm"), width = unit(1, "cm")))
+# save to disk
+# ggsave(filename = paste0("m_", TARGET, "_spatial_point.pdf"),
+#        plot = m_target,
+#        path = "out/figs/explorative/SOM_per",
+#        width = 5, height = 5)
+# number of calibration locations
+n_sites_cal = as.character(as.expression(paste0(
+  "italic(n) == ", tbl_regmat_target %>%
+    filter(split %in% "Calibration") %>%
+    group_by(X,Y) %>% 
+    slice(1L) %>% 
+    nrow()
+  )))
+# plot calibration locations
+(m_target_cal <- ggplot() +
+    theme_bw() +
+    geom_sf(data = sf_NL_borders, fill = "white") +
+    geom_sf(data = sf_regmat_target_locations %>%
+                filter(split %in% "Calibration"),
+            aes(color = BIS_type), alpha = 0.5, shape = 19, size = 0.25) +
+    scale_color_manual(values = c("#1b9e77", "#d95f02")) +
+    geom_text(aes(x = Inf, y = -Inf, label = n_sites_cal),
+              size = 3, hjust = 6, vjust = -32, parse = TRUE) +
+    # ggtitle("Calibration") +
+    theme(legend.position = c(0.1, 0.8),
+          plot.title = element_text(hjust = 0.5, vjust = 0.5, size = 10),
+          panel.border = element_blank(),
+          panel.background = element_blank(),
+          panel.grid.major = element_line(colour = "transparent"),
+          axis.title.x = element_blank(),
+          axis.text.x = element_blank(),
+          axis.ticks.x = element_blank(),
+          axis.title.y = element_blank(),
+          axis.text.y = element_blank(),
+          axis.ticks.y = element_blank()) +
+    labs(col = "Calibration locations") +
+    annotation_scale(location = "bl", width_hint = 0.5) +
+    annotation_north_arrow(location = "bl", which_north = "true", 
+                           pad_x = unit(0.05, "in"), pad_y = unit(0.25, "in"),
+                           style = north_arrow_fancy_orienteering,
+                           height = unit(1, "cm"), width = unit(1, "cm")))
+# save to disk
+# ggsave(filename = paste0("m_", TARGET, "_spatial_point_cal.pdf"),
+#        plot = m_target_cal,
+#        path = "out/figs/explorative/SOM_per",
+#        width = 5, height = 5)
+# number of lab calibration locations (so excluding field estimates)
+n_sites_cal_lab = as.character(as.expression(paste0(
+  "italic(n) == ",
+  tbl_regmat_target %>%
+    filter(split %in% "Calibration") %>%
+    filter(BIS_type %in% "lab") %>% 
+    group_by(X,Y) %>%
+    tally() %>%
+    nrow())))
+# plot lab calibration locations (so excluding field estimates)
+(m_target_cal_lab <- ggplot() +
+  theme_bw() +
+  geom_sf(data = sf_NL_borders, fill = "white") +
+  geom_sf(data = sf_regmat_target %>% 
+            filter(split %in% "Calibration") %>% 
+            filter(BIS_type %in% "lab"),
+          color = "#d95f02", alpha = 0.5, shape = 19, size = 0.25) +
+  geom_text(aes(x = Inf, y = -Inf, label = n_sites_cal_lab),
+            size = 3, hjust = 7, vjust = -32, parse = TRUE) +
+  ggtitle("Calibration locations (lab)") +
+  theme(legend.position = c(0.1, 0.8),
+        plot.title = element_text(hjust = 0.5, vjust = 0.5, size = 10),
+        panel.border = element_blank(),
+        panel.background = element_blank(),
+        panel.grid.major = element_line(colour = "transparent"),
+        axis.title.x = element_blank(),
+        axis.text.x = element_blank(),
+        axis.ticks.x = element_blank(),
+        axis.title.y = element_blank(),
+        axis.text.y = element_blank(),
+        axis.ticks.y = element_blank()) +
+  annotation_scale(location = "bl", width_hint = 0.5) +
+  annotation_north_arrow(location = "bl", which_north = "true", 
+                         pad_x = unit(0.05, "in"), pad_y = unit(0.25, "in"),
+                         style = north_arrow_fancy_orienteering,
+                         height = unit(1, "cm"), width = unit(1, "cm")))
+# save to disk
+# ggsave(filename = paste0("m_", TARGET, "_spatial_point_cal_lab.pdf"),
+#        plot = m_target_cal_lab,
+#        path = "out/figs/explorative/SOM_per",
+#        width = 5, height = 5)
+# number of validation locations
+n_sites_val = as.character(as.expression(paste0(
+  "italic(n) == ",
+  tbl_regmat_target %>%
+    filter(split %in% "Validation") %>%
+    filter(BIS_type %in% "lab") %>% 
+    group_by(X,Y) %>%
+    tally() %>%
+    nrow())))
+# plot validation locations (only want to validate with lab measurements)
+(m_target_val <- ggplot() +
+  theme_bw() +
+  geom_sf(data = sf_NL_borders, fill = "white") +
+  geom_sf(data = sf_regmat_target %>% 
+            filter(split %in% "Validation") %>% 
+            filter(BIS_type %in% "lab"),
+          color = "#d95f02", alpha = 0.5, shape = 25, size = 0.5) +
+  geom_text(aes(x = Inf, y = -Inf, label = n_sites_val),
+            size = 3, hjust = 7, vjust = -32, parse = TRUE) +
+  ggtitle("Validation locations") +
+  theme(legend.position = c(0.1, 0.8),
+        plot.title = element_text(hjust = 0.5, vjust = 0.5, size = 10),
+        panel.border = element_blank(),
+        panel.background = element_blank(),
+        panel.grid.major = element_line(colour = "transparent"),
+        axis.title.x = element_blank(),
+        axis.text.x = element_blank(),
+        axis.ticks.x = element_blank(),
+        axis.title.y = element_blank(),
+        axis.text.y = element_blank(),
+        axis.ticks.y = element_blank()))
+# save to disk
+# ggsave(filename = paste0("m_", TARGET, "_spatial_point_val.pdf"),
+#        plot = m_target_val,
+#        path = "out/figs/explorative/SOM_per",
+#        width = 5, height = 5)
+```
+## Histograms
+```{r plot cal and val data histograms, echo = FALSE, warning = FALSE, message = FALSE}
+# number of calibration samples
+n_cal_samples = as.character(as.expression(paste0(
+  "italic(n) == ",
+  tbl_regmat_target %>%
+    filter(split %in% "Calibration") %>%
+    nrow())))
+# number of validation samples
+n_val_samples = as.character(as.expression(paste0(
+  "italic(n) == ",
+  tbl_regmat_target %>%
+    filter(split %in% "Validation") %>%
+    nrow())))
+tbl_n_samples <- tibble(split = c("Calibration", "Validation"),
+                        n_samples = c(n_cal_samples, n_val_samples))
+# histogram of all data split by dataset (cal vs. val)
+(p_hist_cal_val <- tbl_regmat_target %>%
+    ggplot(aes(SOM_per, color = split)) +
+    geom_histogram(binwidth = 0.1, fill = "white") +
+    scale_y_continuous() +
+    scale_x_continuous(breaks = XY_BREAKS_CAL,
+                       limits = c(XY_MIN_CAL - 0.01 * XY_RANGE_CAL,
+                                  XY_MAX_CAL + 0.01 * XY_RANGE_CAL)) +
+    labs(x = TARGET_EXP, y = "Count") + 
+    facet_wrap(~ split) +
+    scale_color_manual(values = c("black", "blue")) +
+    labs(col = "Split") +
+    geom_text(data = tbl_n_samples,
+              aes(x = Inf, y = -Inf, label = n_samples),
+              size = 3, hjust = c(5.8, 6.6), vjust = c(-13, -13), parse = TRUE) +
+    theme_bw() +
+    theme(strip.background = element_blank(),
+          strip.text = element_blank(),
+          axis.title.x = element_blank(),
+          legend.position = "none"))
+# histogram of all lab measurements split by dataset (cal vs. val)
+(p_hist_cal_val_lab <- tbl_regmat_target %>%
+    filter(BIS_type == "lab") %>% 
+    ggplot(aes(SOM_per, color = split)) +
+    geom_histogram(binwidth = 0.1, fill = "white") +
+    scale_y_continuous() +
+    scale_x_continuous(breaks = XY_BREAKS_CAL,
+                       limits = c(XY_MIN_CAL - 0.01 * XY_RANGE_CAL,
+                                  XY_MAX_CAL + 0.01 * XY_RANGE_CAL)) +
+    labs(x = TARGET_EXP, y = "Count (lab measurements)") + 
+    facet_wrap(~ split) +
+    scale_color_manual(values = c("black", "blue")) +
+    labs(col = "Split") +
+    theme_bw() +
+    theme(strip.background = element_blank(),
+          strip.text = element_blank(),
+          axis.title.x = element_blank(),
+          legend.position = "none"))
+```
+## Boxplots
+```{r plot cal and val data boxplots, echo = FALSE, warning = FALSE, message = FALSE}
+tbl_regmat_target <- tbl_regmat_target %>% 
+  filter(d_mid < 200) %>% 
+  mutate(d_gsm = cut(d_mid,
+                     breaks = c(0, 5, 15, 30, 60, 100, 200),
+                     labels = c("0-5", "5-15", "15-30", "30-60", "60-100", "100-200"),
+                     right = FALSE))
+# counts per split and depth increment
+tbl_calval_counts <- tbl_regmat_target %>% 
+    group_by(split, d_gsm) %>% 
+    mutate(count = n()) %>% 
+    distinct(count) %>% 
+    arrange(d_gsm, split) %>% 
+    mutate(n = as.character(as.expression(paste0("italic(n) == ", count))))
+# assign levels to depth increments so that they are in reverse order
+depth_order <- c("100-200", "60-100", "30-60", "15-30", "5-15", "0-5")
+tbl_regmat_target$d_gsm <- factor(x = tbl_regmat_target$d_gsm,
+                                 levels = depth_order)
+# boxplots split by dataset
+(p_boxplot_cal_val <- tbl_regmat_target %>%
+  ggplot(aes(x = pull(tbl_regmat_target, TARGET), y = d_gsm, color = split)) +
+  geom_boxplot(outlier.shape = 21) +
+  scale_color_manual(values = c("black", "blue")) +
+  facet_wrap(~ split) +
+  xlab(as.expression(paste(TARGET_EXP))) +
+  ylab(expression("Depth [cm]")) +
+  scale_x_continuous(breaks = XY_BREAKS_CAL,
+                     limits = c(XY_MIN_CAL - 0.01 * XY_RANGE_CAL,
+                                XY_MAX_CAL + 0.01 * XY_RANGE_CAL)) +
+  geom_text(data = tbl_calval_counts,
+            aes(x = Inf, y = -Inf, label = n),
+            size = 3,
+            hjust = rep(1.05, 12),
+            vjust = c(-13.25,  # cal 0-5
+                      -14,  # val 0-5
+                      -10.5,  # cal 5-15
+                      -11,  # val 5-15
+                      -8,  # cal 15-30
+                      -8,   # val 15-30
+                      -5.5,  # cal 30-60
+                      -5.5,  # val 30-60
+                      -2.75,  # cal 60-100
+                      -2.8,  # val 60-100
+                      -0.6, # cal 100-200
+                      -0.6),  # val 100-200
+            parse = TRUE) +
+  theme_bw() +
+  theme(strip.background = element_blank(),
+        strip.text = element_blank(),
+        legend.position = "none"))
+# save to disk
+# ggsave(filename = paste0("p_", TARGET, "_boxplots_cal_val_d.pdf"),
+#        plot = p_boxplot_cal_val,
+#        path = "out/figs/explorative/SOM_per",
+#        width = 10, height = 5)
+```
+```{r combine plots using cowplot, echo = FALSE, warning = FALSE, message = FALSE}
+# combine descriptive plots using cowplot
+p_target_descriptive <- plot_grid(plot_grid(m_target_cal, m_target_val,
+                                            align = "hv", nrow = 1, ncol = 2),
+                                  p_hist_cal_val,
+                                  p_boxplot_cal_val,
+                                  nrow = 3, ncol = 1,
+                                  align = "v", axis = "l",
+                                  rel_heights = c(0.5, 0.25, 0.25),
+                                  # rel_heights = c(3, 1, 1),
+                                  rel_widths = c(1.2, 1, 1.2))
+# save to disk
+# ggsave(filename = paste0("p_", TARGET, "_descriptive.pdf"),
+#        plot = p_target_descriptive,
+#        path = "out/figs/explorative/SOM_per",
+#        width = 8, height = 8)
+```
+## SOM density plots over depth
+```{r density plot over depth, echo = FALSE, warning = FALSE, message = FALSE}
+# plot density over depth of all observations (calibration and validation)
+(p_target_density_d <- ggplot() +
+    theme_bw() +
+    geom_hex(data = tbl_regmat_target %>% 
+                 filter(d_mid <= 200),
+             aes(x = SOM_per, y = d_mid), bins = 50) +
+    scale_fill_viridis(option = "inferno", direction = -1) +
+    scale_y_continuous(trans = "reverse") +
+    xlab("SOM [%]") +
+    ylab("Depth [cm]") +
+    labs(fill = "Number of observations"))
+# save to disk
+# ggsave(filename = paste0("p_", TARGET, "_density_depth.pdf"),
+#        plot = p_target_density_d,
+#        path = "out/figs/explorative/SOM_per",
+#        width = 8, height = 5)
+# plot density over depth of all observations (calibration and validation)
+(p_target_density_d_lab <- ggplot() +
+    theme_bw() +
+    geom_hex(data = tbl_regmat_target %>% 
+               filter(BIS_type %in% "lab") %>%
+               filter(d_mid <= 200),
+             aes(x = SOM_per, y = d_mid), bins = 50) +
+    scale_fill_viridis(option = "inferno", direction = -1) +
+    scale_y_continuous(trans = "reverse") +
+    xlab("SOM [%]") +
+    ylab("Depth [cm]") +
+    labs(fill = "Number of lab measurements"))
+# save to disk
+# ggsave(filename = paste0("p_", TARGET, "_density_depth_lab.pdf"),
+#        plot = p_target_density_d_lab,
+#        path = "out/figs/explorative/SOM_per",
+#        width = 8, height = 5)
+```
+## Year of measurement (age)
+```{r barchart of target measurement age, echo = FALSE, warning = FALSE, message = FALSE}
+# set order of datasets so we always show calibration first, then validation
+dataset_order <- c("Validation", "Calibration")
+tbl_regmat_target$split <- factor(x = tbl_regmat_target$split,
+                                 levels = dataset_order)
+# plot barchart of ages of soil observations
+(p_target_age <- tbl_regmat_target %>% 
+    ggplot(aes(year, fill = split)) +
+    geom_histogram(stat = "count") +
+    labs(x = "Year", y = "Count") + 
+    scale_fill_manual(values = c( "blue", "black")) +
+    scale_x_continuous(breaks = seq(1960, 2020, 10)) +
+    theme_bw() +
+    theme(strip.background = element_blank(),
+          strip.text = element_blank(),
+          legend.title = element_blank()))
+# save to disk
+# ggsave(filename = paste0("p_", TARGET, "_age.pdf"),
+#        plot = p_target_age,
+#        path = paste0("out/figs/explorative/", TARGET),
+#        width = 8, height = 4)
+# plot barchart of ages of soil observations colored by observation type
+(p_target_age_type <- tbl_regmat_target %>% 
+    mutate(BIS_type = case_when(BIS_type %in% "field" ~ "Field estimate",
+                                BIS_type %in% "lab" ~ "Lab measurement")) %>% 
+    ggplot(aes(year, fill = BIS_type)) +
+    geom_histogram(stat = "count") +
+    labs(x = "Year", y = "Number of SOM observations") + 
+    scale_fill_manual(values = c("#1b9e77", "#d95f02")) +
+    scale_x_continuous(breaks = seq(1960, 2020, 10)) +
+    theme_bw() +
+    theme(strip.background = element_blank(),
+          strip.text = element_blank(),
+          legend.title = element_blank()))
+# save to disk
+# ggsave(filename = paste0("p_", TARGET, "_age_type.pdf"),
+#        plot = p_target_age_type,
+#        path = paste0("out/figs/explorative/", TARGET),
+#        width = 8, height = 4)
+# same as above but log axis and without legend
+(p_target_age_type_log <- tbl_regmat_target %>% 
+    mutate(BIS_type = case_when(BIS_type %in% "field" ~ "Field estimate",
+                                BIS_type %in% "lab" ~ "Lab measurement")) %>% 
+    ggplot(aes(year, fill = BIS_type)) +
+    geom_histogram(stat = "count") +
+    labs(x = "Year", y = "Number of SOM observations") + 
+    scale_fill_manual(values = c("#1b9e77", "#d95f02")) +
+    scale_x_continuous(breaks = seq(1960, 2020, 10)) +
+    scale_y_log10() +
+    theme_bw() +
+    theme(strip.background = element_blank(),
+          strip.text = element_blank(),
+          legend.position = "none"))
+# save to disk
+# ggsave(filename = paste0("p_", TARGET, "_age_type_log.pdf"),
+#        plot = p_target_age_type_log,
+#        path = paste0("out/figs/explorative/", TARGET),
+#        width = 8, height = 4)
+# plot barchart of ages of lab measurements only
+(p_target_age_lab <- tbl_regmat_target %>% 
+    filter(BIS_type %in% "lab") %>% 
+    ggplot(aes(year, fill = split)) +
+    geom_histogram(stat = "count") +
+    labs(x = "Year", y = "Count") + 
+    scale_fill_manual(values = c( "blue", "black")) +
+    scale_x_continuous(breaks = seq(1960, 2020, 10)) +
+    theme_bw() +
+    theme(strip.background = element_blank(),
+          strip.text = element_blank(),
+          legend.title = element_blank()))
+# save to disk
+# ggsave(filename = paste0("p_", TARGET, "_age_lab.pdf"),
+#        plot = p_target_age_lab,
+#        path = paste0("out/figs/explorative/", TARGET),
+#        width = 8, height = 4)
+```
+```{r cowplot point map and age histogram, include = FALSE, echo = FALSE, warning = FALSE}
+# combine descriptive plots using cowplot
+p_target_point_hist <- plot_grid(m_target,
+                                 p_target_age_type_log,
+                                 nrow = 1, ncol = 2,
+                                 align = "v", axis = "l",
+                                 rel_widths = c(1, 1), labels = c("a)", "b)"))
+# save to disk (pdf)
+# ggsave(filename = paste0("p_", TARGET, "_spatial_point_age_type.pdf"),
+#        plot = p_target_point_hist,
+#        path = "out/figs/explorative/SOM_per",
+#        width = 8, height = 4)
+# save to disk (png)
+ggsave(filename = paste0("p_", TARGET, "_spatial_point_age_type.png"),
+       plot = p_target_point_hist,
+       path = "out/figs/explorative/SOM_per",
+       width = 8, height = 4)
+```
+## Maps of observations at calibration locations: top- & subsoil 
+```{r soil property point data, echo = FALSE, warning = FALSE}
+# Prepare sf object: upper-most sample of calibration dataset
+sf_regmat_target_cal_top <- tbl_regmat_target %>% 
+  filter(split %in% "Calibration") %>% 
+  group_by(X, Y) %>% 
+  slice(1L) %>% # slice by location to only get topsoil observations
+  ungroup %>% 
+  st_as_sf(., coords = c("X", "Y")) %>% # convert to spatial (sf)
+  st_set_crs(., "EPSG:28992") # set coordinate reference system of Netherlands
+# Prepare sf object: lower-most sample of calibration dataset
+sf_regmat_target_cal_sub <- tbl_regmat_target %>% 
+  filter(split %in% "Calibration") %>% 
+  group_by(X, Y) %>% 
+  slice(tail(row_number(), 1)) %>% # slice by lowest sample at each location
+  ungroup %>% 
+  st_as_sf(., coords = c("X", "Y")) %>% # convert to spatial (sf)
+  st_set_crs(., "EPSG:28992") # set coordinate reference system of Netherlands
+# gather number of locations for displaying on map
+n <- as.character(as.expression(paste0("italic(n) == ", nrow(sf_regmat_target_cal_top))))
+# map target variable sampling locations for topsoil values
+(m_target_locations_cal_top <- ggplot() +
+    theme_bw() +
+    geom_sf(data = sf_NL_borders, fill = "white") +
+    geom_sf(data = sf_regmat_target_cal_top,
+            aes(color = SOM_per), alpha = 0.5, shape = 19, size = 0.25) +
+    scale_color_cmocean(name = "solar", direction = -1) +
+    geom_text(aes(x = Inf, y = -Inf, label = n), size = 3,
+              hjust = 13, vjust = -55, parse = TRUE) +
+    theme(legend.position = c(0.1, 0.8),
+          panel.border = element_blank(),
+          panel.background = element_blank(),
+          panel.grid.major = element_line(colour = "transparent"),
+          axis.title.x = element_blank(),
+          axis.text.x = element_blank(),
+          axis.ticks.x = element_blank(),
+          axis.title.y = element_blank(),
+          axis.text.y = element_blank(),
+          axis.ticks.y = element_blank()) +
+    labs(col = "Topsoil SOM [%]") +
+    annotation_scale(location = "bl", width_hint = 0.5) +
+    annotation_north_arrow(location = "bl", which_north = "true", 
+                           pad_x = unit(0.05, "in"), pad_y = unit(0.25, "in"),
+                           style = north_arrow_fancy_orienteering,
+                           height = unit(1, "cm"), width = unit(1, "cm")))
+# ggsave(filename = "m_SOM_per_topsoil_cal_locations.pdf",
+#        m_target_locations_cal_top,
+#        path = "out/figs/explorative/SOM_per",
+#        height = 8, width = 8)
+# map target variable sampling locations for subsoil values
+(m_target_locations_cal_sub <- ggplot() +
+    theme_bw() +
+    geom_sf(data = sf_NL_borders, fill = "white") +
+    geom_sf(data = sf_regmat_target_cal_sub,
+            aes(color = SOM_per), alpha = 0.5, shape = 19, size = 0.25) +
+    scale_color_cmocean(name = "solar", direction = -1) +
+    geom_text(aes(x = Inf, y = -Inf, label = n), size = 3,
+              hjust = 13, vjust = -55, parse = TRUE) +
+    theme(legend.position = c(0.1, 0.8),
+          panel.border = element_blank(),
+          panel.background = element_blank(),
+          panel.grid.major = element_line(colour = "transparent"),
+          axis.title.x = element_blank(),
+          axis.text.x = element_blank(),
+          axis.ticks.x = element_blank(),
+          axis.title.y = element_blank(),
+          axis.text.y = element_blank(),
+          axis.ticks.y = element_blank()) +
+    labs(col = "Subsoil SOM [%]") +
+    annotation_scale(location = "bl", width_hint = 0.5) +
+    annotation_north_arrow(location = "bl", which_north = "true", 
+                           pad_x = unit(0.05, "in"), pad_y = unit(0.25, "in"),
+                           style = north_arrow_fancy_orienteering,
+                           height = unit(1, "cm"), width = unit(1, "cm")))
+# ggsave(filename = "m_SOM_per_subsoil_cal_locations.pdf",
+#        m_target_locations_cal_sub,
+#        path = "out/figs/explorative/SOM_per",
+#        height = 8, width = 8)
+```
+## Geostatistical exploratory analysis
+### Variogram
+```{r variogram, echo = TRUE, warning = FALSE}
+library(gstat)
+# calculate variogram for topsoil lab observations (exclude field estimates otherwise computation too slow)
+vgm_target_cal_top <- variogram(SOM_per ~ 1,
+                                filter(sf_regmat_target_cal_top, BIS_type %in% "lab"))
+# fit variogram to model
+vgm_fit_target_cal_top <- fit.variogram(vgm_target_cal_top, model = vgm("Exp"))
+# plot variogram model
+pdf(file = paste0("out/figs/explorative/", TARGET, "/", TARGET, "_lab_cal_variogram.pdf"),
+    width = 8, height = 4)
+plot(vgm_target_cal_top, vgm_fit_target_cal_top)
+dev.off()
+```
+### Nearest observations & distances
+```{r nearest distances, echo = TRUE, warning = FALSE}
+library(meteo)
+tbl_regmat_target_cal_lab <- tbl_regmat_target %>% 
+  filter(split %in% "Calibration") %>% 
+  filter(BIS_type %in% "lab")
+# if you want to know nearest distances to sampling locations, use nabor::knn
+# knn1 <- nabor::knn(tbl_regmat_target_cal[,c("X","Y")], tbl_regmat_target_cal[,c("X","Y")], k=2)
+# dist_all <- as.vector(knn1$nn.dists)
+# dist_all <- dist_all[dist_all!=0]
+# 
+# summary(dist_all)
+# but for 3D DSM, more applicable to also specify threshold depth to search in
+# use 20cm because this was optimal tuning result for both NL and Edgeroi datasets
+df_near_obs_3D <- near.obs.soil(locations = as.data.frame(tbl_regmat_target_cal_lab),
+                                locations.x.y.md = c("X", "Y", "d_mid"),
+                                observations = as.data.frame(tbl_regmat_target_cal_lab),
+                                observations.x.y.md = c("X", "Y", "d_mid", "SOM_per"),
+                                zcol = "SOM_per",
+                                n.obs = 1,
+                                depth.range = 20,
+                                no.obs = "increase")
+summary(df_near_obs_3D)
+```
--- a/35_model_data_expl_analysis_SOM.html
+++ b/35_model_data_expl_analysis_SOM.html
--- a/35_model_data_expl_analysis_pH_KCl.Rmd
+++ b/35_model_data_expl_analysis_pH_KCl.Rmd
+---
+title: "Exploratory Analysis of Modelling Data"
+subtitle: "Soil pH [KCl]"
+author: "Anatol Helfenstein"
+date: "`r Sys.Date()`"
+output:
+  html_document:
+    toc: yes
+    toc_float: yes
+  '': default
+---
+```{r setup, include = FALSE}
+knitr::opts_chunk$set(echo = FALSE)
+options(width = 100) # sets width of R code output (not images)
+```
+```{r load required pkgs and data, include = FALSE}
+# load packages
+pkgs <- c("tidyverse", "raster", "rgdal", "sf", "rasterVis", "viridis",
+          "foreach", "ggspatial", "cowplot")
+lapply(pkgs, library, character.only = TRUE)
+# 1) Specify DSM target soil property (response):
+TARGET = "pH_KCl"
+TARGET_EXP = "pH [KCl]"
+# 2) Read in regression matrix specific to target soil property
+tbl_regmat_target <- read_rds(paste0("out/data/model/", TARGET, "/tbl_regmat_",
+                                     TARGET, ".Rds"))
+# remane "train" and "test" to match other plots in paper and specify number
+tbl_regmat_target <- tbl_regmat_target %>% 
+  mutate(split = case_when(split %in% "train" ~ "Calibration",
+                           split %in% "test" ~ "Validation"))
+# set order of datasets so we always show calibration first, then validation
+dataset_order <- c("Calibration", "Validation")
+tbl_regmat_target$split <- factor(x = tbl_regmat_target$split,
+                                 levels = dataset_order)
+# convert to sf
+sf_regmat_target <- tbl_regmat_target %>% 
+  st_as_sf(., coords = c("X", "Y"), crs = "EPSG:28992")
+# 3) Set plotting axis min, max, range and breaks
+XY_MIN = min(tbl_regmat_target[TARGET])
+XY_MAX = max(tbl_regmat_target[TARGET])
+XY_RANGE = diff(range(XY_MIN, XY_MAX))
+XY_BREAKS = unique(round(seq(XY_MIN, XY_MAX, XY_RANGE/10)))
+# 4) Read in NL border shapefile for mapping
+sf_NL_borders <- st_read("data/other/NL_borders.shp")
+```
+## Maps
+```{r map cal and val data locations, echo = FALSE, warning = FALSE, message = FALSE}
+# number of calibration locations
+n_cal_sites = as.character(as.expression(paste0(
+  "italic(n) == ",
+  tbl_regmat_target %>%
+    filter(split %in% "Calibration") %>%
+    group_by(X,Y) %>%
+    tally() %>%
+    nrow())))
+# number of validation locations
+n_val_sites = as.character(as.expression(paste0(
+  "italic(n) == ",
+  tbl_regmat_target %>%
+    filter(split %in% "Validation") %>%
+    group_by(X,Y) %>%
+    tally() %>%
+    nrow())))
+# plot calibration locations
+(m_target_cal <- ggplot() +
+  theme_bw() +
+  geom_sf(data = sf_NL_borders) +
+  geom_sf(data = sf_regmat_target %>% 
+            filter(split %in% "Calibration"),
+          color = "black", shape = 21, size = 0.5) +
+  geom_text(aes(x = Inf, y = -Inf, label = n_cal_sites),
+            size = 3, hjust = 6, vjust = -32, parse = TRUE) +
+  ggtitle("Calibration (PFB)") +
+  theme(legend.position = c(0.1, 0.8),
+        plot.title = element_text(hjust = 0.5, vjust = 0.5, size = 10),
+        panel.border = element_blank(),
+        panel.background = element_blank(),
+        panel.grid.major = element_line(colour = "transparent"),
+        axis.title.x = element_blank(),
+        axis.text.x = element_blank(),
+        axis.ticks.x = element_blank(),
+        axis.title.y = element_blank(),
+        axis.text.y = element_blank(),
+        axis.ticks.y = element_blank()) +
+  annotation_scale(location = "bl", width_hint = 0.5) +
+  annotation_north_arrow(location = "bl", which_north = "true", 
+                         pad_x = unit(0.05, "in"), pad_y = unit(0.25, "in"),
+                         style = north_arrow_fancy_orienteering,
+                         height = unit(1, "cm"), width = unit(1, "cm")))
+# plot validation locations
+(m_target_val <- ggplot() +
+  theme_bw() +
+  geom_sf(data = sf_NL_borders) +
+  geom_sf(data = sf_regmat_target %>% 
+            filter(split %in% "Validation"),
+          color = "blue", shape = 25, size = 0.5) +
+  geom_text(aes(x = Inf, y = -Inf, label = n_val_sites),
+            size = 3, hjust = 6, vjust = -32, parse = TRUE) +
+  ggtitle("Validation (LSK)") +
+  theme(legend.position = c(0.1, 0.8),
+        plot.title = element_text(hjust = 0.5, vjust = 0.5, size = 10),
+        panel.border = element_blank(),
+        panel.background = element_blank(),
+        panel.grid.major = element_line(colour = "transparent"),
+        axis.title.x = element_blank(),
+        axis.text.x = element_blank(),
+        axis.ticks.x = element_blank(),
+        axis.title.y = element_blank(),
+        axis.text.y = element_blank(),
+        axis.ticks.y = element_blank()))
+```
+## Histograms
+```{r plot cal and val data histograms, echo = FALSE, warning = FALSE, message = FALSE}
+# number of calibration locations
+n_cal_samples = as.character(as.expression(paste0(
+  "italic(n) == ",
+  tbl_regmat_target %>%
+    filter(split %in% "Calibration") %>%
+    nrow())))
+# number of validation locations
+n_val_samples = as.character(as.expression(paste0(
+  "italic(n) == ",
+  tbl_regmat_target %>%
+    filter(split %in% "Validation") %>%
+    nrow())))
+tbl_n_samples <- tibble(split = c("Calibration", "Validation"),
+                        n_samples = c(n_cal_samples, n_val_samples))
+# histogram of all data split by dataset
+(p_hist_cal_val <- tbl_regmat_target %>%
+    ggplot(aes(pH_KCl, color = split)) +
+    geom_histogram(binwidth = 0.1, fill = "white") +
+    scale_y_continuous() +
+    scale_x_continuous(breaks = XY_BREAKS,
+                       limits = c(XY_MIN - 0.01 * XY_RANGE,
+                                  XY_MAX + 0.01 * XY_RANGE)) +
+    labs(x = TARGET_EXP, y = "Count") + 
+    facet_wrap(~ split) +
+    scale_color_manual(values = c("black", "blue")) +
+    labs(col = "Split") +
+    geom_text(data = tbl_n_samples,
+              aes(x = Inf, y = -Inf, label = n_samples),
+              size = 3, hjust = c(5.8, 6.6), vjust = c(-13, -13), parse = TRUE) +
+    theme_bw() +
+    theme(strip.background = element_blank(),
+          strip.text = element_blank(),
+          axis.title.x = element_blank(),
+          legend.position = "none"))
+```
+## Boxplots
+```{r plot cal and val data boxplots, echo = FALSE, warning = FALSE, message = FALSE}
+tbl_regmat_target <- tbl_regmat_target %>% 
+  filter(d_mid < 200) %>% 
+  mutate(d_gsm = cut(d_mid,
+                     breaks = c(0, 5, 15, 30, 60, 100, 200),
+                     labels = c("0-5", "5-15", "15-30", "30-60", "60-100", "100-200"),
+                     right = FALSE))
+# counts per split and depth increment
+tbl_calval_counts <- tbl_regmat_target %>% 
+    group_by(split, d_gsm) %>% 
+    mutate(count = n()) %>% 
+    distinct(count) %>% 
+    arrange(d_gsm, split) %>% 
+    mutate(n = as.character(as.expression(paste0("italic(n) == ", count))))
+# assign levels to depth increments so that they are in reverse order
+depth_order <- c("100-200", "60-100", "30-60", "15-30", "5-15", "0-5")
+tbl_regmat_target$d_gsm <- factor(x = tbl_regmat_target$d_gsm,
+                                 levels = depth_order)
+# boxplots split by dataset
+(p_boxplot_cal_val <- tbl_regmat_target %>%
+  ggplot(aes(x = pull(tbl_regmat_target, TARGET), y = d_gsm, color = split)) +
+  geom_boxplot(outlier.shape = 21) +
+  scale_color_manual(values = c("black", "blue")) +
+  facet_wrap(~ split) +
+  xlab(as.expression(paste(TARGET_EXP))) +
+  ylab(expression("Depth [cm]")) +
+  scale_x_continuous(breaks = XY_BREAKS,
+                     limits = c(XY_MIN - 0.01 * XY_RANGE,
+                                XY_MAX + 0.01 * XY_RANGE)) +
+  geom_text(data = tbl_calval_counts,
+            aes(x = Inf, y = -Inf, label = n),
+            size = 3,
+            hjust = rep(1.05, 12),
+            vjust = c(-13.25,  # cal 0-5
+                      -14,  # val 0-5
+                      -10.5,  # cal 5-15
+                      -11,  # val 5-15
+                      -8,  # cal 15-30
+                      -8,   # val 15-30
+                      -5.5,  # cal 30-60
+                      -5.5,  # val 30-60
+                      -2.75,  # cal 60-100
+                      -2.8,  # val 60-100
+                      -0.6, # cal 100-200
+                      -0.6),  # val 100-200
+            parse = TRUE) +
+  theme_bw() +
+  theme(strip.background = element_blank(),
+        strip.text = element_blank(),
+        legend.position = "none"))
+# save to disk
+# ggsave(filename = paste0("p_", TARGET, "_boxplots_cal_val_d.pdf"),
+#        plot = p_boxplot_cal_val,
+#        path = paste0("out/figs/explorative/", TARGET),
+#        width = 10, height = 5)
+```
+```{r combine plots using cowplot, echo = FALSE, warning = FALSE, message = FALSE}
+# combine descriptive plots using cowplot
+p_target_descriptive <- plot_grid(plot_grid(m_target_cal, m_target_val,
+                                            align = "hv", nrow = 1, ncol = 2),
+                                  p_hist_cal_val,
+                                  p_boxplot_cal_val,
+                                  nrow = 3, ncol = 1,
+                                  align = "v", axis = "l",
+                                  rel_heights = c(0.5, 0.25, 0.25),
+                                  # rel_heights = c(3, 1, 1),
+                                  rel_widths = c(1.2, 1, 1.2))
+# save to disk
+# ggsave(filename = paste0("p_", TARGET, "_descriptive.pdf"),
+#        plot = p_target_descriptive,
+#        path = paste0("out/figs/explorative/", TARGET),
+#        width = 8, height = 8)
+```
+## Year of measurement (age)
+```{r barchart of target measurement age, echo = FALSE, warning = FALSE, message = FALSE}
+# set order of datasets so we always show calibration first, then validation
+dataset_order <- c("Validation", "Calibration")
+tbl_regmat_target$split <- factor(x = tbl_regmat_target$split,
+                                 levels = dataset_order)
+# plot barchart of ages of soil measurements
+(p_target_age <- tbl_regmat_target %>% 
+    ggplot(aes(year, fill = split)) +
+    geom_histogram(stat = "count") +
+    labs(x = "Year", y = "Count") + 
+    scale_fill_manual(values = c( "blue", "black")) +
+    scale_x_continuous(breaks = seq(1960, 2010, 10)) +
+    theme_bw() +
+    theme(strip.background = element_blank(),
+          strip.text = element_blank(),
+          legend.title = element_blank()))
+# save to disk
+# ggsave(filename = paste0("p_", TARGET, "_age.pdf"),
+#        plot = p_target_age,
+#        path = paste0("out/figs/explorative/", TARGET),
+#        width = 8, height = 4)
+```
+## Maps of observations at calibration locations: top- & subsoil 
+```{r soil property point data, echo = FALSE, warning = FALSE}
+# Prepare sf object: upper-most sample of calibration dataset
+sf_regmat_target_cal_top <- tbl_regmat_target %>% 
+  filter(split %in% "Calibration") %>% 
+  group_by(X, Y) %>% 
+  slice(1L) %>% # slice by location to only get topsoil observations
+  ungroup %>% 
+  st_as_sf(., coords = c("X", "Y")) %>% # convert to spatial (sf)
+  st_set_crs(., "EPSG:28992") # set coordinate reference system of Netherlands
+# Prepare sf object: lower-most sample of calibration dataset
+sf_regmat_target_cal_sub <- tbl_regmat_target %>% 
+  filter(split %in% "Calibration") %>% 
+  group_by(X, Y) %>% 
+  slice(tail(row_number(), 1)) %>% # slice by lowest sample at each location
+  ungroup %>% 
+  st_as_sf(., coords = c("X", "Y")) %>% # convert to spatial (sf)
+  st_set_crs(., "EPSG:28992") # set coordinate reference system of Netherlands
+# gather number of locations for displaying on map
+n <- as.character(as.expression(paste0("italic(n) == ", nrow(sf_regmat_target_cal_top))))
+# define range in order to use identical color scheme for top- and subsoil
+min <- if (min(sf_regmat_target_cal_top$pH_KCl) < min(sf_regmat_target_cal_sub$pH_KCl)) {
+  min(sf_regmat_target_cal_top$pH_KCl)} else {min(sf_regmat_target_cal_sub$pH_KCl)}
+max <- if (max(sf_regmat_target_cal_top$pH_KCl) > max(sf_regmat_target_cal_sub$pH_KCl)) {
+  max(sf_regmat_target_cal_top$pH_KCl)} else {max(sf_regmat_target_cal_sub$pH_KCl)}
+# map target variable sampling locations for topsoil values
+(m_target_locations_cal_top <- ggplot() +
+  theme_bw() +
+  geom_sf(data = sf_NL_borders) +
+  geom_sf(data = sf_regmat_target_cal_top, aes(color = pH_KCl)) +
+  scale_fill_viridis_c(aesthetics = "color", option = "inferno",
+                       limits = c(min, max)) + # or plasma
+  geom_text(aes(x = Inf, y = -Inf, label = n), size = 3,
+            hjust = 13, vjust = -55, parse = TRUE) +
+  theme(legend.position = c(0.1, 0.8),
+        panel.border = element_blank(),
+        panel.background = element_blank(),
+        axis.title.x = element_blank(),
+        axis.text.x = element_blank(),
+        axis.ticks.x = element_blank(),
+        axis.title.y = element_blank(),
+        axis.text.y = element_blank(),
+        axis.ticks.y = element_blank()) +
+  labs(col = "Topsoil pH [KCl]") +
+  annotation_scale(location = "bl", width_hint = 0.5) +
+  annotation_north_arrow(location = "bl", which_north = "true", 
+                         pad_x = unit(0.75, "in"), pad_y = unit(0.5, "in"),
+                         style = north_arrow_fancy_orienteering))
+# ggsave("out/maps/explorative/pH_KCl/m_pH_KCl_topsoil_locations.pdf",
+#        m_target_locations_top,
+#         height = 8,
+#         width = 8)
+# map target variable sampling locations for subsoil values
+(m_target_locations_cal_sub <- ggplot() +
+  theme_bw() +
+  geom_sf(data = sf_NL_borders) +
+  geom_sf(data = sf_regmat_target_cal_sub, aes(color = pH_KCl)) +
+  scale_fill_viridis_c(aesthetics = "color", option = "inferno",
+                       limits = c(min, max)) + # or plasma
+  geom_text(aes(x = Inf, y = -Inf, label = n), size = 3,
+            hjust = 13, vjust = -55, parse = TRUE) +
+  theme(legend.position = c(0.1, 0.8),
+        panel.border = element_blank(),
+        panel.background = element_blank(),
+        axis.title.x = element_blank(),
+        axis.text.x = element_blank(),
+        axis.ticks.x = element_blank(),
+        axis.title.y = element_blank(),
+        axis.text.y = element_blank(),
+        axis.ticks.y = element_blank()) +
+  labs(col = "Subsoil pH [KCl]") +
+  annotation_scale(location = "bl", width_hint = 0.5) +
+  annotation_north_arrow(location = "bl", which_north = "true", 
+                         pad_x = unit(0.75, "in"), pad_y = unit(0.5, "in"),
+                         style = north_arrow_fancy_orienteering))
+# ggsave("out/maps/explorative/pH_KCl/m_pH_KCl_subsoil_locations.pdf",
+#        m_target_locations_sub,
+#        height = 8,
+#        width = 8)
+```
+## Geostatistical exploratory analysis
+### Variogram
+```{r variogram, echo = TRUE, warning = FALSE}
+library(gstat)
+# calculate variogram for topsoil observations
+vgm_target_cal_top <- variogram(pH_KCl ~ 1, sf_regmat_target_cal_top)
+# fit variogram to model
+vgm_fit_target_cal_top <- fit.variogram(vgm_target_cal_top, model = vgm("Exp"))
+# plot variogram model
+# pdf(file = paste0("out/figs/explorative/", TARGET, "/", TARGET, "_cal_variogram.pdf"), width = 8, height = 4)
+plot(vgm_target_cal_top, vgm_fit_target_cal_top)
+# dev.off()
+```
+### Nearest observations & distances
+```{r nearest distances, echo = TRUE, warning = FALSE}
+library(meteo)
+tbl_regmat_target_cal <- filter(tbl_regmat_target, split %in% "Calibration")
+# if you want to know nearest distances to sampling locations, use nabor::knn
+# knn1 <- nabor::knn(tbl_regmat_target_cal[,c("X","Y")], tbl_regmat_target_cal[,c("X","Y")], k=2)
+# dist_all <- as.vector(knn1$nn.dists)
+# dist_all <- dist_all[dist_all!=0]
+# 
+# summary(dist_all)
+# but for 3D DSM, more applicable to also specify threshold depth to search in
+# use 20cm because this was optimal tuning result for both NL and Edgeroi datasets
+df_near_obs_3D <- near.obs.soil(locations = as.data.frame(tbl_regmat_target_cal),
+                                locations.x.y.md = c("X", "Y", "d_mid"),
+                                observations = as.data.frame(tbl_regmat_target_cal),
+                                observations.x.y.md = c("X", "Y", "d_mid", "pH_KCl"),
+                                zcol = "pH_KCl",
+                                n.obs = 1,
+                                depth.range = 20,
+                                no.obs = "increase")
+summary(df_near_obs_3D)
+```
--- a/35_model_data_expl_analysis_pH_KCl.html
+++ b/35_model_data_expl_analysis_pH_KCl.html