From ee45cf20c338dd39971edade33758dc48cacfb8f Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Sat, 24 Feb 2024 09:37:32 -0800 Subject: [PATCH 01/22] Add `collapseHz()` --- NAMESPACE | 1 + R/collapseHz.R | 63 +++++++++++++++++++++++++++++++++++++++++++++++ man/collapseHz.Rd | 37 ++++++++++++++++++++++++++++ 3 files changed, 101 insertions(+) create mode 100644 R/collapseHz.R create mode 100644 man/collapseHz.Rd diff --git a/NAMESPACE b/NAMESPACE index 8e148c846..e9f8563ef 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -24,6 +24,7 @@ export(buntley.westin.index) export(checkHzDepthLogic) export(checkSPC) export(col2Munsell) +export(collapseHz) export(colorChart) export(colorContrast) export(colorContrastPlot) diff --git a/R/collapseHz.R b/R/collapseHz.R new file mode 100644 index 000000000..3c8d184f8 --- /dev/null +++ b/R/collapseHz.R @@ -0,0 +1,63 @@ +#' Collapse Horizons within Profiles Based on Pattern Matching +#' +#' Combines and aggregates layers by grouping adjacent horizons that match `pattern` in `hzdesgn`. Numeric properties are combined using the weighted average, and other properties are derived from the thickest horizon in each group. +#' +#' @param x A _SoilProfileCollection_ +#' @param pattern _character_. A regular expression pattern to match in `hzdesgn` column +#' @param hzdesgn _character_. Any character column containing horizon-level identifiers. Default is estimated using `guessHzDesgnName()`. +#' @param ignore.case _logical_. If `FALSE`, the pattern matching is case sensitive and if `TRUE`, case is ignored during matching. Default: `FALSE` +#' @param na.rm _logical_. If `TRUE` `NA` values are ignored when calculating min/max boundaries for each group and in weighted averages. If `FALSE` `NA` values are propagated to the result. 
Default: `FALSE` +#' +#' @return A _SoilProfileCollection_ +#' @export +#' +#' @examples +#' data(jacobs2000) +#' +#' a <- collapseHz(jacobs2000, c(`A` = "^A", `E` = "E", `Bt` = "[ABC]+t", `C` = "^C", `foo` = "bar")) +#' b <- jacobs2000 +#' profile_id(a) <- paste0(profile_id(a), "_collapse") +#' +#' plot(c(a, b), color = "clay") +collapseHz <- function(x, pattern, hzdesgn = guessHzDesgnName(x, required = TRUE), ignore.case = FALSE, na.rm = FALSE) { + idn <- idname(x) + hzd <- horizonDepths(x) + if (!is.null(names(pattern))) { + labels <- names(pattern) + pattern <- as.character(pattern) + } else { + pattern <- as.character(pattern) + labels <- pattern + } + for (p in seq(pattern)) { + h <- data.table::data.table(horizons(x)) + l <- grepl(pattern[p], h[[hzdesgn]], ignore.case = ignore.case) + if (any(l)) { + r <- rle(l) + g <- unlist(sapply(seq(r$lengths), function(i) rep(i, r$lengths[i]))) + res <- h[g %in% unique(g[l]), c(list(hzdeptnew = min(.SD[[hzd[1]]], na.rm = na.rm), + hzdepbnew = max(.SD[[hzd[2]]], na.rm = na.rm)), + lapply(.SD, \(x, top, bottom) { + if (is.numeric(x)) { + weighted.mean(x, bottom - top, na.rm = na.rm) + } else { + x[which.max(bottom - top)[1]] + } + }, .SD[[hzd[1]]], .SD[[hzd[2]]])), + by = g[g %in% unique(g[l])]] + res$g <- NULL + res[[hzdesgn]] <- labels[p] + h <- h[-which(g %in% unique(g[l])),] + h <- data.table::rbindlist(list(h, res), fill = TRUE) + h <- h[order(h[[idn]], h[[hzd[1]]]),] + hn <- !is.na(h$hzdeptnew) & !is.na(h$hzdepbnew) + h[[hzd[1]]][hn] <- h$hzdeptnew[hn] + h[[hzd[2]]][hn] <- h$hzdepbnew[hn] + h$hzdeptnew <- NULL + h$hzdepbnew <- NULL + replaceHorizons(x) <- h + } + } + x +} + diff --git a/man/collapseHz.Rd b/man/collapseHz.Rd new file mode 100644 index 000000000..1f461a1ca --- /dev/null +++ b/man/collapseHz.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/collapseHz.R +\name{collapseHz} +\alias{collapseHz} +\title{Collapse Horizons within Profiles Based on Pattern 
Matching} +\usage{ +collapseHz( + x, + pattern, + hzdesgn = guessHzDesgnName(x, required = TRUE), + ignore.case = FALSE, + na.rm = FALSE +) +} +\arguments{ +\item{x}{A SoilProfileCollection} + +\item{pattern}{character. A regular expression pattern to match in \code{hzdesgn} column} + +\item{hzdesgn}{character. Any character column containing horizon-level identifiers. Default is estimated using \code{guessHzDesgnName()}.} + +\item{ignore.case}{logical. If \code{FALSE}, the pattern matching is case sensitive and if \code{TRUE}, case is ignored during matching. Default: \code{FALSE}} + +\item{na.rm}{logical. If \code{TRUE} \code{NA} values are ignored when calculating min/max boundaries for each group and in weighted averages. If \code{FALSE} \code{NA} values are propagated to the result. Default: \code{FALSE}} +} +\description{ +Combines and aggregates layers by grouping adjacent horizons that match \code{pattern} in \code{hzdesgn}. Numeric properties are combined using the weighted average, and other properties are derived from the thickest horizon in each group. 
+} +\examples{ +data(jacobs2000) + +a <- collapseHz(jacobs2000, c(`A` = "^A", `E` = "E", `Bt` = "[ABC]+t", `C` = "^C", `foo` = "bar")) +b <- jacobs2000 +profile_id(a) <- paste0(profile_id(a), "_collapse") + +plot(c(a, b), color = "clay") +} From a5c22e32b0caeaa0c8d0d0f2a498cfa51ce4f527 Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Sat, 24 Feb 2024 09:41:45 -0800 Subject: [PATCH 02/22] Add test --- tests/testthat/test-collapseHz.R | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 tests/testthat/test-collapseHz.R diff --git a/tests/testthat/test-collapseHz.R b/tests/testthat/test-collapseHz.R new file mode 100644 index 000000000..33d598364 --- /dev/null +++ b/tests/testthat/test-collapseHz.R @@ -0,0 +1,10 @@ +test_that("collapseHz works", { + data("jacobs2000", package = "aqp") + x <- collapseHz(jacobs2000, c(`A` = "^A", + `E` = "E", + `Bt` = "[ABC]+t", + `C` = "^C", + `foo` = "bar")) + expect_equal(length(jacobs2000), length(x)) + expect_equal(nrow(x), 29) +}) From 71de71c52271613745acb49e54477ac872c1f535 Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Sat, 24 Feb 2024 10:00:28 -0800 Subject: [PATCH 03/22] fun --- R/collapseHz.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/collapseHz.R b/R/collapseHz.R index 3c8d184f8..1d4c45eeb 100644 --- a/R/collapseHz.R +++ b/R/collapseHz.R @@ -37,7 +37,7 @@ collapseHz <- function(x, pattern, hzdesgn = guessHzDesgnName(x, required = TRUE g <- unlist(sapply(seq(r$lengths), function(i) rep(i, r$lengths[i]))) res <- h[g %in% unique(g[l]), c(list(hzdeptnew = min(.SD[[hzd[1]]], na.rm = na.rm), hzdepbnew = max(.SD[[hzd[2]]], na.rm = na.rm)), - lapply(.SD, \(x, top, bottom) { + lapply(.SD, function(x, top, bottom) { if (is.numeric(x)) { weighted.mean(x, bottom - top, na.rm = na.rm) } else { From 7b309759838a2529589655a3d17fc06a96940429 Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Thu, 10 Oct 2024 16:06:07 -0700 Subject: [PATCH 04/22] implement custom matching function 
argument `FUN` --- R/collapseHz.R | 41 +++++++++++++++++++++++++++++++++-------- man/collapseHz.Rd | 43 ++++++++++++++++++++++++++++++++++--------- 2 files changed, 67 insertions(+), 17 deletions(-) diff --git a/R/collapseHz.R b/R/collapseHz.R index 1d4c45eeb..43c8f58a4 100644 --- a/R/collapseHz.R +++ b/R/collapseHz.R @@ -1,12 +1,28 @@ #' Collapse Horizons within Profiles Based on Pattern Matching #' -#' Combines and aggregates layers by grouping adjacent horizons that match `pattern` in `hzdesgn`. Numeric properties are combined using the weighted average, and other properties are derived from the thickest horizon in each group. +#' Combines and aggregates data for layers by grouping adjacent horizons that +#' match `pattern` in `hzdesgn`. Numeric properties are combined using the +#' weighted average, and other properties are derived from the thickest horizon +#' in each group. #' #' @param x A _SoilProfileCollection_ -#' @param pattern _character_. A regular expression pattern to match in `hzdesgn` column -#' @param hzdesgn _character_. Any character column containing horizon-level identifiers. Default is estimated using `guessHzDesgnName()`. -#' @param ignore.case _logical_. If `FALSE`, the pattern matching is case sensitive and if `TRUE`, case is ignored during matching. Default: `FALSE` -#' @param na.rm _logical_. If `TRUE` `NA` values are ignored when calculating min/max boundaries for each group and in weighted averages. If `FALSE` `NA` values are propagated to the result. Default: `FALSE` +#' @param pattern _character_. A regular expression pattern to match in `hzdesgn` +#' column +#' @param hzdesgn _character_. Any character column containing horizon-level +#' identifiers. Default is estimated using `guessHzDesgnName()`. +#' @param FUN _function_. A function that returns a _logical_ vector equal in +#' length to the number of horizons in `x`. See details. +#' @param ... Additional arguments passed to the matching function `FUN`. 
+#' @param na.rm _logical_. If `TRUE` `NA` values are ignored when calculating +#' min/max boundaries for each group and in weighted averages. If `FALSE` `NA` +#' values are propagated to the result. Default: `FALSE` +#' +#' @details +#' +#' If a custom function (`FUN`) is used, it should accept arbitrary additional +#' arguments via an ellipsis (`...`). It is not necessary to do anything with +#' arguments, but the result should match the number of horizons found in the +#' input SoilProfileCollection `x`. #' #' @return A _SoilProfileCollection_ #' @export @@ -14,12 +30,21 @@ #' @examples #' data(jacobs2000) #' -#' a <- collapseHz(jacobs2000, c(`A` = "^A", `E` = "E", `Bt` = "[ABC]+t", `C` = "^C", `foo` = "bar")) +#' a <- collapseHz(jacobs2000, c(`A` = "^A", +#' `E` = "E", +#' `Bt` = "[ABC]+t", +#' `C` = "^C", +#' `foo` = "bar")) #' b <- jacobs2000 #' profile_id(a) <- paste0(profile_id(a), "_collapse") #' #' plot(c(a, b), color = "clay") -collapseHz <- function(x, pattern, hzdesgn = guessHzDesgnName(x, required = TRUE), ignore.case = FALSE, na.rm = FALSE) { +collapseHz <- function(x, + pattern, + hzdesgn = hzdesgnname(x, required = TRUE), + FUN = function(x, pattern, hzdesgn, ...) grepl(pattern, x[[hzdesgn]], ignore.case = FALSE), + ..., + na.rm = FALSE) { idn <- idname(x) hzd <- horizonDepths(x) if (!is.null(names(pattern))) { @@ -31,7 +56,7 @@ collapseHz <- function(x, pattern, hzdesgn = guessHzDesgnName(x, required = TRUE } for (p in seq(pattern)) { h <- data.table::data.table(horizons(x)) - l <- grepl(pattern[p], h[[hzdesgn]], ignore.case = ignore.case) + l <- FUN(x, pattern = pattern[p], hzdesgn = hzdesgn, na.rm = na.rm, ...) 
if (any(l)) { r <- rle(l) g <- unlist(sapply(seq(r$lengths), function(i) rep(i, r$lengths[i]))) diff --git a/man/collapseHz.Rd b/man/collapseHz.Rd index 1f461a1ca..e03ed1c8a 100644 --- a/man/collapseHz.Rd +++ b/man/collapseHz.Rd @@ -7,29 +7,54 @@ collapseHz( x, pattern, - hzdesgn = guessHzDesgnName(x, required = TRUE), - ignore.case = FALSE, + hzdesgn = hzdesgnname(x, required = TRUE), + FUN = function(x, pattern, hzdesgn, ...) grepl(pattern, x[[hzdesgn]], ignore.case = + FALSE), + ..., na.rm = FALSE ) } \arguments{ -\item{x}{A SoilProfileCollection} +\item{x}{A \emph{SoilProfileCollection}} -\item{pattern}{character. A regular expression pattern to match in \code{hzdesgn} column} +\item{pattern}{\emph{character}. A regular expression pattern to match in \code{hzdesgn} +column} -\item{hzdesgn}{character. Any character column containing horizon-level identifiers. Default is estimated using \code{guessHzDesgnName()}.} +\item{hzdesgn}{\emph{character}. Any character column containing horizon-level +identifiers. Default is estimated using \code{guessHzDesgnName()}.} -\item{ignore.case}{logical. If \code{FALSE}, the pattern matching is case sensitive and if \code{TRUE}, case is ignored during matching. Default: \code{FALSE}} +\item{FUN}{\emph{function}. A function that returns a \emph{logical} vector equal in +length to the number of horizons in \code{x}. See details.} -\item{na.rm}{logical. If \code{TRUE} \code{NA} values are ignored when calculating min/max boundaries for each group and in weighted averages. If \code{FALSE} \code{NA} values are propagated to the result. Default: \code{FALSE}} +\item{...}{Additional arguments passed to the matching function \code{FUN}.} + +\item{na.rm}{\emph{logical}. If \code{TRUE} \code{NA} values are ignored when calculating +min/max boundaries for each group and in weighted averages. If \code{FALSE} \code{NA} +values are propagated to the result. 
Default: \code{FALSE}} +} +\value{ +A \emph{SoilProfileCollection} } \description{ -Combines and aggregates layers by grouping adjacent horizons that match \code{pattern} in \code{hzdesgn}. Numeric properties are combined using the weighted average, and other properties are derived from the thickest horizon in each group. +Combines and aggregates data for layers by grouping adjacent horizons that +match \code{pattern} in \code{hzdesgn}. Numeric properties are combined using the +weighted average, and other properties are derived from the thickest horizon +in each group. +} +\details{ +If a custom function (\code{FUN}) is used, it should accept arbitrary additional +arguments via an ellipsis (\code{...}). It is not necessary to do anything with +arguments, but the result should match the number of horizons found in the +input SoilProfileCollection \code{x}. } \examples{ data(jacobs2000) -a <- collapseHz(jacobs2000, c(`A` = "^A", `E` = "E", `Bt` = "[ABC]+t", `C` = "^C", `foo` = "bar")) +a <- collapseHz(jacobs2000, c(`A` = "^A", + `E` = "E", + `Bt` = "[ABC]+t", + `C` = "^C", + `foo` = "bar")) b <- jacobs2000 profile_id(a) <- paste0(profile_id(a), "_collapse") From 494f917822f4d5ef8cbc596478d9b3dac0b13507 Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Thu, 10 Oct 2024 16:55:34 -0700 Subject: [PATCH 05/22] update aggregation methods - implement ignoring specific numeric columns with `ignore_numerics - add argument`AGGFUN` for column name specific aggregations - default for categories is to returndominant condition rather than just thickest layer --- R/collapseHz.R | 70 ++++++++++++++++++++++++++++++++++++++++------- man/collapseHz.Rd | 34 +++++++++++++++++++++++ 2 files changed, 94 insertions(+), 10 deletions(-) diff --git a/R/collapseHz.R b/R/collapseHz.R index 43c8f58a4..538121938 100644 --- a/R/collapseHz.R +++ b/R/collapseHz.R @@ -13,6 +13,16 @@ #' @param FUN _function_. 
A function that returns a _logical_ vector equal in #' length to the number of horizons in `x`. See details. #' @param ... Additional arguments passed to the matching function `FUN`. +#' @param AGGFUN _list_. A named list containing custom aggregation functions. +#' List element names should match the column name that they transform. The +#' functions defined should take three arguments: `x` (a vector of horizon +#' property values), `top` (a vector of top depths), and `bottom` (a vector of +#' bottom depths). Default: `NULL` applies weighted.mean() to all numeric +#' columns not listed in `ignore_numerics` and takes the thickest value for all +#' other columns. +#' @param ignore_numerics _character_. Vector of column names that contain numeric +#' values which should _not_ be aggregated using `weighted.mean()`. For example, +#' soil color "value" and "chroma". #' @param na.rm _logical_. If `TRUE` `NA` values are ignored when calculating #' min/max boundaries for each group and in weighted averages. If `FALSE` `NA` #' values are propagated to the result. Default: `FALSE` @@ -39,11 +49,33 @@ #' profile_id(a) <- paste0(profile_id(a), "_collapse") #' #' plot(c(a, b), color = "clay") +#' +#' # custom aggregation function for matrix_color_munsell +#' +#' a2 <- collapseHz(jacobs2000, c(`A` = "^A", +#' `E` = "E", +#' `Bt` = "[ABC]+t", +#' `C` = "^C", +#' `foo` = "bar"), +#' AGGFUN = list(matrix_color_munsell = function(x, top, bottom) { +#' thk <- bottom - top +#' if (length(x) > 1) { +#' xord <- order(thk, decreasing = TRUE) +#' paste0(paste0(x[xord], " (t=", thk[xord], ")"), collapse = ", ") +#' } else x +#' }) +#' ) +#' profile_id(a2) <- paste0(profile_id(a), "_collapse_custom") +#' +#' unique(a2$matrix_color_munsell) +#' collapseHz <- function(x, pattern, hzdesgn = hzdesgnname(x, required = TRUE), FUN = function(x, pattern, hzdesgn, ...) 
grepl(pattern, x[[hzdesgn]], ignore.case = FALSE), ..., + AGGFUN = NULL, + ignore_numerics = NULL, na.rm = FALSE) { idn <- idname(x) hzd <- horizonDepths(x) @@ -60,16 +92,34 @@ collapseHz <- function(x, if (any(l)) { r <- rle(l) g <- unlist(sapply(seq(r$lengths), function(i) rep(i, r$lengths[i]))) - res <- h[g %in% unique(g[l]), c(list(hzdeptnew = min(.SD[[hzd[1]]], na.rm = na.rm), - hzdepbnew = max(.SD[[hzd[2]]], na.rm = na.rm)), - lapply(.SD, function(x, top, bottom) { - if (is.numeric(x)) { - weighted.mean(x, bottom - top, na.rm = na.rm) - } else { - x[which.max(bottom - top)[1]] - } - }, .SD[[hzd[1]]], .SD[[hzd[2]]])), - by = g[g %in% unique(g[l])]] + gidx <- g %in% unique(g[l]) + res <- h[gidx, c(list(hzdeptnew = min(.SD[[hzd[1]]], na.rm = na.rm), + hzdepbnew = max(.SD[[hzd[2]]], na.rm = na.rm)), + sapply(colnames(.SD)[!colnames(.SD) %in% hzd], + function(n, top, bottom) { + v <- .SD[[n]] + if (n %in% names(AGGFUN)) { + + # custom aggregation function (column name specific) + AGGFUN[[n]](v, top, bottom) + + } else if (!n %in% ignore_numerics && is.numeric(x)) { + + # weighted average by thickness (numerics not in exclusion list) + weighted.mean(v, bottom - top, na.rm = na.rm) + + } else { + # take thickest value + # v[which.max(bottom - top)[1]] + + # take dominant condition (based on sum of thickness) + cond <- aggregate(bottom - top, by = list(v), sum, na.rm = na.rm) + cond[[1]][which.max(cond[[2]])[1]] + } + }, + top = .SD[[hzd[1]]], + bottom = .SD[[hzd[2]]])), + by = g[gidx]] res$g <- NULL res[[hzdesgn]] <- labels[p] h <- h[-which(g %in% unique(g[l])),] diff --git a/man/collapseHz.Rd b/man/collapseHz.Rd index e03ed1c8a..839a3d77a 100644 --- a/man/collapseHz.Rd +++ b/man/collapseHz.Rd @@ -11,6 +11,8 @@ collapseHz( FUN = function(x, pattern, hzdesgn, ...) grepl(pattern, x[[hzdesgn]], ignore.case = FALSE), ..., + AGGFUN = NULL, + ignore_numerics = NULL, na.rm = FALSE ) } @@ -28,6 +30,18 @@ length to the number of horizons in \code{x}. 
See details.} \item{...}{Additional arguments passed to the matching function \code{FUN}.} +\item{AGGFUN}{\emph{list}. A named list containing custom aggregation functions. +List element names should match the column name that they transform. The +functions defined should take three arguments: \code{x} (a vector of horizon +property values), \code{top} (a vector of top depths), and \code{bottom} (a vector of +bottom depths). Default: \code{NULL} applies weighted.mean() to all numeric +columns not listed in \code{ignore_numerics} and takes the thickest value for all +other columns.} + +\item{ignore_numerics}{\emph{character}. Vector of column names that contain numeric +values which should \emph{not} be aggregated using \code{weighted.mean()}. For example, +soil color "value" and "chroma".} + \item{na.rm}{\emph{logical}. If \code{TRUE} \code{NA} values are ignored when calculating min/max boundaries for each group and in weighted averages. If \code{FALSE} \code{NA} values are propagated to the result. 
Default: \code{FALSE}} @@ -59,4 +73,24 @@ b <- jacobs2000 profile_id(a) <- paste0(profile_id(a), "_collapse") plot(c(a, b), color = "clay") + +# custom aggregation function for matrix_color_munsell + +a2 <- collapseHz(jacobs2000, c(`A` = "^A", + `E` = "E", + `Bt` = "[ABC]+t", + `C` = "^C", + `foo` = "bar"), + AGGFUN = list(matrix_color_munsell = function(x, top, bottom) { + thk <- bottom - top + if (length(x) > 1) { + xord <- order(thk, decreasing = TRUE) + paste0(paste0(x[xord], " (t=", thk[xord], ")"), collapse = ", ") + } else x + }) + ) +profile_id(a2) <- paste0(profile_id(a), "_collapse_custom") + +unique(a2$matrix_color_munsell) + } From 5339a61e024b057b55f4eca9a1da82c13808460f Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Thu, 10 Oct 2024 19:16:55 -0700 Subject: [PATCH 06/22] update for using existing `GHL()`, refine logic for multiple matches per profile, add example --- R/collapseHz.R | 170 +++++++++++++++++++++++++++++----------------- man/collapseHz.Rd | 80 ++++++++++++++-------- 2 files changed, 158 insertions(+), 92 deletions(-) diff --git a/R/collapseHz.R b/R/collapseHz.R index 538121938..a41c0d4a7 100644 --- a/R/collapseHz.R +++ b/R/collapseHz.R @@ -1,76 +1,97 @@ #' Collapse Horizons within Profiles Based on Pattern Matching #' #' Combines and aggregates data for layers by grouping adjacent horizons that -#' match `pattern` in `hzdesgn`. Numeric properties are combined using the +#' match `pattern` in `hzdesgn`. Numeric properties are combined using the #' weighted average, and other properties are derived from the thickest horizon #' in each group. #' #' @param x A _SoilProfileCollection_ -#' @param pattern _character_. A regular expression pattern to match in `hzdesgn` -#' column +#' @param pattern _character_. A regular expression pattern to match in +#' `hzdesgn` column. Default #' @param hzdesgn _character_. Any character column containing horizon-level #' identifiers. Default is estimated using `guessHzDesgnName()`. 
#' @param FUN _function_. A function that returns a _logical_ vector equal in #' length to the number of horizons in `x`. See details. #' @param ... Additional arguments passed to the matching function `FUN`. -#' @param AGGFUN _list_. A named list containing custom aggregation functions. -#' List element names should match the column name that they transform. The -#' functions defined should take three arguments: `x` (a vector of horizon -#' property values), `top` (a vector of top depths), and `bottom` (a vector of +#' @param AGGFUN _list_. A named list containing custom aggregation functions. +#' List element names should match the column name that they transform. The +#' functions defined should take three arguments: `x` (a vector of horizon +#' property values), `top` (a vector of top depths), and `bottom` (a vector of #' bottom depths). Default: `NULL` applies weighted.mean() to all numeric #' columns not listed in `ignore_numerics` and takes the thickest value for all #' other columns. -#' @param ignore_numerics _character_. Vector of column names that contain numeric -#' values which should _not_ be aggregated using `weighted.mean()`. For example, -#' soil color "value" and "chroma". +#' @param ignore_numerics _character_. Vector of column names that contain +#' numeric values which should _not_ be aggregated using `weighted.mean()`. +#' For example, soil color "value" and "chroma". #' @param na.rm _logical_. If `TRUE` `NA` values are ignored when calculating #' min/max boundaries for each group and in weighted averages. If `FALSE` `NA` #' values are propagated to the result. Default: `FALSE` -#' +#' #' @details -#' +#' #' If a custom function (`FUN`) is used, it should accept arbitrary additional #' arguments via an ellipsis (`...`). It is not necessary to do anything with #' arguments, but the result should match the number of horizons found in the #' input SoilProfileCollection `x`. 
-#' +#' #' @return A _SoilProfileCollection_ #' @export #' #' @examples #' data(jacobs2000) #' -#' a <- collapseHz(jacobs2000, c(`A` = "^A", -#' `E` = "E", -#' `Bt` = "[ABC]+t", -#' `C` = "^C", -#' `foo` = "bar")) -#' b <- jacobs2000 -#' profile_id(a) <- paste0(profile_id(a), "_collapse") -#' -#' plot(c(a, b), color = "clay") +#' # use existing generalized horizon labels +#' new_labels <- c("A", "E", "Bt", "Bh", "C") +#' patterns <- c("A", "E", "B.*t", "B.*h", "C") +#' +#' # calculate a new SPC with genhz column based on patterns +#' jacobs2000_gen <- generalizeHz(jacobs2000, new = new_labels, pattern = patterns) #' -#' # custom aggregation function for matrix_color_munsell +#' # collapse that SPC based on genhz +#' i <- collapseHz(jacobs2000_gen, hzdesgn = "genhz") #' -#' a2 <- collapseHz(jacobs2000, c(`A` = "^A", -#' `E` = "E", -#' `Bt` = "[ABC]+t", -#' `C` = "^C", -#' `foo` = "bar"), -#' AGGFUN = list(matrix_color_munsell = function(x, top, bottom) { -#' thk <- bottom - top -#' if (length(x) > 1) { -#' xord <- order(thk, decreasing = TRUE) -#' paste0(paste0(x[xord], " (t=", thk[xord], ")"), collapse = ", ") -#' } else x -#' }) -#' ) -#' profile_id(a2) <- paste0(profile_id(a), "_collapse_custom") +#' profile_id(i) <- paste0(profile_id(i), "_collapse") +#' plot(c(i, jacobs2000), color = "genhz", name = "name", name.style = "center-center", cex.names = 1) +#' +#' # custom pattern argument +#' +#' j <- collapseHz(jacobs2000, +#' c( +#' `A` = "^A", +#' `E` = "E", +#' `Bt` = "[ABC]+t", +#' `C` = "^C", +#' `foo` = "bar" +#' )) +#' profile_id(j) <- paste0(profile_id(j), "_collapse") +#' plot(c(j, jacobs2000), color = "clay") #' -#' unique(a2$matrix_color_munsell) +#' # custom aggregation function for matrix_color_munsell +#' k <- collapseHz(jacobs2000, +#' pattern = c( +#' `A` = "^A", +#' `E` = "E", +#' `Bt` = "[ABC]+t", +#' `C` = "^C", +#' `foo` = "bar" +#' ), +#' AGGFUN = list( +#' matrix_color_munsell = function(x, top, bottom) { +#' thk <- bottom - top +#' if 
(length(x) > 1) { +#' xord <- order(thk, decreasing = TRUE) +#' paste0(paste0(x[xord], " (t=", thk[xord], ")"), collapse = ", ") +#' } else +#' x +#' } +#' ) +#' ) +#' profile_id(k) <- paste0(profile_id(k), "_collapse_custom") #' +#' unique(k$matrix_color_munsell) +# collapseHz <- function(x, - pattern, + pattern = NULL, hzdesgn = hzdesgnname(x, required = TRUE), FUN = function(x, pattern, hzdesgn, ...) grepl(pattern, x[[hzdesgn]], ignore.case = FALSE), ..., @@ -79,6 +100,11 @@ collapseHz <- function(x, na.rm = FALSE) { idn <- idname(x) hzd <- horizonDepths(x) + + if (is.null(pattern)) { + pattern <- unique(as.character(x[[GHL(x, required = TRUE)]])) + } + if (!is.null(names(pattern))) { labels <- names(pattern) pattern <- as.character(pattern) @@ -92,44 +118,64 @@ collapseHz <- function(x, if (any(l)) { r <- rle(l) g <- unlist(sapply(seq(r$lengths), function(i) rep(i, r$lengths[i]))) - gidx <- g %in% unique(g[l]) - res <- h[gidx, c(list(hzdeptnew = min(.SD[[hzd[1]]], na.rm = na.rm), - hzdepbnew = max(.SD[[hzd[2]]], na.rm = na.rm)), + hidx <- unlist(sapply(seq(r$lengths), function(i) if (r$lengths[i] == 1) TRUE else rep(FALSE, r$lengths[i]))) & l + gidx <- g %in% unique(g[l]) & !hidx + + res <- h[gidx, c(list(hzdeptnew = suppressWarnings(min(.SD[[hzd[1]]], na.rm = na.rm)), + hzdepbnew = suppressWarnings(max(.SD[[hzd[2]]], na.rm = na.rm))), sapply(colnames(.SD)[!colnames(.SD) %in% hzd], function(n, top, bottom) { v <- .SD[[n]] - if (n %in% names(AGGFUN)) { - - # custom aggregation function (column name specific) - AGGFUN[[n]](v, top, bottom) - - } else if (!n %in% ignore_numerics && is.numeric(x)) { - - # weighted average by thickness (numerics not in exclusion list) - weighted.mean(v, bottom - top, na.rm = na.rm) - + if (length(v) > 1) { + if (n %in% names(AGGFUN)) { + + # custom aggregation function (column name specific) + AGGFUN[[n]](v, top, bottom) + + } else if (!n %in% ignore_numerics && is.numeric(x)) { + + # weighted average by thickness (numerics not in 
exclusion list) + weighted.mean(v, bottom - top, na.rm = na.rm) + + } else { + # take thickest value + # v[which.max(bottom - top)[1]] + + # take dominant condition (based on sum of thickness) + cond <- aggregate(bottom - top, by = list(v), sum, na.rm = na.rm) + cond[[1]][which.max(cond[[2]])[1]] + } } else { - # take thickest value - # v[which.max(bottom - top)[1]] - - # take dominant condition (based on sum of thickness) - cond <- aggregate(bottom - top, by = list(v), sum, na.rm = na.rm) - cond[[1]][which.max(cond[[2]])[1]] + v } }, top = .SD[[hzd[1]]], bottom = .SD[[hzd[2]]])), by = g[gidx]] + res$g <- NULL - res[[hzdesgn]] <- labels[p] - h <- h[-which(g %in% unique(g[l])),] - h <- data.table::rbindlist(list(h, res), fill = TRUE) - h <- h[order(h[[idn]], h[[hzd[1]]]),] + + res2 <- h[hidx & l, ] + res2$hzdeptnew <- res2[[hzd[1]]] + res2$hzdepbnew <- res2[[hzd[2]]] + res2[[hzd[1]]] <- NULL + res2[[hzd[2]]] <- NULL + + res3 <- rbind(res, res2) + + res3[[hzdesgn]] <- labels[p] + + h <- h[-which(g %in% unique(g[l]) | hidx),] + h <- data.table::rbindlist(list(h, res3), fill = TRUE) + hn <- !is.na(h$hzdeptnew) & !is.na(h$hzdepbnew) h[[hzd[1]]][hn] <- h$hzdeptnew[hn] h[[hzd[2]]][hn] <- h$hzdepbnew[hn] h$hzdeptnew <- NULL h$hzdepbnew <- NULL + + h <- h[order(h[[idn]], h[[hzd[1]]]),] + replaceHorizons(x) <- h } } diff --git a/man/collapseHz.Rd b/man/collapseHz.Rd index 839a3d77a..a7093df8b 100644 --- a/man/collapseHz.Rd +++ b/man/collapseHz.Rd @@ -6,7 +6,7 @@ \usage{ collapseHz( x, - pattern, + pattern = NULL, hzdesgn = hzdesgnname(x, required = TRUE), FUN = function(x, pattern, hzdesgn, ...) grepl(pattern, x[[hzdesgn]], ignore.case = FALSE), @@ -19,8 +19,8 @@ collapseHz( \arguments{ \item{x}{A \emph{SoilProfileCollection}} -\item{pattern}{\emph{character}. A regular expression pattern to match in \code{hzdesgn} -column} +\item{pattern}{\emph{character}. A regular expression pattern to match in +\code{hzdesgn} column. Default} \item{hzdesgn}{\emph{character}. 
Any character column containing horizon-level identifiers. Default is estimated using \code{guessHzDesgnName()}.} @@ -38,9 +38,9 @@ bottom depths). Default: \code{NULL} applies weighted.mean() to all numeric columns not listed in \code{ignore_numerics} and takes the thickest value for all other columns.} -\item{ignore_numerics}{\emph{character}. Vector of column names that contain numeric -values which should \emph{not} be aggregated using \code{weighted.mean()}. For example, -soil color "value" and "chroma".} +\item{ignore_numerics}{\emph{character}. Vector of column names that contain +numeric values which should \emph{not} be aggregated using \code{weighted.mean()}. +For example, soil color "value" and "chroma".} \item{na.rm}{\emph{logical}. If \code{TRUE} \code{NA} values are ignored when calculating min/max boundaries for each group and in weighted averages. If \code{FALSE} \code{NA} @@ -64,33 +64,53 @@ input SoilProfileCollection \code{x}. \examples{ data(jacobs2000) -a <- collapseHz(jacobs2000, c(`A` = "^A", - `E` = "E", - `Bt` = "[ABC]+t", - `C` = "^C", - `foo` = "bar")) -b <- jacobs2000 -profile_id(a) <- paste0(profile_id(a), "_collapse") +# use existing generalized horizon labels +new_labels <- c("A", "E", "Bt", "Bh", "C") +patterns <- c("A", "E", "B.*t", "B.*h", "C") -plot(c(a, b), color = "clay") +# calculate a new SPC with genhz column based on patterns +jacobs2000_gen <- generalizeHz(jacobs2000, new = new_labels, pattern = patterns) -# custom aggregation function for matrix_color_munsell +# collapse that SPC based on genhz +i <- collapseHz(jacobs2000_gen, hzdesgn = "genhz") -a2 <- collapseHz(jacobs2000, c(`A` = "^A", - `E` = "E", - `Bt` = "[ABC]+t", - `C` = "^C", - `foo` = "bar"), - AGGFUN = list(matrix_color_munsell = function(x, top, bottom) { - thk <- bottom - top - if (length(x) > 1) { - xord <- order(thk, decreasing = TRUE) - paste0(paste0(x[xord], " (t=", thk[xord], ")"), collapse = ", ") - } else x - }) - ) -profile_id(a2) <- 
paste0(profile_id(a), "_collapse_custom") +profile_id(i) <- paste0(profile_id(i), "_collapse") +plot(c(i, jacobs2000), color = "genhz", name = "name", name.style = "center-center", cex.names = 1) + +# custom pattern argument + +j <- collapseHz(jacobs2000, + c( + `A` = "^A", + `E` = "E", + `Bt` = "[ABC]+t", + `C` = "^C", + `foo` = "bar" + )) +profile_id(j) <- paste0(profile_id(j), "_collapse") +plot(c(j, jacobs2000), color = "clay") -unique(a2$matrix_color_munsell) +# custom aggregation function for matrix_color_munsell +k <- collapseHz(jacobs2000, + pattern = c( + `A` = "^A", + `E` = "E", + `Bt` = "[ABC]+t", + `C` = "^C", + `foo` = "bar" + ), + AGGFUN = list( + matrix_color_munsell = function(x, top, bottom) { + thk <- bottom - top + if (length(x) > 1) { + xord <- order(thk, decreasing = TRUE) + paste0(paste0(x[xord], " (t=", thk[xord], ")"), collapse = ", ") + } else + x + } + ) + ) +profile_id(k) <- paste0(profile_id(k), "_collapse_custom") +unique(k$matrix_color_munsell) } From 729e7c33c61c919b1258a1a2a9c94edf5ffec1c8 Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Thu, 10 Oct 2024 19:20:12 -0700 Subject: [PATCH 07/22] add test + docs --- R/collapseHz.R | 7 ++----- man/collapseHz.Rd | 7 ++----- tests/testthat/test-collapseHz.R | 15 +++++++++++++++ 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/R/collapseHz.R b/R/collapseHz.R index a41c0d4a7..9db153b5b 100644 --- a/R/collapseHz.R +++ b/R/collapseHz.R @@ -40,21 +40,18 @@ #' @examples #' data(jacobs2000) #' -#' # use existing generalized horizon labels +#' # calculate a new SPC with genhz column based on patterns #' new_labels <- c("A", "E", "Bt", "Bh", "C") #' patterns <- c("A", "E", "B.*t", "B.*h", "C") -#' -#' # calculate a new SPC with genhz column based on patterns #' jacobs2000_gen <- generalizeHz(jacobs2000, new = new_labels, pattern = patterns) #' -#' # collapse that SPC based on genhz +#' # use existing generalized horizon labels #' i <- collapseHz(jacobs2000_gen, hzdesgn = "genhz") 
#' #' profile_id(i) <- paste0(profile_id(i), "_collapse") #' plot(c(i, jacobs2000), color = "genhz", name = "name", name.style = "center-center", cex.names = 1) #' #' # custom pattern argument -#' #' j <- collapseHz(jacobs2000, #' c( #' `A` = "^A", diff --git a/man/collapseHz.Rd b/man/collapseHz.Rd index a7093df8b..fd98894ad 100644 --- a/man/collapseHz.Rd +++ b/man/collapseHz.Rd @@ -64,21 +64,18 @@ input SoilProfileCollection \code{x}. \examples{ data(jacobs2000) -# use existing generalized horizon labels +# calculate a new SPC with genhz column based on patterns new_labels <- c("A", "E", "Bt", "Bh", "C") patterns <- c("A", "E", "B.*t", "B.*h", "C") - -# calculate a new SPC with genhz column based on patterns jacobs2000_gen <- generalizeHz(jacobs2000, new = new_labels, pattern = patterns) -# collapse that SPC based on genhz +# use existing generalized horizon labels i <- collapseHz(jacobs2000_gen, hzdesgn = "genhz") profile_id(i) <- paste0(profile_id(i), "_collapse") plot(c(i, jacobs2000), color = "genhz", name = "name", name.style = "center-center", cex.names = 1) # custom pattern argument - j <- collapseHz(jacobs2000, c( `A` = "^A", diff --git a/tests/testthat/test-collapseHz.R b/tests/testthat/test-collapseHz.R index 33d598364..222b582e0 100644 --- a/tests/testthat/test-collapseHz.R +++ b/tests/testthat/test-collapseHz.R @@ -1,5 +1,20 @@ test_that("collapseHz works", { data("jacobs2000", package = "aqp") + .BOTTOM <- NULL + + # use existing generalized horizon labels + new_labels <- c("A", "E", "Bt", "Bh", "C") + patterns <- c("A", "E", "B.*t", "B.*h", "C") + + # calculate a new SPC with genhz column based on patterns + jacobs2000_gen <- generalizeHz(jacobs2000, new = new_labels, pattern = patterns) + + # collapse that SPC based on genhz + i <- collapseHz(jacobs2000_gen, hzdesgn = "genhz") + expect_equal(length(jacobs2000), length(i)) + expect_equal(nrow(i), 26) + expect_equal(i[7, , .BOTTOM], c(15, 41, 61, 132, 140, 152)) + x <- collapseHz(jacobs2000, c(`A` = 
"^A", `E` = "E", `Bt` = "[ABC]+t", From 17ed9490aec267dcd57224ab84668cf7c441a865 Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Thu, 10 Oct 2024 20:29:02 -0700 Subject: [PATCH 08/22] allow multiple summary statistics in custom aggregation --- R/collapseHz.R | 104 +++++++++++++++++++++++++++++++++++++--------- man/collapseHz.Rd | 27 ++++++++++++ 2 files changed, 112 insertions(+), 19 deletions(-) diff --git a/R/collapseHz.R b/R/collapseHz.R index 9db153b5b..63d6b5e6b 100644 --- a/R/collapseHz.R +++ b/R/collapseHz.R @@ -86,6 +86,34 @@ #' profile_id(k) <- paste0(profile_id(k), "_collapse_custom") #' #' unique(k$matrix_color_munsell) +#' +#' # custom aggregation function for matrix_color_munsell (returns data.frame) +#' m <- collapseHz(jacobs2000, +#' pattern = c( +#' `A` = "^A", +#' `E` = "E", +#' `Bt` = "[ABC]+t", +#' `C` = "^C", +#' `foo` = "bar" +#' ), +#' AGGFUN = list( +#' matrix_color_munsell = function(x, top, bottom) { +#' thk <- bottom - top +#' if (length(x) > 1) { +#' xord <- order(thk, decreasing = TRUE) +#' data.frame(matrix_color_munsell = paste0(x, collapse = ";"), +#' n_matrix_color = length(x)) +#' } else { +#' data.frame(matrix_color_munsell = x, +#' n_matrix_color = length(x)) +#' } +#' } +#' ) +#' ) +#' profile_id(m) <- paste0(profile_id(m), "_collapse_custom") +#' +#' m$matrix_color_munsell.n_matrix_color +# # collapseHz <- function(x, pattern = NULL, @@ -98,46 +126,58 @@ collapseHz <- function(x, idn <- idname(x) hzd <- horizonDepths(x) - if (is.null(pattern)) { - pattern <- unique(as.character(x[[GHL(x, required = TRUE)]])) - } + # use exact match of existing genhz labels as default in lieu of pattern + if (is.null(pattern) & missing(matchcolumn)) { + existing_genhz <- unique(as.character(x[[GHL(x, required = TRUE)]])) + pattern <- paste0("^", existing_genhz, "$") + labels <- existing_genhz + } else if (!missing(matchcolumn)) { + pattern <- NA + } + # if a named vector of patterns is given, use the names as new labels if 
(!is.null(names(pattern))) { labels <- names(pattern) pattern <- as.character(pattern) } else { + # otherwise, the patterns and labels are the same pattern <- as.character(pattern) labels <- pattern } + + # iterate over patterns for (p in seq(pattern)) { + h <- data.table::data.table(horizons(x)) + + # calculate matches l <- FUN(x, pattern = pattern[p], hzdesgn = hzdesgn, na.rm = na.rm, ...) + if (any(l)) { r <- rle(l) g <- unlist(sapply(seq(r$lengths), function(i) rep(i, r$lengths[i]))) hidx <- unlist(sapply(seq(r$lengths), function(i) if (r$lengths[i] == 1) TRUE else rep(FALSE, r$lengths[i]))) & l gidx <- g %in% unique(g[l]) & !hidx + naf <- names(AGGFUN) + # iterate over sets of layers needing aggregation within each matching group res <- h[gidx, c(list(hzdeptnew = suppressWarnings(min(.SD[[hzd[1]]], na.rm = na.rm)), hzdepbnew = suppressWarnings(max(.SD[[hzd[2]]], na.rm = na.rm))), - sapply(colnames(.SD)[!colnames(.SD) %in% hzd], + + # process numeric depth weighted averages w/ dominant condition otherwise + sapply(colnames(.SD)[!colnames(.SD) %in% c(hzd, naf)], function(n, top, bottom) { v <- .SD[[n]] if (length(v) > 1) { - if (n %in% names(AGGFUN)) { - - # custom aggregation function (column name specific) - AGGFUN[[n]](v, top, bottom) - - } else if (!n %in% ignore_numerics && is.numeric(x)) { - + if (!n %in% ignore_numerics && is.numeric(x)) { + # weighted average by thickness (numerics not in exclusion list) weighted.mean(v, bottom - top, na.rm = na.rm) - + } else { # take thickest value # v[which.max(bottom - top)[1]] - + # take dominant condition (based on sum of thickness) cond <- aggregate(bottom - top, by = list(v), sum, na.rm = na.rm) cond[[1]][which.max(cond[[2]])[1]] @@ -145,34 +185,60 @@ collapseHz <- function(x, } else { v } - }, - top = .SD[[hzd[1]]], - bottom = .SD[[hzd[2]]])), + }, + top = .SD[[hzd[1]]], + bottom = .SD[[hzd[2]]]), + + # process custom aggregation functions (may return data.frames) + do.call('c', 
lapply(colnames(.SD)[colnames(.SD) %in% naf], + function(n, top, bottom) { + out <- AGGFUN[[n]](.SD[[n]], top, bottom) + if (!is.data.frame(out)) { + out <- data.frame(out) + colnames(out) <- n + } else { + colnames(out) <- paste0(n, ".", colnames(out)) + } + out + }, + top = .SD[[hzd[1]]], + bottom = .SD[[hzd[2]]]))), by = g[gidx]] - res$g <- NULL + # allow for replacing values as well as adding new values with data.frame AGGFUN + test1.idx <- na.omit(match(colnames(res), paste0(naf, ".", naf))) + test2.idx <- na.omit(match(paste0(naf, ".", naf), colnames(res))) + colnames(res)[test2.idx] <- naf[test1.idx] + # remove grouping ID + res$g <- NULL + + # determine matches that are only a single layer (no aggregation applied) res2 <- h[hidx & l, ] res2$hzdeptnew <- res2[[hzd[1]]] res2$hzdepbnew <- res2[[hzd[2]]] res2[[hzd[1]]] <- NULL res2[[hzd[2]]] <- NULL - res3 <- rbind(res, res2) - + # combine matches + res3 <- data.table::rbindlist(list(res, res2), fill = TRUE) res3[[hzdesgn]] <- labels[p] + # combine matches with horizons that did not match h <- h[-which(g %in% unique(g[l]) | hidx),] h <- data.table::rbindlist(list(h, res3), fill = TRUE) + # replace depths hn <- !is.na(h$hzdeptnew) & !is.na(h$hzdepbnew) h[[hzd[1]]][hn] <- h$hzdeptnew[hn] h[[hzd[2]]][hn] <- h$hzdepbnew[hn] h$hzdeptnew <- NULL h$hzdepbnew <- NULL + # sort horizons by id name and top depth h <- h[order(h[[idn]], h[[hzd[1]]]),] + # replace horizons in parent SPC replaceHorizons(x) <- h } } diff --git a/man/collapseHz.Rd b/man/collapseHz.Rd index fd98894ad..32ea840dd 100644 --- a/man/collapseHz.Rd +++ b/man/collapseHz.Rd @@ -110,4 +110,31 @@ k <- collapseHz(jacobs2000, profile_id(k) <- paste0(profile_id(k), "_collapse_custom") unique(k$matrix_color_munsell) + +# custom aggregation function for matrix_color_munsell (returns data.frame) +m <- collapseHz(jacobs2000, + pattern = c( + `A` = "^A", + `E` = "E", + `Bt` = "[ABC]+t", + `C` = "^C", + `foo` = "bar" + ), + AGGFUN = list( + matrix_color_munsell = 
function(x, top, bottom) { + thk <- bottom - top + if (length(x) > 1) { + xord <- order(thk, decreasing = TRUE) + data.frame(matrix_color_munsell = paste0(x, collapse = ";"), + n_matrix_color = length(x)) + } else { + data.frame(matrix_color_munsell = x, + n_matrix_color = length(x)) + } + } + ) + ) +profile_id(m) <- paste0(profile_id(m), "_collapse_custom") + +m$matrix_color_munsell.n_matrix_color } From 21c3ad83781ff2e35415d6dd6ab985cfa45c2bf4 Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Thu, 10 Oct 2024 22:02:40 -0700 Subject: [PATCH 09/22] implement simplified route for existing group IDs or labels --- R/collapseHz.R | 44 +++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/R/collapseHz.R b/R/collapseHz.R index 63d6b5e6b..27ecafdeb 100644 --- a/R/collapseHz.R +++ b/R/collapseHz.R @@ -7,9 +7,12 @@ #' #' @param x A _SoilProfileCollection_ #' @param pattern _character_. A regular expression pattern to match in -#' `hzdesgn` column. Default +#' `hzdesgn` column. Default: `NULL`. +#' @param by _character_. A column name specifying horizons that should be +#' combined. Aggregation will be applied to adjacent groups of layers within +#' profiles that have the same value in `by`. #' @param hzdesgn _character_. Any character column containing horizon-level -#' identifiers. Default is estimated using `guessHzDesgnName()`. +#' identifiers. Default: `hzdesgnname(x, required = TRUE)()`. #' @param FUN _function_. A function that returns a _logical_ vector equal in #' length to the number of horizons in `x`. See details. #' @param ... Additional arguments passed to the matching function `FUN`. 
@@ -46,7 +49,7 @@ #' jacobs2000_gen <- generalizeHz(jacobs2000, new = new_labels, pattern = patterns) #' #' # use existing generalized horizon labels -#' i <- collapseHz(jacobs2000_gen, hzdesgn = "genhz") +#' i <- collapseHz(jacobs2000_gen, by = "genhz") #' #' profile_id(i) <- paste0(profile_id(i), "_collapse") #' plot(c(i, jacobs2000), color = "genhz", name = "name", name.style = "center-center", cex.names = 1) @@ -113,10 +116,9 @@ #' profile_id(m) <- paste0(profile_id(m), "_collapse_custom") #' #' m$matrix_color_munsell.n_matrix_color -# -# collapseHz <- function(x, pattern = NULL, + by = NULL, hzdesgn = hzdesgnname(x, required = TRUE), FUN = function(x, pattern, hzdesgn, ...) grepl(pattern, x[[hzdesgn]], ignore.case = FALSE), ..., @@ -127,11 +129,11 @@ collapseHz <- function(x, hzd <- horizonDepths(x) # use exact match of existing genhz labels as default in lieu of pattern - if (is.null(pattern) & missing(matchcolumn)) { - existing_genhz <- unique(as.character(x[[GHL(x, required = TRUE)]])) - pattern <- paste0("^", existing_genhz, "$") - labels <- existing_genhz - } else if (!missing(matchcolumn)) { + if (is.null(pattern) & missing(by)) { + by <- GHL(x, required = TRUE) + } + + if (length(pattern) == 0) { pattern <- NA } @@ -151,12 +153,18 @@ collapseHz <- function(x, h <- data.table::data.table(horizons(x)) # calculate matches - l <- FUN(x, pattern = pattern[p], hzdesgn = hzdesgn, na.rm = na.rm, ...) - - if (any(l)) { + if (!is.null(by) && length(pattern) == 1 && is.na(pattern)) { + labels <- h[[by]] + r <- rle(paste0(h[[idn]], "-", as.character(labels))) + l <- rep(TRUE, nrow(h)) + } else { + l <- FUN(x, pattern = pattern[p], hzdesgn = hzdesgn, na.rm = na.rm, ...) 
r <- rle(l) - g <- unlist(sapply(seq(r$lengths), function(i) rep(i, r$lengths[i]))) - hidx <- unlist(sapply(seq(r$lengths), function(i) if (r$lengths[i] == 1) TRUE else rep(FALSE, r$lengths[i]))) & l + } + + if (any(r$lengths > 1)) { + g <- unlist(lapply(seq_along(r$lengths), function(i) rep(i, r$lengths[i]))) + hidx <- unlist(lapply(seq_along(r$lengths), function(i) if (r$lengths[i] == 1) TRUE else rep(FALSE, r$lengths[i]))) & l gidx <- g %in% unique(g[l]) & !hidx naf <- names(AGGFUN) @@ -179,7 +187,7 @@ collapseHz <- function(x, # v[which.max(bottom - top)[1]] # take dominant condition (based on sum of thickness) - cond <- aggregate(bottom - top, by = list(v), sum, na.rm = na.rm) + cond <- aggregate(bottom - top, by = list(as.character(v)), sum, na.rm = na.rm) cond[[1]][which.max(cond[[2]])[1]] } } else { @@ -222,7 +230,9 @@ collapseHz <- function(x, # combine matches res3 <- data.table::rbindlist(list(res, res2), fill = TRUE) - res3[[hzdesgn]] <- labels[p] + if (missing(by)){ + res3[[hzdesgn]] <- labels[p] + } # combine matches with horizons that did not match h <- h[-which(g %in% unique(g[l]) | hidx),] From 9c3b432b3a2fb15018fd7f6709b122b5be1c1ceb Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Fri, 11 Oct 2024 07:02:35 -0700 Subject: [PATCH 10/22] add test of `by` argument and custom AGGFUN with data.frame results --- R/collapseHz.R | 98 +++++++++++++++++--------------- tests/testthat/test-collapseHz.R | 38 +++++++++++-- 2 files changed, 85 insertions(+), 51 deletions(-) diff --git a/R/collapseHz.R b/R/collapseHz.R index 27ecafdeb..31bd6392a 100644 --- a/R/collapseHz.R +++ b/R/collapseHz.R @@ -169,58 +169,61 @@ collapseHz <- function(x, naf <- names(AGGFUN) # iterate over sets of layers needing aggregation within each matching group - res <- h[gidx, c(list(hzdeptnew = suppressWarnings(min(.SD[[hzd[1]]], na.rm = na.rm)), - hzdepbnew = suppressWarnings(max(.SD[[hzd[2]]], na.rm = na.rm))), - - # process numeric depth weighted averages w/ dominant condition 
otherwise - sapply(colnames(.SD)[!colnames(.SD) %in% c(hzd, naf)], - function(n, top, bottom) { - v <- .SD[[n]] - if (length(v) > 1) { - if (!n %in% ignore_numerics && is.numeric(x)) { - - # weighted average by thickness (numerics not in exclusion list) - weighted.mean(v, bottom - top, na.rm = na.rm) - + if (sum(gidx) > 0){ + res <- h[gidx, c(list(hzdeptnew = suppressWarnings(min(.SD[[hzd[1]]], na.rm = na.rm)), + hzdepbnew = suppressWarnings(max(.SD[[hzd[2]]], na.rm = na.rm))), + + # process numeric depth weighted averages w/ dominant condition otherwise + sapply(colnames(.SD)[!colnames(.SD) %in% c(hzd, naf)], + function(n, top, bottom) { + v <- .SD[[n]] + if (length(v) > 1) { + if (!n %in% ignore_numerics && is.numeric(x)) { + + # weighted average by thickness (numerics not in exclusion list) + weighted.mean(v, bottom - top, na.rm = na.rm) + + } else { + # take thickest value + # v[which.max(bottom - top)[1]] + + # take dominant condition (based on sum of thickness) + cond <- aggregate(bottom - top, by = list(as.character(v)), sum, na.rm = na.rm) + cond[[1]][which.max(cond[[2]])[1]] + } } else { - # take thickest value - # v[which.max(bottom - top)[1]] - - # take dominant condition (based on sum of thickness) - cond <- aggregate(bottom - top, by = list(as.character(v)), sum, na.rm = na.rm) - cond[[1]][which.max(cond[[2]])[1]] + v } - } else { - v - } - }, - top = .SD[[hzd[1]]], - bottom = .SD[[hzd[2]]]), - - # process custom aggregation functions (may return data.frames) - do.call('c', lapply(colnames(.SD)[colnames(.SD) %in% naf], - function(n, top, bottom) { - out <- AGGFUN[[n]](.SD[[n]], top, bottom) - if (!is.data.frame(out)) { - out <- data.frame(out) - colnames(out) <- n - } else { - colnames(out) <- paste0(n, ".", colnames(out)) - } - out - }, - top = .SD[[hzd[1]]], - bottom = .SD[[hzd[2]]]))), - by = g[gidx]] + }, + top = .SD[[hzd[1]]], + bottom = .SD[[hzd[2]]]), + + # process custom aggregation functions (may return data.frames) + do.call('c', 
lapply(colnames(.SD)[colnames(.SD) %in% naf], + function(n, top, bottom) { + out <- AGGFUN[[n]](.SD[[n]], top, bottom) + if (!is.data.frame(out)) { + out <- data.frame(out) + colnames(out) <- n + } else { + colnames(out) <- paste0(n, ".", colnames(out)) + } + out + }, + top = .SD[[hzd[1]]], + bottom = .SD[[hzd[2]]]))), + by = g[gidx]] + # remove grouping ID + res$g <- NULL + } else { + res <- h[0, ] + } # allow for replacing values as well as adding new values with data.frame AGGFUN test1.idx <- na.omit(match(colnames(res), paste0(naf, ".", naf))) test2.idx <- na.omit(match(paste0(naf, ".", naf), colnames(res))) colnames(res)[test2.idx] <- naf[test1.idx] - # remove grouping ID - res$g <- NULL - # determine matches that are only a single layer (no aggregation applied) res2 <- h[hidx & l, ] res2$hzdeptnew <- res2[[hzd[1]]] @@ -230,12 +233,15 @@ collapseHz <- function(x, # combine matches res3 <- data.table::rbindlist(list(res, res2), fill = TRUE) - if (missing(by)){ + if (missing(by) && nrow(res3) > 0){ res3[[hzdesgn]] <- labels[p] } # combine matches with horizons that did not match - h <- h[-which(g %in% unique(g[l]) | hidx),] + agg.idx <- which(g %in% unique(g[l]) | hidx) + if (length(agg.idx) > 0) { + h <- h[-agg.idx, ] + } h <- data.table::rbindlist(list(h, res3), fill = TRUE) # replace depths diff --git a/tests/testthat/test-collapseHz.R b/tests/testthat/test-collapseHz.R index 222b582e0..9ec843617 100644 --- a/tests/testthat/test-collapseHz.R +++ b/tests/testthat/test-collapseHz.R @@ -11,15 +11,43 @@ test_that("collapseHz works", { # collapse that SPC based on genhz i <- collapseHz(jacobs2000_gen, hzdesgn = "genhz") + expect_equal(length(jacobs2000), length(i)) expect_equal(nrow(i), 26) expect_equal(i[7, , .BOTTOM], c(15, 41, 61, 132, 140, 152)) - x <- collapseHz(jacobs2000, c(`A` = "^A", - `E` = "E", - `Bt` = "[ABC]+t", - `C` = "^C", - `foo` = "bar")) + i <- collapseHz(jacobs2000_gen, by = "genhz") + expect_equal(length(jacobs2000), length(i)) + 
expect_equal(nrow(i), 26) + expect_equal(i[7, , .BOTTOM], c(15, 41, 61, 132, 140, 152)) + + a_pattern <- c(`A` = "^A", + `E` = "E", + `Bt` = "[ABC]+t", + `C` = "^C", + `foo` = "bar") + x <- collapseHz(jacobs2000, a_pattern) expect_equal(length(jacobs2000), length(x)) expect_equal(nrow(x), 29) + + m <- collapseHz(jacobs2000, + pattern = a_pattern, + AGGFUN = list( + matrix_color_munsell = function(x, top, bottom) { + thk <- bottom - top + if (length(x) > 1) { + xord <- order(thk, decreasing = TRUE) + data.frame(matrix_color_munsell = paste0(x, collapse = ";"), + n_matrix_color = length(x)) + } else { + data.frame(matrix_color_munsell = x, + n_matrix_color = length(x)) + } + } + ) + ) + profile_id(m) <- paste0(profile_id(m), "_collapse_custom") + + expect_true(all(c("matrix_color_munsell", "matrix_color_munsell.n_matrix_color") %in% names(m))) + expect_equal(nrow(m), 29) }) From f01ffe2ff783b42985e6e8c49e8dd9bad78f46c5 Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Fri, 11 Oct 2024 07:20:26 -0700 Subject: [PATCH 11/22] doc --- R/collapseHz.R | 78 ++++++++++++++++++++++++++----------------- man/collapseHz.Rd | 84 ++++++++++++++++++++++++++++++----------------- 2 files changed, 100 insertions(+), 62 deletions(-) diff --git a/R/collapseHz.R b/R/collapseHz.R index 31bd6392a..ec470ef07 100644 --- a/R/collapseHz.R +++ b/R/collapseHz.R @@ -1,43 +1,52 @@ #' Collapse Horizons within Profiles Based on Pattern Matching #' -#' Combines and aggregates data for layers by grouping adjacent horizons that -#' match `pattern` in `hzdesgn`. Numeric properties are combined using the -#' weighted average, and other properties are derived from the thickest horizon -#' in each group. +#' Combines layers and aggregates data by grouping adjacent horizons which match `pattern` in +#' `hzdesgn` or, alternately, share a common value in `by` argument. 
Numeric properties are combined +#' using the weighted average, and other properties are derived from the dominant condition based on +#' thickness of layers and values in each group. #' #' @param x A _SoilProfileCollection_ -#' @param pattern _character_. A regular expression pattern to match in -#' `hzdesgn` column. Default: `NULL`. -#' @param by _character_. A column name specifying horizons that should be -#' combined. Aggregation will be applied to adjacent groups of layers within -#' profiles that have the same value in `by`. -#' @param hzdesgn _character_. Any character column containing horizon-level -#' identifiers. Default: `hzdesgnname(x, required = TRUE)()`. -#' @param FUN _function_. A function that returns a _logical_ vector equal in -#' length to the number of horizons in `x`. See details. +#' @param pattern _character_. A regular expression pattern to match in `hzdesgn` column. Default: +#' `NULL`. +#' @param by _character_. A column name specifying horizons that should be combined. Aggregation +#' will be applied to adjacent groups of layers within profiles that have the same value in `by`. +#' Used in lieu of `pattern` and `hzdesgn`. Default: `NULL`. +#' @param hzdesgn _character_. Any character column containing horizon-level identifiers. Default: +#' `hzdesgnname(x, required = TRUE)`. +#' @param FUN _function_. A function that returns a _logical_ vector equal in length to the number +#' of horizons in `x`. Used only when `pattern` is specified. See details. #' @param ... Additional arguments passed to the matching function `FUN`. -#' @param AGGFUN _list_. A named list containing custom aggregation functions. -#' List element names should match the column name that they transform. The -#' functions defined should take three arguments: `x` (a vector of horizon -#' property values), `top` (a vector of top depths), and `bottom` (a vector of -#' bottom depths). 
Default: `NULL` applies weighted.mean() to all numeric -#' columns not listed in `ignore_numerics` and takes the thickest value for all -#' other columns. -#' @param ignore_numerics _character_. Vector of column names that contain -#' numeric values which should _not_ be aggregated using `weighted.mean()`. -#' For example, soil color "value" and "chroma". -#' @param na.rm _logical_. If `TRUE` `NA` values are ignored when calculating -#' min/max boundaries for each group and in weighted averages. If `FALSE` `NA` -#' values are propagated to the result. Default: `FALSE` +#' @param AGGFUN _list_. A _named_ list containing custom aggregation functions. List element names +#' should match the column name that they transform. The functions defined should take three +#' arguments: `x` (a vector of horizon property values), `top` (a vector of top depths), and +#' `bottom` (a vector of bottom depths). Default: `NULL` applies `weighted.mean()` to all numeric +#' columns not listed in `ignore_numerics` and takes the dominant condition (value with greatest +#' aggregate thickness sum) for all other columns. See details. +#' @param ignore_numerics _character_. Vector of column names that contain numeric values which +#' should _not_ be aggregated using `weighted.mean()`. For example, soil color "value" and +#' "chroma". +#' @param na.rm _logical_. If `TRUE` `NA` values are ignored when calculating min/max boundaries for +#' each group and in weighted averages. If `FALSE` `NA` values are propagated to the result. +#' Default: `FALSE`. #' #' @details #' -#' If a custom function (`FUN`) is used, it should accept arbitrary additional -#' arguments via an ellipsis (`...`). It is not necessary to do anything with -#' arguments, but the result should match the number of horizons found in the -#' input SoilProfileCollection `x`. +#' If a custom matching function (`FUN`) is used, it should accept arbitrary additional arguments +#' via an ellipsis (`...`). 
It is not necessary to do anything with arguments, but the result should +#' match the number of horizons found in the input SoilProfileCollection `x`. #' +#' Custom aggregation functions defined in the `AGGFUN` argument should either return a single +#' vector value for each group*column combination, or should return a _data.frame_ object with named +#' columns. If the input column name is used as a column name in the result _data.frame_, then the +#' values of that column name in the result _SoilProfileCollection_ will be replaced by the output +#' of the aggregation function. See examples. +#' #' @return A _SoilProfileCollection_ +#' +#' @author Andrew G. Brown +#' +#' @seealso `hz_dissolve()` +#' #' @export #' #' @examples @@ -52,7 +61,14 @@ #' i <- collapseHz(jacobs2000_gen, by = "genhz") #' #' profile_id(i) <- paste0(profile_id(i), "_collapse") -#' plot(c(i, jacobs2000), color = "genhz", name = "name", name.style = "center-center", cex.names = 1) +#' +#' plot( +#' c(i, jacobs2000), +#' color = "genhz", +#' name = "name", +#' name.style = "center-center", +#' cex.names = 1 +#' ) #' #' # custom pattern argument #' j <- collapseHz(jacobs2000, diff --git a/man/collapseHz.Rd b/man/collapseHz.Rd index 32ea840dd..a13a863a7 100644 --- a/man/collapseHz.Rd +++ b/man/collapseHz.Rd @@ -7,6 +7,7 @@ collapseHz( x, pattern = NULL, + by = NULL, hzdesgn = hzdesgnname(x, required = TRUE), FUN = function(x, pattern, hzdesgn, ...) grepl(pattern, x[[hzdesgn]], ignore.case = FALSE), @@ -19,47 +20,55 @@ collapseHz( \arguments{ \item{x}{A \emph{SoilProfileCollection}} -\item{pattern}{\emph{character}. A regular expression pattern to match in -\code{hzdesgn} column. Default} +\item{pattern}{\emph{character}. A regular expression pattern to match in \code{hzdesgn} column. Default: +\code{NULL}.} -\item{hzdesgn}{\emph{character}. Any character column containing horizon-level -identifiers. Default is estimated using \code{guessHzDesgnName()}.} +\item{by}{\emph{character}. 
A column name specifying horizons that should be combined. Aggregation +will be applied to adjacent groups of layers within profiles that have the same value in \code{by}. +Used in lieu of \code{pattern} and \code{hzdesgn}. Default: \code{NULL}.} -\item{FUN}{\emph{function}. A function that returns a \emph{logical} vector equal in -length to the number of horizons in \code{x}. See details.} +\item{hzdesgn}{\emph{character}. Any character column containing horizon-level identifiers. Default: +\code{hzdesgnname(x, required = TRUE)}.} + +\item{FUN}{\emph{function}. A function that returns a \emph{logical} vector equal in length to the number +of horizons in \code{x}. Used only when \code{pattern} is specified. See details.} \item{...}{Additional arguments passed to the matching function \code{FUN}.} -\item{AGGFUN}{\emph{list}. A named list containing custom aggregation functions. -List element names should match the column name that they transform. The -functions defined should take three arguments: \code{x} (a vector of horizon -property values), \code{top} (a vector of top depths), and \code{bottom} (a vector of -bottom depths). Default: \code{NULL} applies weighted.mean() to all numeric -columns not listed in \code{ignore_numerics} and takes the thickest value for all -other columns.} - -\item{ignore_numerics}{\emph{character}. Vector of column names that contain -numeric values which should \emph{not} be aggregated using \code{weighted.mean()}. -For example, soil color "value" and "chroma".} - -\item{na.rm}{\emph{logical}. If \code{TRUE} \code{NA} values are ignored when calculating -min/max boundaries for each group and in weighted averages. If \code{FALSE} \code{NA} -values are propagated to the result. Default: \code{FALSE}} +\item{AGGFUN}{\emph{list}. A \emph{named} list containing custom aggregation functions. List element names +should match the column name that they transform. 
The functions defined should take three +arguments: \code{x} (a vector of horizon property values), \code{top} (a vector of top depths), and +\code{bottom} (a vector of bottom depths). Default: \code{NULL} applies \code{weighted.mean()} to all numeric +columns not listed in \code{ignore_numerics} and takes the dominant condition (value with greatest +aggregate thickness sum) for all other columns. See details.} + +\item{ignore_numerics}{\emph{character}. Vector of column names that contain numeric values which +should \emph{not} be aggregated using \code{weighted.mean()}. For example, soil color "value" and +"chroma".} + +\item{na.rm}{\emph{logical}. If \code{TRUE} \code{NA} values are ignored when calculating min/max boundaries for +each group and in weighted averages. If \code{FALSE} \code{NA} values are propagated to the result. +Default: \code{FALSE}.} } \value{ A \emph{SoilProfileCollection} } \description{ -Combines and aggregates data for layers by grouping adjacent horizons that -match \code{pattern} in \code{hzdesgn}. Numeric properties are combined using the -weighted average, and other properties are derived from the thickest horizon -in each group. +Combines layers and aggregates data by grouping adjacent horizons which match \code{pattern} in +\code{hzdesgn} or, alternately, share a common value in \code{by} argument. Numeric properties are combined +using the weighted average, and other properties are derived from the dominant condition based on +thickness of layers and values in each group. } \details{ -If a custom function (\code{FUN}) is used, it should accept arbitrary additional -arguments via an ellipsis (\code{...}). It is not necessary to do anything with -arguments, but the result should match the number of horizons found in the -input SoilProfileCollection \code{x}. +If a custom matching function (\code{FUN}) is used, it should accept arbitrary additional arguments +via an ellipsis (\code{...}). 
It is not necessary to do anything with arguments, but the result should +match the number of horizons found in the input SoilProfileCollection \code{x}. + +Custom aggregation functions defined in the \code{AGGFUN} argument should either return a single +vector value for each group*column combination, or should return a \emph{data.frame} object with named +columns. If the input column name is used as a column name in the result \emph{data.frame}, then the +values of that column name in the result \emph{SoilProfileCollection} will be replaced by the output +of the aggregation function. See examples. } \examples{ data(jacobs2000) @@ -70,10 +79,17 @@ patterns <- c("A", "E", "B.*t", "B.*h", "C") jacobs2000_gen <- generalizeHz(jacobs2000, new = new_labels, pattern = patterns) # use existing generalized horizon labels -i <- collapseHz(jacobs2000_gen, hzdesgn = "genhz") +i <- collapseHz(jacobs2000_gen, by = "genhz") profile_id(i) <- paste0(profile_id(i), "_collapse") -plot(c(i, jacobs2000), color = "genhz", name = "name", name.style = "center-center", cex.names = 1) + +plot( + c(i, jacobs2000), + color = "genhz", + name = "name", + name.style = "center-center", + cex.names = 1 +) # custom pattern argument j <- collapseHz(jacobs2000, @@ -138,3 +154,9 @@ profile_id(m) <- paste0(profile_id(m), "_collapse_custom") m$matrix_color_munsell.n_matrix_color } +\seealso{ +\code{hz_dissolve()} +} +\author{ +Andrew G. 
Brown +} From 009801059e1e3990c9417206ec31b54328fa529c Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Fri, 11 Oct 2024 09:15:12 -0700 Subject: [PATCH 12/22] add comment about when aggregation is used --- R/collapseHz.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/collapseHz.R b/R/collapseHz.R index ec470ef07..b08fb1ff1 100644 --- a/R/collapseHz.R +++ b/R/collapseHz.R @@ -178,6 +178,7 @@ collapseHz <- function(x, r <- rle(l) } + # only apply aggregation if there are adjacent horizons that match the target criteria if (any(r$lengths > 1)) { g <- unlist(lapply(seq_along(r$lengths), function(i) rep(i, r$lengths[i]))) hidx <- unlist(lapply(seq_along(r$lengths), function(i) if (r$lengths[i] == 1) TRUE else rep(FALSE, r$lengths[i]))) & l From 424ebbe86f4f6e4b112691f3481a5d50a6c61d3d Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Fri, 11 Oct 2024 10:19:54 -0700 Subject: [PATCH 13/22] move horizon extraction and replacement outside the loop --- R/collapseHz.R | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/R/collapseHz.R b/R/collapseHz.R index b08fb1ff1..9492bfc87 100644 --- a/R/collapseHz.R +++ b/R/collapseHz.R @@ -163,11 +163,11 @@ collapseHz <- function(x, labels <- pattern } + h <- data.table::data.table(horizons(x)) + # iterate over patterns for (p in seq(pattern)) { - h <- data.table::data.table(horizons(x)) - # calculate matches if (!is.null(by) && length(pattern) == 1 && is.na(pattern)) { labels <- h[[by]] @@ -271,9 +271,10 @@ collapseHz <- function(x, # sort horizons by id name and top depth h <- h[order(h[[idn]], h[[hzd[1]]]),] - # replace horizons in parent SPC - replaceHorizons(x) <- h } + + # replace horizons in parent SPC + replaceHorizons(x) <- h } x } From 489d2a815c7c49c56453f965da15a20c7358b8e1 Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Fri, 11 Oct 2024 13:11:32 -0700 Subject: [PATCH 14/22] fix issue with aggregated colums always returning character + test --- R/collapseHz.R | 77 
++++++++++++++++---------------- tests/testthat/test-collapseHz.R | 11 ++++- 2 files changed, 49 insertions(+), 39 deletions(-) diff --git a/R/collapseHz.R b/R/collapseHz.R index 9492bfc87..868712f90 100644 --- a/R/collapseHz.R +++ b/R/collapseHz.R @@ -192,43 +192,44 @@ collapseHz <- function(x, # process numeric depth weighted averages w/ dominant condition otherwise sapply(colnames(.SD)[!colnames(.SD) %in% c(hzd, naf)], - function(n, top, bottom) { - v <- .SD[[n]] - if (length(v) > 1) { - if (!n %in% ignore_numerics && is.numeric(x)) { - - # weighted average by thickness (numerics not in exclusion list) - weighted.mean(v, bottom - top, na.rm = na.rm) - - } else { - # take thickest value - # v[which.max(bottom - top)[1]] - - # take dominant condition (based on sum of thickness) - cond <- aggregate(bottom - top, by = list(as.character(v)), sum, na.rm = na.rm) - cond[[1]][which.max(cond[[2]])[1]] + function(n, top, bottom) { + v <- .SD[[n]] + if (length(v) > 1) { + if (!n %in% ignore_numerics && is.numeric(v)) { + + # weighted average by thickness (numerics not in exclusion list) + v <- weighted.mean(v, bottom - top, na.rm = na.rm) + + } else { + # take thickest value + # v[which.max(bottom - top)[1]] + + # take dominant condition (based on sum of thickness) + cond <- aggregate(bottom - top, by = list(as.character(v)), sum, na.rm = na.rm) + v <- cond[[1]][which.max(cond[[2]])[1]] + } } - } else { - v - } - }, - top = .SD[[hzd[1]]], - bottom = .SD[[hzd[2]]]), + out <- data.frame(v) + colnames(out) <- n + out + }, + top = .SD[[hzd[1]]], + bottom = .SD[[hzd[2]]]), - # process custom aggregation functions (may return data.frames) - do.call('c', lapply(colnames(.SD)[colnames(.SD) %in% naf], - function(n, top, bottom) { - out <- AGGFUN[[n]](.SD[[n]], top, bottom) - if (!is.data.frame(out)) { - out <- data.frame(out) - colnames(out) <- n - } else { - colnames(out) <- paste0(n, ".", colnames(out)) - } - out - }, - top = .SD[[hzd[1]]], - bottom = .SD[[hzd[2]]]))), + # 
process custom aggregation functions (may return data.frames) + do.call('c', lapply(colnames(.SD)[colnames(.SD) %in% naf], + function(n, top, bottom) { + out <- AGGFUN[[n]](.SD[[n]], top, bottom) + if (!is.data.frame(out)) { + out <- data.frame(out) + colnames(out) <- n + } else { + colnames(out) <- paste0(n, ".", colnames(out)) + } + out + }, + top = .SD[[hzd[1]]], + bottom = .SD[[hzd[2]]]))), by = g[gidx]] # remove grouping ID res$g <- NULL @@ -237,9 +238,9 @@ collapseHz <- function(x, } # allow for replacing values as well as adding new values with data.frame AGGFUN - test1.idx <- na.omit(match(colnames(res), paste0(naf, ".", naf))) - test2.idx <- na.omit(match(paste0(naf, ".", naf), colnames(res))) - colnames(res)[test2.idx] <- naf[test1.idx] + test1.idx <- na.omit(match(colnames(res), paste0(colnames(h), ".", colnames(h)))) + test2.idx <- na.omit(match(paste0(colnames(h), ".", colnames(h)), colnames(res))) + colnames(res)[test2.idx] <- colnames(h)[test1.idx] # determine matches that are only a single layer (no aggregation applied) res2 <- h[hidx & l, ] diff --git a/tests/testthat/test-collapseHz.R b/tests/testthat/test-collapseHz.R index 9ec843617..f0ada978d 100644 --- a/tests/testthat/test-collapseHz.R +++ b/tests/testthat/test-collapseHz.R @@ -11,15 +11,23 @@ test_that("collapseHz works", { # collapse that SPC based on genhz i <- collapseHz(jacobs2000_gen, hzdesgn = "genhz") - expect_equal(length(jacobs2000), length(i)) expect_equal(nrow(i), 26) expect_equal(i[7, , .BOTTOM], c(15, 41, 61, 132, 140, 152)) + # collapses adjacent horizons with same label i <- collapseHz(jacobs2000_gen, by = "genhz") + + # no effect, horizon designations are unique within profiles + j <- collapseHz(jacobs2000_gen, by = "name") + expect_equal(length(jacobs2000), length(i)) expect_equal(nrow(i), 26) + expect_equal(nrow(j), 46) expect_equal(i[7, , .BOTTOM], c(15, 41, 61, 132, 140, 152)) + expect_equal(j[7, , .BOTTOM], jacobs2000[7, , .BOTTOM]) + expect_true(is.numeric(i$clay)) + 
expect_true(is.numeric(j$clay)) a_pattern <- c(`A` = "^A", `E` = "E", @@ -29,6 +37,7 @@ test_that("collapseHz works", { x <- collapseHz(jacobs2000, a_pattern) expect_equal(length(jacobs2000), length(x)) expect_equal(nrow(x), 29) + expect_true(is.numeric(x$clay)) m <- collapseHz(jacobs2000, pattern = a_pattern, From 5bcac0734a6b941065f0c34e630b8e29c0c96bb4 Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Fri, 11 Oct 2024 13:54:19 -0700 Subject: [PATCH 15/22] add weighted average and dominant condition tests, with and without NA --- tests/testthat/test-collapseHz.R | 36 ++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-collapseHz.R b/tests/testthat/test-collapseHz.R index f0ada978d..5893ef955 100644 --- a/tests/testthat/test-collapseHz.R +++ b/tests/testthat/test-collapseHz.R @@ -9,6 +9,9 @@ test_that("collapseHz works", { # calculate a new SPC with genhz column based on patterns jacobs2000_gen <- generalizeHz(jacobs2000, new = new_labels, pattern = patterns) + # create a missing value + jacobs2000_gen$clay[19] <- NA + # collapse that SPC based on genhz i <- collapseHz(jacobs2000_gen, hzdesgn = "genhz") expect_equal(length(jacobs2000), length(i)) @@ -17,15 +20,44 @@ test_that("collapseHz works", { # collapses adjacent horizons with same label i <- collapseHz(jacobs2000_gen, by = "genhz") + ii <- collapseHz(jacobs2000_gen, by = "genhz", na.rm = TRUE) # no effect, horizon designations are unique within profiles j <- collapseHz(jacobs2000_gen, by = "name") + expect_equal(nrow(j), 46) + expect_equal(j[7, , .BOTTOM], jacobs2000[7, , .BOTTOM]) + + # if using `by` argument, all values must not be NA + expect_error(collapseHz(jacobs2000_gen, by = "matrix_color_munsell"), + "Missing values are not allowed") + + # matches input number of profiles expect_equal(length(jacobs2000), length(i)) + + # horizons have been collapsed expect_equal(nrow(i), 26) - expect_equal(nrow(j), 46) + + # weighted mean (no NA 
values) works as expected (clay=47.15) + expect_equal(i$clay[4], + weighted.mean(jacobs2000_gen$clay[6:7], (jacobs2000_gen$bottom - jacobs2000_gen$top)[6:7])) + + # weighted mean (contains NA values, na.rm=FALSE) (clay is NA) + expect_true(is.na(i$clay[11])) + + # weighted mean (contains NA values, na.rm=TRUE, clay=18.72414) + expect_equal(ii$clay[11], + weighted.mean(jacobs2000_gen$clay[17:20], (jacobs2000_gen$bottom - jacobs2000_gen$top)[17:20], na.rm = TRUE)) + + # dominant condition (NA values retained) + expect_true(is.na(i$depletion_munsell[13])) + + # dominant condition (NA values removed) + expect_equal(ii$depletion_munsell[13], "10YR 8/2") + + plot(jacobs2000_gen, color = "concentration_pct") + expect_equal(i[7, , .BOTTOM], c(15, 41, 61, 132, 140, 152)) - expect_equal(j[7, , .BOTTOM], jacobs2000[7, , .BOTTOM]) expect_true(is.numeric(i$clay)) expect_true(is.numeric(j$clay)) From 268497df13e5151ffc075beda411e7a6306d091c Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Fri, 11 Oct 2024 13:54:43 -0700 Subject: [PATCH 16/22] NA in `by` argument not allowed --- R/collapseHz.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/R/collapseHz.R b/R/collapseHz.R index 868712f90..a7c9f8137 100644 --- a/R/collapseHz.R +++ b/R/collapseHz.R @@ -171,6 +171,11 @@ collapseHz <- function(x, # calculate matches if (!is.null(by) && length(pattern) == 1 && is.na(pattern)) { labels <- h[[by]] + + if (any(is.na(labels))) { + stop("Missing values are not allowed in `by` column argument", call. 
= FALSE) + } + r <- rle(paste0(h[[idn]], "-", as.character(labels))) l <- rep(TRUE, nrow(h)) } else { From 1a848832eb4c6b8b6b70d06eccc786b59a5f1749 Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Fri, 11 Oct 2024 13:55:20 -0700 Subject: [PATCH 17/22] handle NA values (when `na.rm=FALSE`) in aggregation of thickness for dominant condition --- R/collapseHz.R | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/R/collapseHz.R b/R/collapseHz.R index a7c9f8137..5e24de0ac 100644 --- a/R/collapseHz.R +++ b/R/collapseHz.R @@ -209,9 +209,22 @@ collapseHz <- function(x, # take thickest value # v[which.max(bottom - top)[1]] + # convert factors etc to character + # results may not conform with existing factor levels + v <- as.character(v) + + # replace NA values for use in aggregate() + if (!na.rm) { + v[is.na(v)] <- "" + } + # take dominant condition (based on sum of thickness) - cond <- aggregate(bottom - top, by = list(as.character(v)), sum, na.rm = na.rm) + cond <- aggregate(bottom - top, by = list(v), sum, na.rm = na.rm) v <- cond[[1]][which.max(cond[[2]])[1]] + + if (!na.rm) { + v[v == ""] <- NA + } } } out <- data.frame(v) From 2902a8a8b922301e5868252ddf8792fdfb233e0d Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Fri, 11 Oct 2024 14:10:59 -0700 Subject: [PATCH 18/22] tests of empty SPC, filled SPC, missing `by` column --- R/collapseHz.R | 5 +++++ tests/testthat/test-collapseHz.R | 16 ++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/R/collapseHz.R b/R/collapseHz.R index 5e24de0ac..3e56dbb88 100644 --- a/R/collapseHz.R +++ b/R/collapseHz.R @@ -170,6 +170,11 @@ collapseHz <- function(x, # calculate matches if (!is.null(by) && length(pattern) == 1 && is.na(pattern)) { + + if (!by %in% horizonNames(x)) { + stop("Column name `by` (\"", by, "\") is not a horizon-level variable.", call.
= FALSE) + } + labels <- h[[by]] if (any(is.na(labels))) { diff --git a/tests/testthat/test-collapseHz.R b/tests/testthat/test-collapseHz.R index 5893ef955..d186a8c05 100644 --- a/tests/testthat/test-collapseHz.R +++ b/tests/testthat/test-collapseHz.R @@ -32,6 +32,9 @@ test_that("collapseHz works", { expect_error(collapseHz(jacobs2000_gen, by = "matrix_color_munsell"), "Missing values are not allowed") + # `by` column must also be a horizon-level variable + expect_error(collapseHz(jacobs2000, by = "genhz"), "not a horizon-level variable") + # matches input number of profiles expect_equal(length(jacobs2000), length(i)) @@ -61,6 +64,19 @@ test_that("collapseHz works", { expect_true(is.numeric(i$clay)) expect_true(is.numeric(j$clay)) + # "works" on empty SPC () + expect_equal(nrow(collapseHz(jacobs2000_gen[0,], by = "genhz")), 0) + + # works on SPC with filled profile (1 horizon with NA depths) + all_na <- subsetHz(jacobs2000_gen[1,], TRUE) + all_na$top <- NA + all_na$bottom <- NA + expect_warning(na_nonna <- c(all_na, jacobs2000_gen[2:5,])) + expect_silent(f <- collapseHz(all_na, by = "genhz")) + expect_silent(n <- collapseHz(na_nonna, by = "genhz")) + expect_equal(nrow(n), 14) + + a_pattern <- c(`A` = "^A", `E` = "E", `Bt` = "[ABC]+t", From 80dc40f9f70f7516dffdbf9876e266a061633d77 Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Fri, 11 Oct 2024 14:49:33 -0700 Subject: [PATCH 19/22] add depth screening function to guide user to `checkHzDepthLogic()` --- R/SoilProfileCollection-setters.R | 15 +++++++++++++++ R/collapseHz.R | 2 ++ tests/testthat/test-collapseHz.R | 9 +++++---- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/R/SoilProfileCollection-setters.R b/R/SoilProfileCollection-setters.R index f05bc454d..0322fc002 100644 --- a/R/SoilProfileCollection-setters.R +++ b/R/SoilProfileCollection-setters.R @@ -90,6 +90,18 @@ setReplaceMethod("depths", "data.frame", return(depth) } +.checkDepthOrder <- function(x, depthcols) { + if 
(any(x[[depthcols[2]]] < x[[depthcols[1]]], na.rm = TRUE)) { + warning("One or more horizon bottom depths are shallower than top depth. Check depth logic with aqp::checkHzDepthLogic()", call. = FALSE) + } +} + +.screenDepths <- function(x, depthcols = horizonDepths(x)) { + .checkNAdepths(x[[depthcols[1]]], "top") + .checkNAdepths(x[[depthcols[2]]], "bottom") + .checkDepthOrder(x, depthcols) +} + # create 0-length spc from id and horizon depth columns (`idn`, `hzd`) # - allows template horizon (`hz`) and site (`st`) data to be provided (for additional columns) .prototypeSPC <- function(idn, hzd, @@ -178,6 +190,9 @@ setReplaceMethod("depths", "data.frame", data[[depthcols[1]]] <- .checkNAdepths(data[[depthcols[1]]], "top") data[[depthcols[2]]] <- .checkNAdepths(data[[depthcols[2]]], "bottom") + # warn if bottom depth shallower than top (old style O horizons, data entry issues, etc.) + .checkDepthOrder(data, depthcols) + tdep <- data[[depthcols[1]]] # calculate ID-top depth order, re-order input data diff --git a/R/collapseHz.R b/R/collapseHz.R index 3e56dbb88..2f6cc52a8 100644 --- a/R/collapseHz.R +++ b/R/collapseHz.R @@ -144,6 +144,8 @@ collapseHz <- function(x, idn <- idname(x) hzd <- horizonDepths(x) + .screenDepths(x, hzd) + # use exact match of existing genhz labels as default in lieu of pattern if (is.null(pattern) & missing(by)) { by <- GHL(x, required = TRUE) diff --git a/tests/testthat/test-collapseHz.R b/tests/testthat/test-collapseHz.R index d186a8c05..58c946234 100644 --- a/tests/testthat/test-collapseHz.R +++ b/tests/testthat/test-collapseHz.R @@ -69,11 +69,12 @@ test_that("collapseHz works", { # works on SPC with filled profile (1 horizon with NA depths) all_na <- subsetHz(jacobs2000_gen[1,], TRUE) - all_na$top <- NA - all_na$bottom <- NA + all_na$top <- NA_real_ + all_na$bottom <- NA_real_ expect_warning(na_nonna <- c(all_na, jacobs2000_gen[2:5,])) - expect_silent(f <- collapseHz(all_na, by = "genhz")) - expect_silent(n <- collapseHz(na_nonna, by = 
"genhz")) + expect_warning(f <- collapseHz(all_na, by = "genhz"), "contain NA") + na_nonna$top[2] <- 19 + expect_warning(n <- collapseHz(na_nonna, by = "genhz"), "bottom depths are shallower than top") expect_equal(nrow(n), 14) From d9a567bcd28acff54de513b01716782aa0fd2e7b Mon Sep 17 00:00:00 2001 From: Beaudette Date: Fri, 11 Oct 2024 15:04:48 -0700 Subject: [PATCH 20/22] test context --- tests/testthat/test-collapseHz.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/testthat/test-collapseHz.R b/tests/testthat/test-collapseHz.R index 58c946234..2fbfb3e14 100644 --- a/tests/testthat/test-collapseHz.R +++ b/tests/testthat/test-collapseHz.R @@ -1,3 +1,5 @@ +context("collapseHz()") + test_that("collapseHz works", { data("jacobs2000", package = "aqp") .BOTTOM <- NULL From 73c848bfe95f9c8729ef126a162600c8958e9fcd Mon Sep 17 00:00:00 2001 From: Andrew Gene Brown Date: Fri, 11 Oct 2024 15:14:21 -0700 Subject: [PATCH 21/22] Update NEWS --- NEWS.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 1f8ab529d..a8ae9c372 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,7 +3,8 @@ * `munsell2rgb()` now safely selects the closest Munsell value and chroma to those available in the package LUT * new function `soilTextureColorPal()` for suggesting a color palette suitable for soil texture class * **Breaking Change**: `@sp` slot of the SoilProfileCollection object, and dependency on sp package, has been removed. 
- * Any SoilProfileCollection objects previously written to file (.rda, .rds) with aqp <2.1.x will need to be rebuilt using `rebuildSPC()` due to changes to S4 object structure + * Any SoilProfileCollection objects previously written to file (.rda, .rds) with aqp <2.1.x will need to be rebuilt using `rebuildSPC()` due to changes to S4 object structure + * new function `collapseHz()` combines and aggregates data for adjacent horizons matching a pattern or sharing a common ID # aqp 2.0.4 (2024-07-30) * CRAN release From 23b67f3a555204c5e06adc3ab2ce33e69ae90219 Mon Sep 17 00:00:00 2001 From: Beaudette Date: Fri, 11 Oct 2024 15:34:42 -0700 Subject: [PATCH 22/22] Create collapseHz-mixMunsell-examples.R --- misc/sandbox/collapseHz-mixMunsell-examples.R | 94 +++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 misc/sandbox/collapseHz-mixMunsell-examples.R diff --git a/misc/sandbox/collapseHz-mixMunsell-examples.R b/misc/sandbox/collapseHz-mixMunsell-examples.R new file mode 100644 index 000000000..c22448a3d --- /dev/null +++ b/misc/sandbox/collapseHz-mixMunsell-examples.R @@ -0,0 +1,94 @@ +library(aqp) + +# example data +data("jacobs2000") + +# local copy +g <- jacobs2000 + +# spike some horizon colors with green / blue / red hues +g$matrix_color_munsell[4] <- '5G 4/6' +g$matrix_color_munsell[29] <- '5B 4/6' +g$matrix_color_munsell[36] <- '5R 4/6' + +# horizon correlation patterns +# applied to horizon designation +a_pattern <- c(`A` = "^A", + `E` = "E", + `Bt` = "[B]+t", + `Bh` = "[B]+h", + `C` = "^C", + `foo` = "bar") + + +# safe wrapper around mixMunsell() +mixFun <- function(x, top, bottom) { + # weights + w <- bottom - top + + # index to non-NA values + .idx <- which(!
is.na(x)) + .n <- length(x[.idx]) + + # if all NA, return a character NA (same type as the Munsell strings returned by the other branches) + if(.n < 1) { + return(NA_character_) + + # if only a single color, return that + } else if (.n == 1){ + print('just 1!') + return(x[.idx]) + + } else { + # mix colors, retain only munsell notation + .res <- mixMunsell(x[.idx], w[.idx], mixingMethod = 'exact')$munsell + return(.res) + } +} + +# collapse according to patterns +m <- collapseHz(g, + pattern = a_pattern, + AGGFUN = list( + matrix_color_munsell = mixFun + ) +) + +# new profile IDs so we can safely combine with source data +profile_id(m) <- sprintf("%s-c", profile_id(m)) + +# combine +z <- c(g, m) + +# convert Munsell colors -> sRGB in hex notation +z$soilcolor <- parseMunsell(z$matrix_color_munsell) + +# plot combined collection +par(mar = c(0, 0, 0, 3)) +plotSPC(z, color = 'soilcolor', name = 'name', name.style = 'center-center', width = 0.35, cex.names = 0.75) + +## start fresh + +# combine all horizons by profile + +g <- jacobs2000 +horizons(g)$.all <- 'soil' +collapseHz(g, by = '.all') + + +m <- collapseHz(g, + by = '.all', + AGGFUN = list( + matrix_color_munsell = mixFun + ) +) + +profile_id(m) <- sprintf("%s-c", profile_id(m)) +z <- c(g, m) +z$soilcolor <- parseMunsell(z$matrix_color_munsell) + +# neat +par(mar = c(0, 0, 0, 3)) +plotSPC(z, color = 'soilcolor', name = 'name', name.style = 'center-center', width = 0.35, cex.names = 0.75) + +