From ee45cf20c338dd39971edade33758dc48cacfb8f Mon Sep 17 00:00:00 2001
From: Andrew Gene Brown <brown.andrewg@gmail.com>
Date: Sat, 24 Feb 2024 09:37:32 -0800
Subject: [PATCH 01/22] Add `collapseHz()`

---
 NAMESPACE         |  1 +
 R/collapseHz.R    | 63 +++++++++++++++++++++++++++++++++++++++++++++++
 man/collapseHz.Rd | 37 ++++++++++++++++++++++++++++
 3 files changed, 101 insertions(+)
 create mode 100644 R/collapseHz.R
 create mode 100644 man/collapseHz.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 8e148c846..e9f8563ef 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -24,6 +24,7 @@ export(buntley.westin.index)
 export(checkHzDepthLogic)
 export(checkSPC)
 export(col2Munsell)
+export(collapseHz)
 export(colorChart)
 export(colorContrast)
 export(colorContrastPlot)
diff --git a/R/collapseHz.R b/R/collapseHz.R
new file mode 100644
index 000000000..3c8d184f8
--- /dev/null
+++ b/R/collapseHz.R
@@ -0,0 +1,63 @@
+#' Collapse Horizons within Profiles Based on Pattern Matching
+#'
+#' Combines and aggregates layers by grouping adjacent horizons that match `pattern` in `hzdesgn`. Numeric properties are combined using the weighted average, and other properties are derived from the thickest horizon in each group.
+#'
+#' @param x A _SoilProfileCollection_
+#' @param pattern _character_. A regular expression pattern to match in `hzdesgn` column
+#' @param hzdesgn _character_. Any character column containing horizon-level identifiers. Default is estimated using `guessHzDesgnName()`.
+#' @param ignore.case _logical_. If `FALSE`, the pattern matching is case sensitive and if `TRUE`, case is ignored during matching. Default: `FALSE`
+#' @param na.rm _logical_. If `TRUE` `NA` values are ignored when calculating min/max boundaries for each group and in weighted averages. If `FALSE` `NA` values are propagated to the result. Default: `FALSE`
+#' 
+#' @return A _SoilProfileCollection_
+#' @export
+#'
+#' @examples
+#' data(jacobs2000)
+#' 
+#' a <- collapseHz(jacobs2000, c(`A` = "^A", `E` = "E", `Bt` = "[ABC]+t", `C` = "^C", `foo` = "bar"))
+#' b <- jacobs2000
+#' profile_id(a) <- paste0(profile_id(a), "_collapse")
+#' 
+#' plot(c(a, b), color = "clay")
+collapseHz <- function(x, pattern, hzdesgn = guessHzDesgnName(x, required = TRUE), ignore.case = FALSE, na.rm = FALSE) {
+  idn <- idname(x)
+  hzd <- horizonDepths(x)
+  if (!is.null(names(pattern))) {
+    labels <- names(pattern)
+    pattern <- as.character(pattern)
+  } else {
+    pattern <- as.character(pattern)
+    labels <- pattern
+  }
+  for (p in seq(pattern)) {
+    h <- data.table::data.table(horizons(x))
+    l <- grepl(pattern[p], h[[hzdesgn]], ignore.case = ignore.case)
+    if (any(l)) {
+      r <- rle(l)
+      g <- unlist(sapply(seq(r$lengths), function(i) rep(i, r$lengths[i])))
+      res <- h[g %in% unique(g[l]), c(list(hzdeptnew = min(.SD[[hzd[1]]], na.rm = na.rm), 
+                                           hzdepbnew = max(.SD[[hzd[2]]], na.rm = na.rm)),
+                                           lapply(.SD, \(x, top, bottom) {
+                                             if (is.numeric(x)) {
+                                                weighted.mean(x, bottom - top, na.rm = na.rm)
+                                             } else {
+                                                x[which.max(bottom - top)[1]]
+                                             }
+                                           }, .SD[[hzd[1]]], .SD[[hzd[2]]])), 
+               by = g[g %in% unique(g[l])]]
+      res$g <- NULL
+      res[[hzdesgn]] <- labels[p]
+      h <- h[-which(g %in% unique(g[l])),]
+      h <- data.table::rbindlist(list(h, res), fill = TRUE)
+      h <- h[order(h[[idn]], h[[hzd[1]]]),]
+      hn <- !is.na(h$hzdeptnew) & !is.na(h$hzdepbnew)
+      h[[hzd[1]]][hn] <- h$hzdeptnew[hn]
+      h[[hzd[2]]][hn] <- h$hzdepbnew[hn]
+      h$hzdeptnew <- NULL
+      h$hzdepbnew <- NULL
+      replaceHorizons(x) <- h
+    }
+  }
+  x
+}
+
diff --git a/man/collapseHz.Rd b/man/collapseHz.Rd
new file mode 100644
index 000000000..1f461a1ca
--- /dev/null
+++ b/man/collapseHz.Rd
@@ -0,0 +1,37 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/collapseHz.R
+\name{collapseHz}
+\alias{collapseHz}
+\title{Collapse Horizons within Profiles Based on Pattern Matching}
+\usage{
+collapseHz(
+  x,
+  pattern,
+  hzdesgn = guessHzDesgnName(x, required = TRUE),
+  ignore.case = FALSE,
+  na.rm = FALSE
+)
+}
+\arguments{
+\item{x}{A SoilProfileCollection}
+
+\item{pattern}{character. A regular expression pattern to match in \code{hzdesgn} column}
+
+\item{hzdesgn}{character. Any character column containing horizon-level identifiers. Default is estimated using \code{guessHzDesgnName()}.}
+
+\item{ignore.case}{logical. If \code{FALSE}, the pattern matching is case sensitive and if \code{TRUE}, case is ignored during matching. Default: \code{FALSE}}
+
+\item{na.rm}{logical. If \code{TRUE} \code{NA} values are ignored when calculating min/max boundaries for each group and in weighted averages. If \code{FALSE} \code{NA} values are propagated to the result. Default: \code{FALSE}}
+}
+\description{
+Combines and aggregates layers by grouping adjacent horizons that match \code{pattern} in \code{hzdesgn}. Numeric properties are combined using the weighted average, and other properties are derived from the thickest horizon in each group.
+}
+\examples{
+data(jacobs2000)
+
+a <- collapseHz(jacobs2000, c(`A` = "^A", `E` = "E", `Bt` = "[ABC]+t", `C` = "^C", `foo` = "bar"))
+b <- jacobs2000
+profile_id(a) <- paste0(profile_id(a), "_collapse")
+
+plot(c(a, b), color = "clay")
+}

From a5c22e32b0caeaa0c8d0d0f2a498cfa51ce4f527 Mon Sep 17 00:00:00 2001
From: Andrew Gene Brown <brown.andrewg@gmail.com>
Date: Sat, 24 Feb 2024 09:41:45 -0800
Subject: [PATCH 02/22] Add test

---
 tests/testthat/test-collapseHz.R | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 tests/testthat/test-collapseHz.R

diff --git a/tests/testthat/test-collapseHz.R b/tests/testthat/test-collapseHz.R
new file mode 100644
index 000000000..33d598364
--- /dev/null
+++ b/tests/testthat/test-collapseHz.R
@@ -0,0 +1,10 @@
+test_that("collapseHz works", {
+  data("jacobs2000", package = "aqp")
+  x <- collapseHz(jacobs2000, c(`A` = "^A",
+                                `E` = "E", 
+                                `Bt` = "[ABC]+t", 
+                                `C` = "^C", 
+                                `foo` = "bar"))
+  expect_equal(length(jacobs2000), length(x))
+  expect_equal(nrow(x), 29)
+})

From 71de71c52271613745acb49e54477ac872c1f535 Mon Sep 17 00:00:00 2001
From: Andrew Gene Brown <brown.andrewg@gmail.com>
Date: Sat, 24 Feb 2024 10:00:28 -0800
Subject: [PATCH 03/22] Use `function()` instead of `\()` lambda shorthand

---
 R/collapseHz.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/collapseHz.R b/R/collapseHz.R
index 3c8d184f8..1d4c45eeb 100644
--- a/R/collapseHz.R
+++ b/R/collapseHz.R
@@ -37,7 +37,7 @@ collapseHz <- function(x, pattern, hzdesgn = guessHzDesgnName(x, required = TRUE
       g <- unlist(sapply(seq(r$lengths), function(i) rep(i, r$lengths[i])))
       res <- h[g %in% unique(g[l]), c(list(hzdeptnew = min(.SD[[hzd[1]]], na.rm = na.rm), 
                                            hzdepbnew = max(.SD[[hzd[2]]], na.rm = na.rm)),
-                                           lapply(.SD, \(x, top, bottom) {
+                                           lapply(.SD, function(x, top, bottom) {
                                              if (is.numeric(x)) {
                                                 weighted.mean(x, bottom - top, na.rm = na.rm)
                                              } else {

From 7b309759838a2529589655a3d17fc06a96940429 Mon Sep 17 00:00:00 2001
From: Andrew Gene Brown <andrew.g.brown@usda.gov>
Date: Thu, 10 Oct 2024 16:06:07 -0700
Subject: [PATCH 04/22] implement custom matching function argument `FUN`

---
 R/collapseHz.R    | 41 +++++++++++++++++++++++++++++++++--------
 man/collapseHz.Rd | 43 ++++++++++++++++++++++++++++++++++---------
 2 files changed, 67 insertions(+), 17 deletions(-)

diff --git a/R/collapseHz.R b/R/collapseHz.R
index 1d4c45eeb..43c8f58a4 100644
--- a/R/collapseHz.R
+++ b/R/collapseHz.R
@@ -1,12 +1,28 @@
 #' Collapse Horizons within Profiles Based on Pattern Matching
 #'
-#' Combines and aggregates layers by grouping adjacent horizons that match `pattern` in `hzdesgn`. Numeric properties are combined using the weighted average, and other properties are derived from the thickest horizon in each group.
+#' Combines and aggregates data for layers by grouping adjacent horizons that
+#' match `pattern` in `hzdesgn`. Numeric properties are combined using the 
+#' weighted average, and other properties are derived from the thickest horizon
+#' in each group.
 #'
 #' @param x A _SoilProfileCollection_
-#' @param pattern _character_. A regular expression pattern to match in `hzdesgn` column
-#' @param hzdesgn _character_. Any character column containing horizon-level identifiers. Default is estimated using `guessHzDesgnName()`.
-#' @param ignore.case _logical_. If `FALSE`, the pattern matching is case sensitive and if `TRUE`, case is ignored during matching. Default: `FALSE`
-#' @param na.rm _logical_. If `TRUE` `NA` values are ignored when calculating min/max boundaries for each group and in weighted averages. If `FALSE` `NA` values are propagated to the result. Default: `FALSE`
+#' @param pattern _character_. A regular expression pattern to match in `hzdesgn`
+#'  column
+#' @param hzdesgn _character_. Any character column containing horizon-level
+#'  identifiers. Default is estimated using `guessHzDesgnName()`.
+#' @param FUN _function_. A function that returns a _logical_ vector equal in
+#'  length to the number of horizons in `x`. See details.
+#' @param ... Additional arguments passed to the matching function `FUN`.
+#' @param na.rm _logical_. If `TRUE` `NA` values are ignored when calculating
+#'  min/max boundaries for each group and in weighted averages. If `FALSE` `NA`
+#'   values are propagated to the result. Default: `FALSE`
+#' 
+#' @details
+#' 
+#' If a custom function (`FUN`) is used, it should accept arbitrary additional
+#' arguments via an ellipsis (`...`). It is not necessary to do anything with
+#' arguments, but the result should match the number of horizons found in the
+#' input SoilProfileCollection `x`.
 #' 
 #' @return A _SoilProfileCollection_
 #' @export
@@ -14,12 +30,21 @@
 #' @examples
 #' data(jacobs2000)
 #' 
-#' a <- collapseHz(jacobs2000, c(`A` = "^A", `E` = "E", `Bt` = "[ABC]+t", `C` = "^C", `foo` = "bar"))
+#' a <- collapseHz(jacobs2000, c(`A` = "^A", 
+#'                               `E` = "E", 
+#'                               `Bt` = "[ABC]+t", 
+#'                               `C` = "^C", 
+#'                               `foo` = "bar"))
 #' b <- jacobs2000
 #' profile_id(a) <- paste0(profile_id(a), "_collapse")
 #' 
 #' plot(c(a, b), color = "clay")
-collapseHz <- function(x, pattern, hzdesgn = guessHzDesgnName(x, required = TRUE), ignore.case = FALSE, na.rm = FALSE) {
+collapseHz <- function(x,
+                       pattern,
+                       hzdesgn = hzdesgnname(x, required = TRUE),
+                       FUN = function(x, pattern, hzdesgn, ...) grepl(pattern, x[[hzdesgn]], ignore.case = FALSE),
+                       ...,
+                       na.rm = FALSE) {
   idn <- idname(x)
   hzd <- horizonDepths(x)
   if (!is.null(names(pattern))) {
@@ -31,7 +56,7 @@ collapseHz <- function(x, pattern, hzdesgn = guessHzDesgnName(x, required = TRUE
   }
   for (p in seq(pattern)) {
     h <- data.table::data.table(horizons(x))
-    l <- grepl(pattern[p], h[[hzdesgn]], ignore.case = ignore.case)
+    l <- FUN(x, pattern = pattern[p], hzdesgn = hzdesgn, na.rm = na.rm, ...)
     if (any(l)) {
       r <- rle(l)
       g <- unlist(sapply(seq(r$lengths), function(i) rep(i, r$lengths[i])))
diff --git a/man/collapseHz.Rd b/man/collapseHz.Rd
index 1f461a1ca..e03ed1c8a 100644
--- a/man/collapseHz.Rd
+++ b/man/collapseHz.Rd
@@ -7,29 +7,54 @@
 collapseHz(
   x,
   pattern,
-  hzdesgn = guessHzDesgnName(x, required = TRUE),
-  ignore.case = FALSE,
+  hzdesgn = hzdesgnname(x, required = TRUE),
+  FUN = function(x, pattern, hzdesgn, ...) grepl(pattern, x[[hzdesgn]], ignore.case =
+    FALSE),
+  ...,
   na.rm = FALSE
 )
 }
 \arguments{
-\item{x}{A SoilProfileCollection}
+\item{x}{A \emph{SoilProfileCollection}}
 
-\item{pattern}{character. A regular expression pattern to match in \code{hzdesgn} column}
+\item{pattern}{\emph{character}. A regular expression pattern to match in \code{hzdesgn}
+column}
 
-\item{hzdesgn}{character. Any character column containing horizon-level identifiers. Default is estimated using \code{guessHzDesgnName()}.}
+\item{hzdesgn}{\emph{character}. Any character column containing horizon-level
+identifiers. Default is estimated using \code{guessHzDesgnName()}.}
 
-\item{ignore.case}{logical. If \code{FALSE}, the pattern matching is case sensitive and if \code{TRUE}, case is ignored during matching. Default: \code{FALSE}}
+\item{FUN}{\emph{function}. A function that returns a \emph{logical} vector equal in
+length to the number of horizons in \code{x}. See details.}
 
-\item{na.rm}{logical. If \code{TRUE} \code{NA} values are ignored when calculating min/max boundaries for each group and in weighted averages. If \code{FALSE} \code{NA} values are propagated to the result. Default: \code{FALSE}}
+\item{...}{Additional arguments passed to the matching function \code{FUN}.}
+
+\item{na.rm}{\emph{logical}. If \code{TRUE} \code{NA} values are ignored when calculating
+min/max boundaries for each group and in weighted averages. If \code{FALSE} \code{NA}
+values are propagated to the result. Default: \code{FALSE}}
+}
+\value{
+A \emph{SoilProfileCollection}
 }
 \description{
-Combines and aggregates layers by grouping adjacent horizons that match \code{pattern} in \code{hzdesgn}. Numeric properties are combined using the weighted average, and other properties are derived from the thickest horizon in each group.
+Combines and aggregates data for layers by grouping adjacent horizons that
+match \code{pattern} in \code{hzdesgn}. Numeric properties are combined using the
+weighted average, and other properties are derived from the thickest horizon
+in each group.
+}
+\details{
+If a custom function (\code{FUN}) is used, it should accept arbitrary additional
+arguments via an ellipsis (\code{...}). It is not necessary to do anything with
+arguments, but the result should match the number of horizons found in the
+input SoilProfileCollection \code{x}.
 }
 \examples{
 data(jacobs2000)
 
-a <- collapseHz(jacobs2000, c(`A` = "^A", `E` = "E", `Bt` = "[ABC]+t", `C` = "^C", `foo` = "bar"))
+a <- collapseHz(jacobs2000, c(`A` = "^A", 
+                              `E` = "E", 
+                              `Bt` = "[ABC]+t", 
+                              `C` = "^C", 
+                              `foo` = "bar"))
 b <- jacobs2000
 profile_id(a) <- paste0(profile_id(a), "_collapse")
 

From 494f917822f4d5ef8cbc596478d9b3dac0b13507 Mon Sep 17 00:00:00 2001
From: Andrew Gene Brown <andrew.g.brown@usda.gov>
Date: Thu, 10 Oct 2024 16:55:34 -0700
Subject: [PATCH 05/22] update aggregation methods - implement ignoring
 specific numeric columns with `ignore_numerics` - add argument `AGGFUN` for
 column name specific aggregations - default for categories is to
 return dominant condition rather than just thickest layer

---
 R/collapseHz.R    | 70 ++++++++++++++++++++++++++++++++++++++++-------
 man/collapseHz.Rd | 34 +++++++++++++++++++++++
 2 files changed, 94 insertions(+), 10 deletions(-)

diff --git a/R/collapseHz.R b/R/collapseHz.R
index 43c8f58a4..538121938 100644
--- a/R/collapseHz.R
+++ b/R/collapseHz.R
@@ -13,6 +13,16 @@
 #' @param FUN _function_. A function that returns a _logical_ vector equal in
 #'  length to the number of horizons in `x`. See details.
 #' @param ... Additional arguments passed to the matching function `FUN`.
+#' @param AGGFUN _list_. A named list containing custom aggregation functions. 
+#'  List element names should match the column name that they transform. The 
+#'  functions defined should take three arguments: `x` (a vector of horizon 
+#'  property values), `top` (a vector of top depths), and `bottom` (a vector of 
+#'  bottom depths). Default: `NULL` applies weighted.mean() to all numeric
+#'  columns not listed in `ignore_numerics` and takes the thickest value for all
+#'  other columns.
+#' @param ignore_numerics _character_. Vector of column names that contain numeric 
+#'  values which should _not_ be aggregated using `weighted.mean()`. For example,
+#'  soil color "value" and "chroma".
 #' @param na.rm _logical_. If `TRUE` `NA` values are ignored when calculating
 #'  min/max boundaries for each group and in weighted averages. If `FALSE` `NA`
 #'   values are propagated to the result. Default: `FALSE`
@@ -39,11 +49,33 @@
 #' profile_id(a) <- paste0(profile_id(a), "_collapse")
 #' 
 #' plot(c(a, b), color = "clay")
+#' 
+#' # custom aggregation function for matrix_color_munsell
+#' 
+#' a2 <- collapseHz(jacobs2000, c(`A` = "^A", 
+#'                               `E` = "E", 
+#'                               `Bt` = "[ABC]+t", 
+#'                               `C` = "^C", 
+#'                               `foo` = "bar"), 
+#'                AGGFUN = list(matrix_color_munsell = function(x, top, bottom) {
+#'                               thk <- bottom - top
+#'                               if (length(x) > 1) {
+#'                                xord <- order(thk, decreasing = TRUE)
+#'                                paste0(paste0(x[xord], " (t=", thk[xord], ")"), collapse = ", ")
+#'                               } else x
+#'                              })
+#'                )
+#' profile_id(a2) <- paste0(profile_id(a), "_collapse_custom")
+#' 
+#' unique(a2$matrix_color_munsell)
+#' 
 collapseHz <- function(x,
                        pattern,
                        hzdesgn = hzdesgnname(x, required = TRUE),
                        FUN = function(x, pattern, hzdesgn, ...) grepl(pattern, x[[hzdesgn]], ignore.case = FALSE),
                        ...,
+                       AGGFUN = NULL,
+                       ignore_numerics = NULL,
                        na.rm = FALSE) {
   idn <- idname(x)
   hzd <- horizonDepths(x)
@@ -60,16 +92,34 @@ collapseHz <- function(x,
     if (any(l)) {
       r <- rle(l)
       g <- unlist(sapply(seq(r$lengths), function(i) rep(i, r$lengths[i])))
-      res <- h[g %in% unique(g[l]), c(list(hzdeptnew = min(.SD[[hzd[1]]], na.rm = na.rm), 
-                                           hzdepbnew = max(.SD[[hzd[2]]], na.rm = na.rm)),
-                                           lapply(.SD, function(x, top, bottom) {
-                                             if (is.numeric(x)) {
-                                                weighted.mean(x, bottom - top, na.rm = na.rm)
-                                             } else {
-                                                x[which.max(bottom - top)[1]]
-                                             }
-                                           }, .SD[[hzd[1]]], .SD[[hzd[2]]])), 
-               by = g[g %in% unique(g[l])]]
+      gidx <- g %in% unique(g[l])
+      res <- h[gidx, c(list(hzdeptnew = min(.SD[[hzd[1]]], na.rm = na.rm), 
+                            hzdepbnew = max(.SD[[hzd[2]]], na.rm = na.rm)),
+                            sapply(colnames(.SD)[!colnames(.SD) %in% hzd], 
+                                   function(n, top, bottom) {
+                                    v <- .SD[[n]]
+                                    if (n %in% names(AGGFUN)) {
+                                      
+                                      # custom aggregation function (column name specific)
+                                      AGGFUN[[n]](v, top, bottom)
+                                      
+                                    } else if (!n %in% ignore_numerics && is.numeric(x)) {
+                                      
+                                      # weighted average by thickness (numerics not in exclusion list)
+                                      weighted.mean(v, bottom - top, na.rm = na.rm)
+                                      
+                                    } else {
+                                      # take thickest value
+                                      # v[which.max(bottom - top)[1]]
+                                      
+                                      # take dominant condition (based on sum of thickness)
+                                      cond <- aggregate(bottom - top, by = list(v), sum, na.rm = na.rm)
+                                      cond[[1]][which.max(cond[[2]])[1]]
+                                    }
+                                  }, 
+                                  top = .SD[[hzd[1]]], 
+                                  bottom = .SD[[hzd[2]]])), 
+               by = g[gidx]]
       res$g <- NULL
       res[[hzdesgn]] <- labels[p]
       h <- h[-which(g %in% unique(g[l])),]
diff --git a/man/collapseHz.Rd b/man/collapseHz.Rd
index e03ed1c8a..839a3d77a 100644
--- a/man/collapseHz.Rd
+++ b/man/collapseHz.Rd
@@ -11,6 +11,8 @@ collapseHz(
   FUN = function(x, pattern, hzdesgn, ...) grepl(pattern, x[[hzdesgn]], ignore.case =
     FALSE),
   ...,
+  AGGFUN = NULL,
+  ignore_numerics = NULL,
   na.rm = FALSE
 )
 }
@@ -28,6 +30,18 @@ length to the number of horizons in \code{x}. See details.}
 
 \item{...}{Additional arguments passed to the matching function \code{FUN}.}
 
+\item{AGGFUN}{\emph{list}. A named list containing custom aggregation functions.
+List element names should match the column name that they transform. The
+functions defined should take three arguments: \code{x} (a vector of horizon
+property values), \code{top} (a vector of top depths), and \code{bottom} (a vector of
+bottom depths). Default: \code{NULL} applies weighted.mean() to all numeric
+columns not listed in \code{ignore_numerics} and takes the thickest value for all
+other columns.}
+
+\item{ignore_numerics}{\emph{character}. Vector of column names that contain numeric
+values which should \emph{not} be aggregated using \code{weighted.mean()}. For example,
+soil color "value" and "chroma".}
+
 \item{na.rm}{\emph{logical}. If \code{TRUE} \code{NA} values are ignored when calculating
 min/max boundaries for each group and in weighted averages. If \code{FALSE} \code{NA}
 values are propagated to the result. Default: \code{FALSE}}
@@ -59,4 +73,24 @@ b <- jacobs2000
 profile_id(a) <- paste0(profile_id(a), "_collapse")
 
 plot(c(a, b), color = "clay")
+
+# custom aggregation function for matrix_color_munsell
+
+a2 <- collapseHz(jacobs2000, c(`A` = "^A", 
+                              `E` = "E", 
+                              `Bt` = "[ABC]+t", 
+                              `C` = "^C", 
+                              `foo` = "bar"), 
+               AGGFUN = list(matrix_color_munsell = function(x, top, bottom) {
+                              thk <- bottom - top
+                              if (length(x) > 1) {
+                               xord <- order(thk, decreasing = TRUE)
+                               paste0(paste0(x[xord], " (t=", thk[xord], ")"), collapse = ", ")
+                              } else x
+                             })
+               )
+profile_id(a2) <- paste0(profile_id(a), "_collapse_custom")
+
+unique(a2$matrix_color_munsell)
+
 }

From 5339a61e024b057b55f4eca9a1da82c13808460f Mon Sep 17 00:00:00 2001
From: Andrew Gene Brown <andrew.g.brown@usda.gov>
Date: Thu, 10 Oct 2024 19:16:55 -0700
Subject: [PATCH 06/22] update for using existing `GHL()`, refine logic for
 multiple matches per profile, add example

---
 R/collapseHz.R    | 170 +++++++++++++++++++++++++++++-----------------
 man/collapseHz.Rd |  80 ++++++++++++++--------
 2 files changed, 158 insertions(+), 92 deletions(-)

diff --git a/R/collapseHz.R b/R/collapseHz.R
index 538121938..a41c0d4a7 100644
--- a/R/collapseHz.R
+++ b/R/collapseHz.R
@@ -1,76 +1,97 @@
 #' Collapse Horizons within Profiles Based on Pattern Matching
 #'
 #' Combines and aggregates data for layers by grouping adjacent horizons that
-#' match `pattern` in `hzdesgn`. Numeric properties are combined using the 
+#' match `pattern` in `hzdesgn`. Numeric properties are combined using the
 #' weighted average, and other properties are derived from the thickest horizon
 #' in each group.
 #'
 #' @param x A _SoilProfileCollection_
-#' @param pattern _character_. A regular expression pattern to match in `hzdesgn`
-#'  column
+#' @param pattern _character_. A regular expression pattern to match in
+#' `hzdesgn` column. Default
 #' @param hzdesgn _character_. Any character column containing horizon-level
 #'  identifiers. Default is estimated using `guessHzDesgnName()`.
 #' @param FUN _function_. A function that returns a _logical_ vector equal in
 #'  length to the number of horizons in `x`. See details.
 #' @param ... Additional arguments passed to the matching function `FUN`.
-#' @param AGGFUN _list_. A named list containing custom aggregation functions. 
-#'  List element names should match the column name that they transform. The 
-#'  functions defined should take three arguments: `x` (a vector of horizon 
-#'  property values), `top` (a vector of top depths), and `bottom` (a vector of 
+#' @param AGGFUN _list_. A named list containing custom aggregation functions.
+#'  List element names should match the column name that they transform. The
+#'  functions defined should take three arguments: `x` (a vector of horizon
+#'  property values), `top` (a vector of top depths), and `bottom` (a vector of
 #'  bottom depths). Default: `NULL` applies weighted.mean() to all numeric
 #'  columns not listed in `ignore_numerics` and takes the thickest value for all
 #'  other columns.
-#' @param ignore_numerics _character_. Vector of column names that contain numeric 
-#'  values which should _not_ be aggregated using `weighted.mean()`. For example,
-#'  soil color "value" and "chroma".
+#' @param ignore_numerics _character_. Vector of column names that contain
+#'  numeric  values which should _not_ be aggregated using `weighted.mean()`.
+#'  For example, soil color "value" and "chroma".
 #' @param na.rm _logical_. If `TRUE` `NA` values are ignored when calculating
 #'  min/max boundaries for each group and in weighted averages. If `FALSE` `NA`
 #'   values are propagated to the result. Default: `FALSE`
-#' 
+#'
 #' @details
-#' 
+#'
 #' If a custom function (`FUN`) is used, it should accept arbitrary additional
 #' arguments via an ellipsis (`...`). It is not necessary to do anything with
 #' arguments, but the result should match the number of horizons found in the
 #' input SoilProfileCollection `x`.
-#' 
+#'
 #' @return A _SoilProfileCollection_
 #' @export
 #'
 #' @examples
 #' data(jacobs2000)
 #' 
-#' a <- collapseHz(jacobs2000, c(`A` = "^A", 
-#'                               `E` = "E", 
-#'                               `Bt` = "[ABC]+t", 
-#'                               `C` = "^C", 
-#'                               `foo` = "bar"))
-#' b <- jacobs2000
-#' profile_id(a) <- paste0(profile_id(a), "_collapse")
-#' 
-#' plot(c(a, b), color = "clay")
+#' # use existing generalized horizon labels
+#' new_labels <- c("A", "E", "Bt", "Bh", "C")
+#' patterns <- c("A", "E", "B.*t", "B.*h", "C")
+#'
+#' # calculate a new SPC with genhz column based on patterns
+#' jacobs2000_gen <- generalizeHz(jacobs2000, new = new_labels, pattern = patterns)
 #' 
-#' # custom aggregation function for matrix_color_munsell
+#' # collapse that SPC based on genhz
+#' i <- collapseHz(jacobs2000_gen, hzdesgn = "genhz")
 #' 
-#' a2 <- collapseHz(jacobs2000, c(`A` = "^A", 
-#'                               `E` = "E", 
-#'                               `Bt` = "[ABC]+t", 
-#'                               `C` = "^C", 
-#'                               `foo` = "bar"), 
-#'                AGGFUN = list(matrix_color_munsell = function(x, top, bottom) {
-#'                               thk <- bottom - top
-#'                               if (length(x) > 1) {
-#'                                xord <- order(thk, decreasing = TRUE)
-#'                                paste0(paste0(x[xord], " (t=", thk[xord], ")"), collapse = ", ")
-#'                               } else x
-#'                              })
-#'                )
-#' profile_id(a2) <- paste0(profile_id(a), "_collapse_custom")
+#' profile_id(i) <- paste0(profile_id(i), "_collapse")
+#' plot(c(i, jacobs2000), color = "genhz", name = "name", name.style = "center-center", cex.names = 1)
+#'  
+#' # custom pattern argument  
+#'  
+#' j <- collapseHz(jacobs2000,
+#'                 c(
+#'                   `A` = "^A",
+#'                   `E` = "E",
+#'                   `Bt` = "[ABC]+t",
+#'                   `C` = "^C",
+#'                   `foo` = "bar"
+#'                 ))
+#' profile_id(j) <- paste0(profile_id(j), "_collapse")
+#' plot(c(j, jacobs2000), color = "clay")
 #' 
-#' unique(a2$matrix_color_munsell)
+#' # custom aggregation function for matrix_color_munsell
+#' k <- collapseHz(jacobs2000,
+#'                 pattern = c(
+#'                   `A` = "^A",
+#'                   `E` = "E",
+#'                   `Bt` = "[ABC]+t",
+#'                   `C` = "^C",
+#'                   `foo` = "bar"
+#'                 ),
+#'                 AGGFUN = list(
+#'                   matrix_color_munsell = function(x, top, bottom) {
+#'                     thk <- bottom - top
+#'                     if (length(x) > 1) {
+#'                       xord <- order(thk, decreasing = TRUE)
+#'                       paste0(paste0(x[xord], " (t=", thk[xord], ")"), collapse = ", ")
+#'                     } else
+#'                       x
+#'                   }
+#'                 )
+#'               )
+#' profile_id(k) <- paste0(profile_id(k), "_collapse_custom")
 #' 
+#' unique(k$matrix_color_munsell)
+#
 collapseHz <- function(x,
-                       pattern,
+                       pattern = NULL,
                        hzdesgn = hzdesgnname(x, required = TRUE),
                        FUN = function(x, pattern, hzdesgn, ...) grepl(pattern, x[[hzdesgn]], ignore.case = FALSE),
                        ...,
@@ -79,6 +100,11 @@ collapseHz <- function(x,
                        na.rm = FALSE) {
   idn <- idname(x)
   hzd <- horizonDepths(x)
+  
+  if (is.null(pattern)) {
+    pattern <- unique(as.character(x[[GHL(x, required = TRUE)]]))
+  } 
+  
   if (!is.null(names(pattern))) {
     labels <- names(pattern)
     pattern <- as.character(pattern)
@@ -92,44 +118,64 @@ collapseHz <- function(x,
     if (any(l)) {
       r <- rle(l)
       g <- unlist(sapply(seq(r$lengths), function(i) rep(i, r$lengths[i])))
-      gidx <- g %in% unique(g[l])
-      res <- h[gidx, c(list(hzdeptnew = min(.SD[[hzd[1]]], na.rm = na.rm), 
-                            hzdepbnew = max(.SD[[hzd[2]]], na.rm = na.rm)),
+      hidx <- unlist(sapply(seq(r$lengths), function(i) if (r$lengths[i] == 1) TRUE else rep(FALSE, r$lengths[i]))) & l
+      gidx <- g %in% unique(g[l]) & !hidx
+      
+      res <- h[gidx, c(list(hzdeptnew = suppressWarnings(min(.SD[[hzd[1]]], na.rm = na.rm)), 
+                            hzdepbnew = suppressWarnings(max(.SD[[hzd[2]]], na.rm = na.rm))),
                             sapply(colnames(.SD)[!colnames(.SD) %in% hzd], 
                                    function(n, top, bottom) {
                                     v <- .SD[[n]]
-                                    if (n %in% names(AGGFUN)) {
-                                      
-                                      # custom aggregation function (column name specific)
-                                      AGGFUN[[n]](v, top, bottom)
-                                      
-                                    } else if (!n %in% ignore_numerics && is.numeric(x)) {
-                                      
-                                      # weighted average by thickness (numerics not in exclusion list)
-                                      weighted.mean(v, bottom - top, na.rm = na.rm)
-                                      
+                                    if (length(v) > 1) {
+                                      if (n %in% names(AGGFUN)) {
+                                        
+                                        # custom aggregation function (column name specific)
+                                        AGGFUN[[n]](v, top, bottom)
+                                        
+                                      } else if (!n %in% ignore_numerics && is.numeric(x)) {
+                                        
+                                        # weighted average by thickness (numerics not in exclusion list)
+                                        weighted.mean(v, bottom - top, na.rm = na.rm)
+                                        
+                                      } else {
+                                        # take thickest value
+                                        # v[which.max(bottom - top)[1]]
+                                        
+                                        # take dominant condition (based on sum of thickness)
+                                        cond <- aggregate(bottom - top, by = list(v), sum, na.rm = na.rm)
+                                        cond[[1]][which.max(cond[[2]])[1]]
+                                      }
                                     } else {
-                                      # take thickest value
-                                      # v[which.max(bottom - top)[1]]
-                                      
-                                      # take dominant condition (based on sum of thickness)
-                                      cond <- aggregate(bottom - top, by = list(v), sum, na.rm = na.rm)
-                                      cond[[1]][which.max(cond[[2]])[1]]
+                                      v
                                     }
                                   }, 
                                   top = .SD[[hzd[1]]], 
                                   bottom = .SD[[hzd[2]]])), 
                by = g[gidx]]
+      
       res$g <- NULL
-      res[[hzdesgn]] <- labels[p]
-      h <- h[-which(g %in% unique(g[l])),]
-      h <- data.table::rbindlist(list(h, res), fill = TRUE)
-      h <- h[order(h[[idn]], h[[hzd[1]]]),]
+      
+      res2 <- h[hidx & l, ]
+      res2$hzdeptnew <- res2[[hzd[1]]]
+      res2$hzdepbnew <- res2[[hzd[2]]]
+      res2[[hzd[1]]] <- NULL
+      res2[[hzd[2]]] <- NULL
+      
+      res3 <- rbind(res, res2)
+      
+      res3[[hzdesgn]] <- labels[p]
+      
+      h <- h[-which(g %in% unique(g[l]) | hidx),]
+      h <- data.table::rbindlist(list(h, res3), fill = TRUE)
+      
       hn <- !is.na(h$hzdeptnew) & !is.na(h$hzdepbnew)
       h[[hzd[1]]][hn] <- h$hzdeptnew[hn]
       h[[hzd[2]]][hn] <- h$hzdepbnew[hn]
       h$hzdeptnew <- NULL
       h$hzdepbnew <- NULL
+      
+      h <- h[order(h[[idn]], h[[hzd[1]]]),]
+      
       replaceHorizons(x) <- h
     }
   }
diff --git a/man/collapseHz.Rd b/man/collapseHz.Rd
index 839a3d77a..a7093df8b 100644
--- a/man/collapseHz.Rd
+++ b/man/collapseHz.Rd
@@ -6,7 +6,7 @@
 \usage{
 collapseHz(
   x,
-  pattern,
+  pattern = NULL,
   hzdesgn = hzdesgnname(x, required = TRUE),
   FUN = function(x, pattern, hzdesgn, ...) grepl(pattern, x[[hzdesgn]], ignore.case =
     FALSE),
@@ -19,8 +19,8 @@ collapseHz(
 \arguments{
 \item{x}{A \emph{SoilProfileCollection}}
 
-\item{pattern}{\emph{character}. A regular expression pattern to match in \code{hzdesgn}
-column}
+\item{pattern}{\emph{character}. A regular expression pattern to match in
+\code{hzdesgn} column. Default}
 
 \item{hzdesgn}{\emph{character}. Any character column containing horizon-level
 identifiers. Default is estimated using \code{guessHzDesgnName()}.}
@@ -38,9 +38,9 @@ bottom depths). Default: \code{NULL} applies weighted.mean() to all numeric
 columns not listed in \code{ignore_numerics} and takes the thickest value for all
 other columns.}
 
-\item{ignore_numerics}{\emph{character}. Vector of column names that contain numeric
-values which should \emph{not} be aggregated using \code{weighted.mean()}. For example,
-soil color "value" and "chroma".}
+\item{ignore_numerics}{\emph{character}. Vector of column names that contain
+numeric  values which should \emph{not} be aggregated using \code{weighted.mean()}.
+For example, soil color "value" and "chroma".}
 
 \item{na.rm}{\emph{logical}. If \code{TRUE} \code{NA} values are ignored when calculating
 min/max boundaries for each group and in weighted averages. If \code{FALSE} \code{NA}
@@ -64,33 +64,53 @@ input SoilProfileCollection \code{x}.
 \examples{
 data(jacobs2000)
 
-a <- collapseHz(jacobs2000, c(`A` = "^A", 
-                              `E` = "E", 
-                              `Bt` = "[ABC]+t", 
-                              `C` = "^C", 
-                              `foo` = "bar"))
-b <- jacobs2000
-profile_id(a) <- paste0(profile_id(a), "_collapse")
+# use existing generalized horizon labels
+new_labels <- c("A", "E", "Bt", "Bh", "C")
+patterns <- c("A", "E", "B.*t", "B.*h", "C")
 
-plot(c(a, b), color = "clay")
+# calculate a new SPC with genhz column based on patterns
+jacobs2000_gen <- generalizeHz(jacobs2000, new = new_labels, pattern = patterns)
 
-# custom aggregation function for matrix_color_munsell
+# collapse that SPC based on genhz
+i <- collapseHz(jacobs2000_gen, hzdesgn = "genhz")
 
-a2 <- collapseHz(jacobs2000, c(`A` = "^A", 
-                              `E` = "E", 
-                              `Bt` = "[ABC]+t", 
-                              `C` = "^C", 
-                              `foo` = "bar"), 
-               AGGFUN = list(matrix_color_munsell = function(x, top, bottom) {
-                              thk <- bottom - top
-                              if (length(x) > 1) {
-                               xord <- order(thk, decreasing = TRUE)
-                               paste0(paste0(x[xord], " (t=", thk[xord], ")"), collapse = ", ")
-                              } else x
-                             })
-               )
-profile_id(a2) <- paste0(profile_id(a), "_collapse_custom")
+profile_id(i) <- paste0(profile_id(i), "_collapse")
+plot(c(i, jacobs2000), color = "genhz", name = "name", name.style = "center-center", cex.names = 1)
+ 
+# custom pattern argument  
+ 
+j <- collapseHz(jacobs2000,
+                c(
+                  `A` = "^A",
+                  `E` = "E",
+                  `Bt` = "[ABC]+t",
+                  `C` = "^C",
+                  `foo` = "bar"
+                ))
+profile_id(j) <- paste0(profile_id(j), "_collapse")
+plot(c(j, jacobs2000), color = "clay")
 
-unique(a2$matrix_color_munsell)
+# custom aggregation function for matrix_color_munsell
+k <- collapseHz(jacobs2000,
+                pattern = c(
+                  `A` = "^A",
+                  `E` = "E",
+                  `Bt` = "[ABC]+t",
+                  `C` = "^C",
+                  `foo` = "bar"
+                ),
+                AGGFUN = list(
+                  matrix_color_munsell = function(x, top, bottom) {
+                    thk <- bottom - top
+                    if (length(x) > 1) {
+                      xord <- order(thk, decreasing = TRUE)
+                      paste0(paste0(x[xord], " (t=", thk[xord], ")"), collapse = ", ")
+                    } else
+                      x
+                  }
+                )
+              )
+profile_id(k) <- paste0(profile_id(k), "_collapse_custom")
 
+unique(k$matrix_color_munsell)
 }

From 729e7c33c61c919b1258a1a2a9c94edf5ffec1c8 Mon Sep 17 00:00:00 2001
From: Andrew Gene Brown <andrew.g.brown@usda.gov>
Date: Thu, 10 Oct 2024 19:20:12 -0700
Subject: [PATCH 07/22] add test + docs

---
 R/collapseHz.R                   |  7 ++-----
 man/collapseHz.Rd                |  7 ++-----
 tests/testthat/test-collapseHz.R | 15 +++++++++++++++
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/R/collapseHz.R b/R/collapseHz.R
index a41c0d4a7..9db153b5b 100644
--- a/R/collapseHz.R
+++ b/R/collapseHz.R
@@ -40,21 +40,18 @@
 #' @examples
 #' data(jacobs2000)
 #' 
-#' # use existing generalized horizon labels
+#' # calculate a new SPC with genhz column based on patterns
 #' new_labels <- c("A", "E", "Bt", "Bh", "C")
 #' patterns <- c("A", "E", "B.*t", "B.*h", "C")
-#'
-#' # calculate a new SPC with genhz column based on patterns
 #' jacobs2000_gen <- generalizeHz(jacobs2000, new = new_labels, pattern = patterns)
 #' 
-#' # collapse that SPC based on genhz
+#' # use existing generalized horizon labels
 #' i <- collapseHz(jacobs2000_gen, hzdesgn = "genhz")
 #' 
 #' profile_id(i) <- paste0(profile_id(i), "_collapse")
 #' plot(c(i, jacobs2000), color = "genhz", name = "name", name.style = "center-center", cex.names = 1)
 #'  
 #' # custom pattern argument  
-#'  
 #' j <- collapseHz(jacobs2000,
 #'                 c(
 #'                   `A` = "^A",
diff --git a/man/collapseHz.Rd b/man/collapseHz.Rd
index a7093df8b..fd98894ad 100644
--- a/man/collapseHz.Rd
+++ b/man/collapseHz.Rd
@@ -64,21 +64,18 @@ input SoilProfileCollection \code{x}.
 \examples{
 data(jacobs2000)
 
-# use existing generalized horizon labels
+# calculate a new SPC with genhz column based on patterns
 new_labels <- c("A", "E", "Bt", "Bh", "C")
 patterns <- c("A", "E", "B.*t", "B.*h", "C")
-
-# calculate a new SPC with genhz column based on patterns
 jacobs2000_gen <- generalizeHz(jacobs2000, new = new_labels, pattern = patterns)
 
-# collapse that SPC based on genhz
+# use existing generalized horizon labels
 i <- collapseHz(jacobs2000_gen, hzdesgn = "genhz")
 
 profile_id(i) <- paste0(profile_id(i), "_collapse")
 plot(c(i, jacobs2000), color = "genhz", name = "name", name.style = "center-center", cex.names = 1)
  
 # custom pattern argument  
- 
 j <- collapseHz(jacobs2000,
                 c(
                   `A` = "^A",
diff --git a/tests/testthat/test-collapseHz.R b/tests/testthat/test-collapseHz.R
index 33d598364..222b582e0 100644
--- a/tests/testthat/test-collapseHz.R
+++ b/tests/testthat/test-collapseHz.R
@@ -1,5 +1,20 @@
 test_that("collapseHz works", {
   data("jacobs2000", package = "aqp")
+  .BOTTOM <- NULL
+  
+  # use existing generalized horizon labels
+  new_labels <- c("A", "E", "Bt", "Bh", "C")
+  patterns <- c("A", "E", "B.*t", "B.*h", "C")
+
+  # calculate a new SPC with genhz column based on patterns
+  jacobs2000_gen <- generalizeHz(jacobs2000, new = new_labels, pattern = patterns)
+
+  # collapse that SPC based on genhz
+  i <- collapseHz(jacobs2000_gen, hzdesgn = "genhz")
+  expect_equal(length(jacobs2000), length(i))
+  expect_equal(nrow(i), 26)
+  expect_equal(i[7, , .BOTTOM], c(15, 41, 61, 132, 140, 152))
+
   x <- collapseHz(jacobs2000, c(`A` = "^A",
                                 `E` = "E", 
                                 `Bt` = "[ABC]+t", 

From 17ed9490aec267dcd57224ab84668cf7c441a865 Mon Sep 17 00:00:00 2001
From: Andrew Gene Brown <andrew.g.brown@usda.gov>
Date: Thu, 10 Oct 2024 20:29:02 -0700
Subject: [PATCH 08/22] allow multiple summary statistics in custom aggregation

---
 R/collapseHz.R    | 104 +++++++++++++++++++++++++++++++++++++---------
 man/collapseHz.Rd |  27 ++++++++++++
 2 files changed, 112 insertions(+), 19 deletions(-)

diff --git a/R/collapseHz.R b/R/collapseHz.R
index 9db153b5b..63d6b5e6b 100644
--- a/R/collapseHz.R
+++ b/R/collapseHz.R
@@ -86,6 +86,34 @@
 #' profile_id(k) <- paste0(profile_id(k), "_collapse_custom")
 #' 
 #' unique(k$matrix_color_munsell)
+#' 
+#' # custom aggregation function for matrix_color_munsell (returns data.frame)
+#' m <- collapseHz(jacobs2000,
+#'                 pattern = c(
+#'                   `A` = "^A",
+#'                   `E` = "E",
+#'                   `Bt` = "[ABC]+t",
+#'                   `C` = "^C",
+#'                   `foo` = "bar"
+#'                 ),
+#'                 AGGFUN = list(
+#'                   matrix_color_munsell = function(x, top, bottom) {
+#'                     thk <- bottom - top
+#'                     if (length(x) > 1) {
+#'                       xord <- order(thk, decreasing = TRUE)
+#'                       data.frame(matrix_color_munsell = paste0(x, collapse = ";"),
+#'                                  n_matrix_color = length(x))
+#'                     } else {
+#'                       data.frame(matrix_color_munsell = x, 
+#'                                  n_matrix_color = length(x))
+#'                     }
+#'                   }
+#'                 )
+#'               )
+#' profile_id(m) <- paste0(profile_id(m), "_collapse_custom")
+#' 
+#' m$matrix_color_munsell.n_matrix_color
+#
 #
 collapseHz <- function(x,
                        pattern = NULL,
@@ -98,46 +126,58 @@ collapseHz <- function(x,
   idn <- idname(x)
   hzd <- horizonDepths(x)
   
-  if (is.null(pattern)) {
-    pattern <- unique(as.character(x[[GHL(x, required = TRUE)]]))
-  } 
+  # use exact match of existing genhz labels as default in lieu of pattern
+  if (is.null(pattern) & missing(matchcolumn)) {
+    existing_genhz <- unique(as.character(x[[GHL(x, required = TRUE)]]))
+    pattern <- paste0("^", existing_genhz, "$")
+    labels <- existing_genhz
+  } else if (!missing(matchcolumn)) {
+    pattern <- NA
+  }
   
+  # if a named vector of patterns is given, use the names as new labels
   if (!is.null(names(pattern))) {
     labels <- names(pattern)
     pattern <- as.character(pattern)
   } else {
+    # otherwise, the patterns and labels are the same
     pattern <- as.character(pattern)
     labels <- pattern
   }
+  
+  # iterate over patterns
   for (p in seq(pattern)) {
+    
     h <- data.table::data.table(horizons(x))
+    
+    # calculate matches
     l <- FUN(x, pattern = pattern[p], hzdesgn = hzdesgn, na.rm = na.rm, ...)
+    
     if (any(l)) {
       r <- rle(l)
       g <- unlist(sapply(seq(r$lengths), function(i) rep(i, r$lengths[i])))
       hidx <- unlist(sapply(seq(r$lengths), function(i) if (r$lengths[i] == 1) TRUE else rep(FALSE, r$lengths[i]))) & l
       gidx <- g %in% unique(g[l]) & !hidx
+      naf <- names(AGGFUN)
       
+      # iterate over sets of layers needing aggregation within each matching group
       res <- h[gidx, c(list(hzdeptnew = suppressWarnings(min(.SD[[hzd[1]]], na.rm = na.rm)), 
                             hzdepbnew = suppressWarnings(max(.SD[[hzd[2]]], na.rm = na.rm))),
-                            sapply(colnames(.SD)[!colnames(.SD) %in% hzd], 
+                       
+                            # process numeric depth weighted averages w/ dominant condition otherwise                       
+                            sapply(colnames(.SD)[!colnames(.SD) %in% c(hzd, naf)],
                                    function(n, top, bottom) {
                                     v <- .SD[[n]]
                                     if (length(v) > 1) {
-                                      if (n %in% names(AGGFUN)) {
-                                        
-                                        # custom aggregation function (column name specific)
-                                        AGGFUN[[n]](v, top, bottom)
-                                        
-                                      } else if (!n %in% ignore_numerics && is.numeric(x)) {
-                                        
+                                      if (!n %in% ignore_numerics && is.numeric(x)) {
+
                                         # weighted average by thickness (numerics not in exclusion list)
                                         weighted.mean(v, bottom - top, na.rm = na.rm)
-                                        
+
                                       } else {
                                         # take thickest value
                                         # v[which.max(bottom - top)[1]]
-                                        
+
                                         # take dominant condition (based on sum of thickness)
                                         cond <- aggregate(bottom - top, by = list(v), sum, na.rm = na.rm)
                                         cond[[1]][which.max(cond[[2]])[1]]
@@ -145,34 +185,60 @@ collapseHz <- function(x,
                                     } else {
                                       v
                                     }
-                                  }, 
-                                  top = .SD[[hzd[1]]], 
-                                  bottom = .SD[[hzd[2]]])), 
+                                  },
+                                  top = .SD[[hzd[1]]],
+                                  bottom = .SD[[hzd[2]]]),
+                       
+                         # process custom aggregation functions (may return data.frames)
+                         do.call('c', lapply(colnames(.SD)[colnames(.SD) %in% naf], 
+                                             function(n, top, bottom) {
+                                                       out <- AGGFUN[[n]](.SD[[n]], top, bottom)
+                                                       if (!is.data.frame(out)) {
+                                                         out <- data.frame(out)
+                                                         colnames(out) <- n
+                                                       } else {
+                                                         colnames(out) <- paste0(n, ".", colnames(out))
+                                                       }
+                                                       out
+                                                                         },
+                                             top = .SD[[hzd[1]]], 
+                                             bottom = .SD[[hzd[2]]]))), 
                by = g[gidx]]
       
-      res$g <- NULL
+      # allow for replacing values as well as adding new values with data.frame AGGFUN
+      test1.idx <- na.omit(match(colnames(res), paste0(naf, ".", naf))) 
+      test2.idx <- na.omit(match(paste0(naf, ".", naf), colnames(res)))
+      colnames(res)[test2.idx] <- naf[test1.idx]
       
+      # remove grouping ID
+      res$g <- NULL
+
+      # determine matches that are only a single layer (no aggregation applied)      
       res2 <- h[hidx & l, ]
       res2$hzdeptnew <- res2[[hzd[1]]]
       res2$hzdepbnew <- res2[[hzd[2]]]
       res2[[hzd[1]]] <- NULL
       res2[[hzd[2]]] <- NULL
       
-      res3 <- rbind(res, res2)
-      
+      # combine matches
+      res3 <- data.table::rbindlist(list(res, res2), fill = TRUE)
       res3[[hzdesgn]] <- labels[p]
       
+      # combine matches with horizons that did not match
       h <- h[-which(g %in% unique(g[l]) | hidx),]
       h <- data.table::rbindlist(list(h, res3), fill = TRUE)
       
+      # replace depths
       hn <- !is.na(h$hzdeptnew) & !is.na(h$hzdepbnew)
       h[[hzd[1]]][hn] <- h$hzdeptnew[hn]
       h[[hzd[2]]][hn] <- h$hzdepbnew[hn]
       h$hzdeptnew <- NULL
       h$hzdepbnew <- NULL
       
+      # sort horizons by id name and top depth
       h <- h[order(h[[idn]], h[[hzd[1]]]),]
       
+      # replace horizons in parent SPC
       replaceHorizons(x) <- h
     }
   }
diff --git a/man/collapseHz.Rd b/man/collapseHz.Rd
index fd98894ad..32ea840dd 100644
--- a/man/collapseHz.Rd
+++ b/man/collapseHz.Rd
@@ -110,4 +110,31 @@ k <- collapseHz(jacobs2000,
 profile_id(k) <- paste0(profile_id(k), "_collapse_custom")
 
 unique(k$matrix_color_munsell)
+
+# custom aggregation function for matrix_color_munsell (returns data.frame)
+m <- collapseHz(jacobs2000,
+                pattern = c(
+                  `A` = "^A",
+                  `E` = "E",
+                  `Bt` = "[ABC]+t",
+                  `C` = "^C",
+                  `foo` = "bar"
+                ),
+                AGGFUN = list(
+                  matrix_color_munsell = function(x, top, bottom) {
+                    thk <- bottom - top
+                    if (length(x) > 1) {
+                      xord <- order(thk, decreasing = TRUE)
+                      data.frame(matrix_color_munsell = paste0(x, collapse = ";"),
+                                 n_matrix_color = length(x))
+                    } else {
+                      data.frame(matrix_color_munsell = x, 
+                                 n_matrix_color = length(x))
+                    }
+                  }
+                )
+              )
+profile_id(m) <- paste0(profile_id(m), "_collapse_custom")
+
+m$matrix_color_munsell.n_matrix_color
 }

From 21c3ad83781ff2e35415d6dd6ab985cfa45c2bf4 Mon Sep 17 00:00:00 2001
From: Andrew Gene Brown <andrew.g.brown@usda.gov>
Date: Thu, 10 Oct 2024 22:02:40 -0700
Subject: [PATCH 09/22] implement simplified route for existing group IDs or
 labels

---
 R/collapseHz.R | 44 +++++++++++++++++++++++++++-----------------
 1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/R/collapseHz.R b/R/collapseHz.R
index 63d6b5e6b..27ecafdeb 100644
--- a/R/collapseHz.R
+++ b/R/collapseHz.R
@@ -7,9 +7,12 @@
 #'
 #' @param x A _SoilProfileCollection_
 #' @param pattern _character_. A regular expression pattern to match in
-#' `hzdesgn` column. Default
+#' `hzdesgn` column. Default: `NULL`.
+#' @param by _character_. A column name specifying horizons that should be 
+#'  combined. Aggregation will be applied to adjacent groups of layers within
+#'  profiles that have the same value in `by`. 
 #' @param hzdesgn _character_. Any character column containing horizon-level
-#'  identifiers. Default is estimated using `guessHzDesgnName()`.
+#'  identifiers. Default: `hzdesgnname(x, required = TRUE)`.
 #' @param FUN _function_. A function that returns a _logical_ vector equal in
 #'  length to the number of horizons in `x`. See details.
 #' @param ... Additional arguments passed to the matching function `FUN`.
@@ -46,7 +49,7 @@
 #' jacobs2000_gen <- generalizeHz(jacobs2000, new = new_labels, pattern = patterns)
 #' 
 #' # use existing generalized horizon labels
-#' i <- collapseHz(jacobs2000_gen, hzdesgn = "genhz")
+#' i <- collapseHz(jacobs2000_gen, by = "genhz")
 #' 
 #' profile_id(i) <- paste0(profile_id(i), "_collapse")
 #' plot(c(i, jacobs2000), color = "genhz", name = "name", name.style = "center-center", cex.names = 1)
@@ -113,10 +116,9 @@
 #' profile_id(m) <- paste0(profile_id(m), "_collapse_custom")
 #' 
 #' m$matrix_color_munsell.n_matrix_color
-#
-#
 collapseHz <- function(x,
                        pattern = NULL,
+                       by = NULL,
                        hzdesgn = hzdesgnname(x, required = TRUE),
                        FUN = function(x, pattern, hzdesgn, ...) grepl(pattern, x[[hzdesgn]], ignore.case = FALSE),
                        ...,
@@ -127,11 +129,11 @@ collapseHz <- function(x,
   hzd <- horizonDepths(x)
   
   # use exact match of existing genhz labels as default in lieu of pattern
-  if (is.null(pattern) & missing(matchcolumn)) {
-    existing_genhz <- unique(as.character(x[[GHL(x, required = TRUE)]]))
-    pattern <- paste0("^", existing_genhz, "$")
-    labels <- existing_genhz
-  } else if (!missing(matchcolumn)) {
+  if (is.null(pattern) & missing(by)) {
+    by <- GHL(x, required = TRUE)
+  }
+  
+  if (length(pattern) == 0) {
     pattern <- NA
   }
   
@@ -151,12 +153,18 @@ collapseHz <- function(x,
     h <- data.table::data.table(horizons(x))
     
     # calculate matches
-    l <- FUN(x, pattern = pattern[p], hzdesgn = hzdesgn, na.rm = na.rm, ...)
-    
-    if (any(l)) {
+    if (!is.null(by) && length(pattern) == 1 && is.na(pattern)) {
+      labels <- h[[by]]
+      r <- rle(paste0(h[[idn]], "-", as.character(labels)))
+      l <- rep(TRUE, nrow(h))
+    } else {
+      l <- FUN(x, pattern = pattern[p], hzdesgn = hzdesgn, na.rm = na.rm, ...)
       r <- rle(l)
-      g <- unlist(sapply(seq(r$lengths), function(i) rep(i, r$lengths[i])))
-      hidx <- unlist(sapply(seq(r$lengths), function(i) if (r$lengths[i] == 1) TRUE else rep(FALSE, r$lengths[i]))) & l
+    }
+    
+    if (any(r$lengths > 1)) {
+      g <- unlist(lapply(seq_along(r$lengths), function(i) rep(i, r$lengths[i])))
+      hidx <- unlist(lapply(seq_along(r$lengths), function(i) if (r$lengths[i] == 1) TRUE else rep(FALSE, r$lengths[i]))) & l
       gidx <- g %in% unique(g[l]) & !hidx
       naf <- names(AGGFUN)
       
@@ -179,7 +187,7 @@ collapseHz <- function(x,
                                         # v[which.max(bottom - top)[1]]
 
                                         # take dominant condition (based on sum of thickness)
-                                        cond <- aggregate(bottom - top, by = list(v), sum, na.rm = na.rm)
+                                        cond <- aggregate(bottom - top, by = list(as.character(v)), sum, na.rm = na.rm)
                                         cond[[1]][which.max(cond[[2]])[1]]
                                       }
                                     } else {
@@ -222,7 +230,9 @@ collapseHz <- function(x,
       
       # combine matches
       res3 <- data.table::rbindlist(list(res, res2), fill = TRUE)
-      res3[[hzdesgn]] <- labels[p]
+      if (missing(by)){
+        res3[[hzdesgn]] <- labels[p]
+      }
       
       # combine matches with horizons that did not match
       h <- h[-which(g %in% unique(g[l]) | hidx),]

From 9c3b432b3a2fb15018fd7f6709b122b5be1c1ceb Mon Sep 17 00:00:00 2001
From: Andrew Gene Brown <andrew.g.brown@usda.gov>
Date: Fri, 11 Oct 2024 07:02:35 -0700
Subject: [PATCH 10/22] add test of `by` argument and custom AGGFUN with
 data.frame results

---
 R/collapseHz.R                   | 98 +++++++++++++++++---------------
 tests/testthat/test-collapseHz.R | 38 +++++++++++--
 2 files changed, 85 insertions(+), 51 deletions(-)

diff --git a/R/collapseHz.R b/R/collapseHz.R
index 27ecafdeb..31bd6392a 100644
--- a/R/collapseHz.R
+++ b/R/collapseHz.R
@@ -169,58 +169,61 @@ collapseHz <- function(x,
       naf <- names(AGGFUN)
       
       # iterate over sets of layers needing aggregation within each matching group
-      res <- h[gidx, c(list(hzdeptnew = suppressWarnings(min(.SD[[hzd[1]]], na.rm = na.rm)), 
-                            hzdepbnew = suppressWarnings(max(.SD[[hzd[2]]], na.rm = na.rm))),
-                       
-                            # process numeric depth weighted averages w/ dominant condition otherwise                       
-                            sapply(colnames(.SD)[!colnames(.SD) %in% c(hzd, naf)],
-                                   function(n, top, bottom) {
-                                    v <- .SD[[n]]
-                                    if (length(v) > 1) {
-                                      if (!n %in% ignore_numerics && is.numeric(x)) {
-
-                                        # weighted average by thickness (numerics not in exclusion list)
-                                        weighted.mean(v, bottom - top, na.rm = na.rm)
-
+      if (sum(gidx) > 0){
+        res <- h[gidx, c(list(hzdeptnew = suppressWarnings(min(.SD[[hzd[1]]], na.rm = na.rm)), 
+                              hzdepbnew = suppressWarnings(max(.SD[[hzd[2]]], na.rm = na.rm))),
+                         
+                              # process numeric depth weighted averages w/ dominant condition otherwise                       
+                              sapply(colnames(.SD)[!colnames(.SD) %in% c(hzd, naf)],
+                                     function(n, top, bottom) {
+                                      v <- .SD[[n]]
+                                      if (length(v) > 1) {
+                                        if (!n %in% ignore_numerics && is.numeric(x)) {
+  
+                                          # weighted average by thickness (numerics not in exclusion list)
+                                          weighted.mean(v, bottom - top, na.rm = na.rm)
+  
+                                        } else {
+                                          # take thickest value
+                                          # v[which.max(bottom - top)[1]]
+  
+                                          # take dominant condition (based on sum of thickness)
+                                          cond <- aggregate(bottom - top, by = list(as.character(v)), sum, na.rm = na.rm)
+                                          cond[[1]][which.max(cond[[2]])[1]]
+                                        }
                                       } else {
-                                        # take thickest value
-                                        # v[which.max(bottom - top)[1]]
-
-                                        # take dominant condition (based on sum of thickness)
-                                        cond <- aggregate(bottom - top, by = list(as.character(v)), sum, na.rm = na.rm)
-                                        cond[[1]][which.max(cond[[2]])[1]]
+                                        v
                                       }
-                                    } else {
-                                      v
-                                    }
-                                  },
-                                  top = .SD[[hzd[1]]],
-                                  bottom = .SD[[hzd[2]]]),
-                       
-                         # process custom aggregation functions (may return data.frames)
-                         do.call('c', lapply(colnames(.SD)[colnames(.SD) %in% naf], 
-                                             function(n, top, bottom) {
-                                                       out <- AGGFUN[[n]](.SD[[n]], top, bottom)
-                                                       if (!is.data.frame(out)) {
-                                                         out <- data.frame(out)
-                                                         colnames(out) <- n
-                                                       } else {
-                                                         colnames(out) <- paste0(n, ".", colnames(out))
-                                                       }
-                                                       out
-                                                                         },
-                                             top = .SD[[hzd[1]]], 
-                                             bottom = .SD[[hzd[2]]]))), 
-               by = g[gidx]]
+                                    },
+                                    top = .SD[[hzd[1]]],
+                                    bottom = .SD[[hzd[2]]]),
+                         
+                           # process custom aggregation functions (may return data.frames)
+                           do.call('c', lapply(colnames(.SD)[colnames(.SD) %in% naf], 
+                                               function(n, top, bottom) {
+                                                         out <- AGGFUN[[n]](.SD[[n]], top, bottom)
+                                                         if (!is.data.frame(out)) {
+                                                           out <- data.frame(out)
+                                                           colnames(out) <- n
+                                                         } else {
+                                                           colnames(out) <- paste0(n, ".", colnames(out))
+                                                         }
+                                                         out
+                                                                           },
+                                               top = .SD[[hzd[1]]], 
+                                               bottom = .SD[[hzd[2]]]))), 
+                 by = g[gidx]]
+        # remove grouping ID
+        res$g <- NULL
+      } else {
+        res <- h[0, ]
+      }
       
       # allow for replacing values as well as adding new values with data.frame AGGFUN
       test1.idx <- na.omit(match(colnames(res), paste0(naf, ".", naf))) 
       test2.idx <- na.omit(match(paste0(naf, ".", naf), colnames(res)))
       colnames(res)[test2.idx] <- naf[test1.idx]
       
-      # remove grouping ID
-      res$g <- NULL
-
       # determine matches that are only a single layer (no aggregation applied)      
       res2 <- h[hidx & l, ]
       res2$hzdeptnew <- res2[[hzd[1]]]
@@ -230,12 +233,15 @@ collapseHz <- function(x,
       
       # combine matches
       res3 <- data.table::rbindlist(list(res, res2), fill = TRUE)
-      if (missing(by)){
+      if (missing(by) && nrow(res3) > 0){
         res3[[hzdesgn]] <- labels[p]
       }
       
       # combine matches with horizons that did not match
-      h <- h[-which(g %in% unique(g[l]) | hidx),]
+      agg.idx <- which(g %in% unique(g[l]) | hidx)
+      if (length(agg.idx) > 0) {
+        h <- h[-agg.idx, ]
+      }
       h <- data.table::rbindlist(list(h, res3), fill = TRUE)
       
       # replace depths
diff --git a/tests/testthat/test-collapseHz.R b/tests/testthat/test-collapseHz.R
index 222b582e0..9ec843617 100644
--- a/tests/testthat/test-collapseHz.R
+++ b/tests/testthat/test-collapseHz.R
@@ -11,15 +11,43 @@ test_that("collapseHz works", {
 
   # collapse that SPC based on genhz
   i <- collapseHz(jacobs2000_gen, hzdesgn = "genhz")
+  
   expect_equal(length(jacobs2000), length(i))
   expect_equal(nrow(i), 26)
   expect_equal(i[7, , .BOTTOM], c(15, 41, 61, 132, 140, 152))
 
-  x <- collapseHz(jacobs2000, c(`A` = "^A",
-                                `E` = "E", 
-                                `Bt` = "[ABC]+t", 
-                                `C` = "^C", 
-                                `foo` = "bar"))
+  i <- collapseHz(jacobs2000_gen, by = "genhz")
+  expect_equal(length(jacobs2000), length(i))
+  expect_equal(nrow(i), 26)
+  expect_equal(i[7, , .BOTTOM], c(15, 41, 61, 132, 140, 152))
+  
+  a_pattern <- c(`A` = "^A",
+                 `E` = "E", 
+                 `Bt` = "[ABC]+t", 
+                 `C` = "^C", 
+                 `foo` = "bar")
+  x <- collapseHz(jacobs2000, a_pattern)
   expect_equal(length(jacobs2000), length(x))
   expect_equal(nrow(x), 29)
+  
+  m <- collapseHz(jacobs2000,
+                  pattern = a_pattern,
+                  AGGFUN = list(
+                    matrix_color_munsell = function(x, top, bottom) {
+                      thk <- bottom - top
+                      if (length(x) > 1) {
+                        xord <- order(thk, decreasing = TRUE)
+                        data.frame(matrix_color_munsell = paste0(x, collapse = ";"),
+                                   n_matrix_color = length(x))
+                      } else {
+                        data.frame(matrix_color_munsell = x,
+                                   n_matrix_color = length(x))
+                      }
+                    }
+                  )
+                )
+  profile_id(m) <- paste0(profile_id(m), "_collapse_custom")
+
+  expect_true(all(c("matrix_color_munsell", "matrix_color_munsell.n_matrix_color") %in% names(m)))
+  expect_equal(nrow(m), 29)
 })

From f01ffe2ff783b42985e6e8c49e8dd9bad78f46c5 Mon Sep 17 00:00:00 2001
From: Andrew Gene Brown <andrew.g.brown@usda.gov>
Date: Fri, 11 Oct 2024 07:20:26 -0700
Subject: [PATCH 11/22] doc

---
 R/collapseHz.R    | 78 ++++++++++++++++++++++++++-----------------
 man/collapseHz.Rd | 84 ++++++++++++++++++++++++++++++-----------------
 2 files changed, 100 insertions(+), 62 deletions(-)

diff --git a/R/collapseHz.R b/R/collapseHz.R
index 31bd6392a..ec470ef07 100644
--- a/R/collapseHz.R
+++ b/R/collapseHz.R
@@ -1,43 +1,52 @@
 #' Collapse Horizons within Profiles Based on Pattern Matching
 #'
-#' Combines and aggregates data for layers by grouping adjacent horizons that
-#' match `pattern` in `hzdesgn`. Numeric properties are combined using the
-#' weighted average, and other properties are derived from the thickest horizon
-#' in each group.
+#' Combines layers and aggregates data by grouping adjacent horizons which match `pattern` in
+#' `hzdesgn` or, alternately, share a common value in `by` argument. Numeric properties are combined
+#' using the weighted average, and other properties are derived from the dominant condition based on
+#' thickness of layers and values in each group.
 #'
 #' @param x A _SoilProfileCollection_
-#' @param pattern _character_. A regular expression pattern to match in
-#' `hzdesgn` column. Default: `NULL`.
-#' @param by _character_. A column name specifying horizons that should be 
-#'  combined. Aggregation will be applied to adjacent groups of layers within
-#'  profiles that have the same value in `by`. 
-#' @param hzdesgn _character_. Any character column containing horizon-level
-#'  identifiers. Default: `hzdesgnname(x, required = TRUE)()`.
-#' @param FUN _function_. A function that returns a _logical_ vector equal in
-#'  length to the number of horizons in `x`. See details.
+#' @param pattern _character_. A regular expression pattern to match in `hzdesgn` column. Default:
+#'   `NULL`.
+#' @param by _character_. A column name specifying horizons that should be combined. Aggregation
+#'   will be applied to adjacent groups of layers within profiles that have the same value in `by`.
+#'   Used in lieu of `pattern` and `hzdesgn`. Default: `NULL`.
+#' @param hzdesgn _character_. Any character column containing horizon-level identifiers. Default:
+#'   `hzdesgnname(x, required = TRUE)`.
+#' @param FUN _function_. A function that returns a _logical_ vector equal in length to the number
+#'   of horizons in `x`. Used only when `pattern` is specified. See details.
 #' @param ... Additional arguments passed to the matching function `FUN`.
-#' @param AGGFUN _list_. A named list containing custom aggregation functions.
-#'  List element names should match the column name that they transform. The
-#'  functions defined should take three arguments: `x` (a vector of horizon
-#'  property values), `top` (a vector of top depths), and `bottom` (a vector of
-#'  bottom depths). Default: `NULL` applies weighted.mean() to all numeric
-#'  columns not listed in `ignore_numerics` and takes the thickest value for all
-#'  other columns.
-#' @param ignore_numerics _character_. Vector of column names that contain
-#'  numeric  values which should _not_ be aggregated using `weighted.mean()`.
-#'  For example, soil color "value" and "chroma".
-#' @param na.rm _logical_. If `TRUE` `NA` values are ignored when calculating
-#'  min/max boundaries for each group and in weighted averages. If `FALSE` `NA`
-#'   values are propagated to the result. Default: `FALSE`
+#' @param AGGFUN _list_. A _named_ list containing custom aggregation functions. List element names
+#'   should match the column name that they transform. The functions defined should take three
+#'   arguments: `x` (a vector of horizon property values), `top` (a vector of top depths), and
+#'   `bottom` (a vector of bottom depths). Default: `NULL` applies `weighted.mean()` to all numeric
+#'   columns not listed in `ignore_numerics` and takes the dominant condition (value with greatest
+#'   aggregate thickness sum) for all other columns. See details.
+#' @param ignore_numerics _character_. Vector of column names that contain numeric values which
+#'   should _not_ be aggregated using `weighted.mean()`. For example, soil color "value" and
+#'   "chroma".
+#' @param na.rm _logical_. If `TRUE` `NA` values are ignored when calculating min/max boundaries for
+#'   each group and in weighted averages. If `FALSE` `NA` values are propagated to the result.
+#'   Default: `FALSE`.
 #'
 #' @details
 #'
-#' If a custom function (`FUN`) is used, it should accept arbitrary additional
-#' arguments via an ellipsis (`...`). It is not necessary to do anything with
-#' arguments, but the result should match the number of horizons found in the
-#' input SoilProfileCollection `x`.
+#' If a custom matching function (`FUN`) is used, it should accept arbitrary additional arguments
+#' via an ellipsis (`...`). It is not necessary to do anything with arguments, but the result should
+#' match the number of horizons found in the input SoilProfileCollection `x`.
 #'
+#' Custom aggregation functions defined in the `AGGFUN` argument should either return a single
+#' vector value for each group*column combination, or should return a _data.frame_ object with named
+#' columns. If the input column name is used as a column name in the result _data.frame_, then the
+#' values of that column name in the result _SoilProfileCollection_ will be replaced by the output
+#' of the aggregation function. See examples.
+#' 
 #' @return A _SoilProfileCollection_
+#' 
+#' @author Andrew G. Brown
+#' 
+#' @seealso `hz_dissolve()`
+#' 
 #' @export
 #'
 #' @examples
@@ -52,7 +61,14 @@
 #' i <- collapseHz(jacobs2000_gen, by = "genhz")
 #' 
 #' profile_id(i) <- paste0(profile_id(i), "_collapse")
-#' plot(c(i, jacobs2000), color = "genhz", name = "name", name.style = "center-center", cex.names = 1)
+#' 
+#' plot(
+#'   c(i, jacobs2000),
+#'   color = "genhz",
+#'   name = "name",
+#'   name.style = "center-center",
+#'   cex.names = 1
+#' )
 #'  
 #' # custom pattern argument  
 #' j <- collapseHz(jacobs2000,
diff --git a/man/collapseHz.Rd b/man/collapseHz.Rd
index 32ea840dd..a13a863a7 100644
--- a/man/collapseHz.Rd
+++ b/man/collapseHz.Rd
@@ -7,6 +7,7 @@
 collapseHz(
   x,
   pattern = NULL,
+  by = NULL,
   hzdesgn = hzdesgnname(x, required = TRUE),
   FUN = function(x, pattern, hzdesgn, ...) grepl(pattern, x[[hzdesgn]], ignore.case =
     FALSE),
@@ -19,47 +20,55 @@ collapseHz(
 \arguments{
 \item{x}{A \emph{SoilProfileCollection}}
 
-\item{pattern}{\emph{character}. A regular expression pattern to match in
-\code{hzdesgn} column. Default}
+\item{pattern}{\emph{character}. A regular expression pattern to match in \code{hzdesgn} column. Default:
+\code{NULL}.}
 
-\item{hzdesgn}{\emph{character}. Any character column containing horizon-level
-identifiers. Default is estimated using \code{guessHzDesgnName()}.}
+\item{by}{\emph{character}. A column name specifying horizons that should be combined. Aggregation
+will be applied to adjacent groups of layers within profiles that have the same value in \code{by}.
+Used in lieu of \code{pattern} and \code{hzdesgn}. Default: \code{NULL}.}
 
-\item{FUN}{\emph{function}. A function that returns a \emph{logical} vector equal in
-length to the number of horizons in \code{x}. See details.}
+\item{hzdesgn}{\emph{character}. Any character column containing horizon-level identifiers. Default:
+\code{hzdesgnname(x, required = TRUE)}.}
+
+\item{FUN}{\emph{function}. A function that returns a \emph{logical} vector equal in length to the number
+of horizons in \code{x}. Used only when \code{pattern} is specified. See details.}
 
 \item{...}{Additional arguments passed to the matching function \code{FUN}.}
 
-\item{AGGFUN}{\emph{list}. A named list containing custom aggregation functions.
-List element names should match the column name that they transform. The
-functions defined should take three arguments: \code{x} (a vector of horizon
-property values), \code{top} (a vector of top depths), and \code{bottom} (a vector of
-bottom depths). Default: \code{NULL} applies weighted.mean() to all numeric
-columns not listed in \code{ignore_numerics} and takes the thickest value for all
-other columns.}
-
-\item{ignore_numerics}{\emph{character}. Vector of column names that contain
-numeric  values which should \emph{not} be aggregated using \code{weighted.mean()}.
-For example, soil color "value" and "chroma".}
-
-\item{na.rm}{\emph{logical}. If \code{TRUE} \code{NA} values are ignored when calculating
-min/max boundaries for each group and in weighted averages. If \code{FALSE} \code{NA}
-values are propagated to the result. Default: \code{FALSE}}
+\item{AGGFUN}{\emph{list}. A \emph{named} list containing custom aggregation functions. List element names
+should match the column name that they transform. The functions defined should take three
+arguments: \code{x} (a vector of horizon property values), \code{top} (a vector of top depths), and
+\code{bottom} (a vector of bottom depths). Default: \code{NULL} applies \code{weighted.mean()} to all numeric
+columns not listed in \code{ignore_numerics} and takes the dominant condition (value with greatest
+aggregate thickness sum) for all other columns. See details.}
+
+\item{ignore_numerics}{\emph{character}. Vector of column names that contain numeric values which
+should \emph{not} be aggregated using \code{weighted.mean()}. For example, soil color "value" and
+"chroma".}
+
+\item{na.rm}{\emph{logical}. If \code{TRUE} \code{NA} values are ignored when calculating min/max boundaries for
+each group and in weighted averages. If \code{FALSE} \code{NA} values are propagated to the result.
+Default: \code{FALSE}.}
 }
 \value{
 A \emph{SoilProfileCollection}
 }
 \description{
-Combines and aggregates data for layers by grouping adjacent horizons that
-match \code{pattern} in \code{hzdesgn}. Numeric properties are combined using the
-weighted average, and other properties are derived from the thickest horizon
-in each group.
+Combines layers and aggregates data by grouping adjacent horizons which match \code{pattern} in
+\code{hzdesgn} or, alternately, share a common value in \code{by} argument. Numeric properties are combined
+using the weighted average, and other properties are derived from the dominant condition based on
+thickness of layers and values in each group.
 }
 \details{
-If a custom function (\code{FUN}) is used, it should accept arbitrary additional
-arguments via an ellipsis (\code{...}). It is not necessary to do anything with
-arguments, but the result should match the number of horizons found in the
-input SoilProfileCollection \code{x}.
+If a custom matching function (\code{FUN}) is used, it should accept arbitrary additional arguments
+via an ellipsis (\code{...}). It is not necessary to do anything with arguments, but the result should
+match the number of horizons found in the input SoilProfileCollection \code{x}.
+
+Custom aggregation functions defined in the \code{AGGFUN} argument should either return a single
+vector value for each group*column combination, or should return a \emph{data.frame} object with named
+columns. If the input column name is used as a column name in the result \emph{data.frame}, then the
+values of that column name in the result \emph{SoilProfileCollection} will be replaced by the output
+of the aggregation function. See examples.
 }
 \examples{
 data(jacobs2000)
@@ -70,10 +79,17 @@ patterns <- c("A", "E", "B.*t", "B.*h", "C")
 jacobs2000_gen <- generalizeHz(jacobs2000, new = new_labels, pattern = patterns)
 
 # use existing generalized horizon labels
-i <- collapseHz(jacobs2000_gen, hzdesgn = "genhz")
+i <- collapseHz(jacobs2000_gen, by = "genhz")
 
 profile_id(i) <- paste0(profile_id(i), "_collapse")
-plot(c(i, jacobs2000), color = "genhz", name = "name", name.style = "center-center", cex.names = 1)
+
+plot(
+  c(i, jacobs2000),
+  color = "genhz",
+  name = "name",
+  name.style = "center-center",
+  cex.names = 1
+)
  
 # custom pattern argument  
 j <- collapseHz(jacobs2000,
@@ -138,3 +154,9 @@ profile_id(m) <- paste0(profile_id(m), "_collapse_custom")
 
 m$matrix_color_munsell.n_matrix_color
 }
+\seealso{
+\code{hz_dissolve()}
+}
+\author{
+Andrew G. Brown
+}

From 009801059e1e3990c9417206ec31b54328fa529c Mon Sep 17 00:00:00 2001
From: Andrew Gene Brown <andrew.g.brown@usda.gov>
Date: Fri, 11 Oct 2024 09:15:12 -0700
Subject: [PATCH 12/22] add comment about when aggregation is used

---
 R/collapseHz.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/collapseHz.R b/R/collapseHz.R
index ec470ef07..b08fb1ff1 100644
--- a/R/collapseHz.R
+++ b/R/collapseHz.R
@@ -178,6 +178,7 @@ collapseHz <- function(x,
       r <- rle(l)
     }
     
+    # only apply aggregation if there are adjacent horizons that match the target criteria
     if (any(r$lengths > 1)) {
       g <- unlist(lapply(seq_along(r$lengths), function(i) rep(i, r$lengths[i])))
       hidx <- unlist(lapply(seq_along(r$lengths), function(i) if (r$lengths[i] == 1) TRUE else rep(FALSE, r$lengths[i]))) & l

From 424ebbe86f4f6e4b112691f3481a5d50a6c61d3d Mon Sep 17 00:00:00 2001
From: Andrew Gene Brown <andrew.g.brown@usda.gov>
Date: Fri, 11 Oct 2024 10:19:54 -0700
Subject: [PATCH 13/22] move horizon extraction and replacement outside the
 loop

---
 R/collapseHz.R | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/R/collapseHz.R b/R/collapseHz.R
index b08fb1ff1..9492bfc87 100644
--- a/R/collapseHz.R
+++ b/R/collapseHz.R
@@ -163,11 +163,11 @@ collapseHz <- function(x,
     labels <- pattern
   }
   
+  h <- data.table::data.table(horizons(x))
+  
   # iterate over patterns
   for (p in seq(pattern)) {
     
-    h <- data.table::data.table(horizons(x))
-    
     # calculate matches
     if (!is.null(by) && length(pattern) == 1 && is.na(pattern)) {
       labels <- h[[by]]
@@ -271,9 +271,10 @@ collapseHz <- function(x,
       # sort horizons by id name and top depth
       h <- h[order(h[[idn]], h[[hzd[1]]]),]
       
-      # replace horizons in parent SPC
-      replaceHorizons(x) <- h
     }
+    
+    # replace horizons in parent SPC
+    replaceHorizons(x) <- h
   }
   x
 }

From 489d2a815c7c49c56453f965da15a20c7358b8e1 Mon Sep 17 00:00:00 2001
From: Andrew Gene Brown <andrew.g.brown@usda.gov>
Date: Fri, 11 Oct 2024 13:11:32 -0700
Subject: [PATCH 14/22] fix issue with aggregated columns always returning
 character + test

---
 R/collapseHz.R                   | 77 ++++++++++++++++----------------
 tests/testthat/test-collapseHz.R | 11 ++++-
 2 files changed, 49 insertions(+), 39 deletions(-)

diff --git a/R/collapseHz.R b/R/collapseHz.R
index 9492bfc87..868712f90 100644
--- a/R/collapseHz.R
+++ b/R/collapseHz.R
@@ -192,43 +192,44 @@ collapseHz <- function(x,
                          
                               # process numeric depth weighted averages w/ dominant condition otherwise                       
                               sapply(colnames(.SD)[!colnames(.SD) %in% c(hzd, naf)],
-                                     function(n, top, bottom) {
-                                      v <- .SD[[n]]
-                                      if (length(v) > 1) {
-                                        if (!n %in% ignore_numerics && is.numeric(x)) {
-  
-                                          # weighted average by thickness (numerics not in exclusion list)
-                                          weighted.mean(v, bottom - top, na.rm = na.rm)
-  
-                                        } else {
-                                          # take thickest value
-                                          # v[which.max(bottom - top)[1]]
-  
-                                          # take dominant condition (based on sum of thickness)
-                                          cond <- aggregate(bottom - top, by = list(as.character(v)), sum, na.rm = na.rm)
-                                          cond[[1]][which.max(cond[[2]])[1]]
+                                       function(n, top, bottom) {
+                                        v <- .SD[[n]]
+                                        if (length(v) > 1) {
+                                          if (!n %in% ignore_numerics && is.numeric(v)) {
+    
+                                            # weighted average by thickness (numerics not in exclusion list)
+                                            v <- weighted.mean(v, bottom - top, na.rm = na.rm)
+    
+                                          } else {
+                                            # take thickest value
+                                            # v[which.max(bottom - top)[1]]
+    
+                                            # take dominant condition (based on sum of thickness)
+                                            cond <- aggregate(bottom - top, by = list(as.character(v)), sum, na.rm = na.rm)
+                                            v <- cond[[1]][which.max(cond[[2]])[1]]
+                                          }
                                         }
-                                      } else {
-                                        v
-                                      }
-                                    },
-                                    top = .SD[[hzd[1]]],
-                                    bottom = .SD[[hzd[2]]]),
+                                        out <- data.frame(v)
+                                        colnames(out) <- n
+                                        out
+                                      },
+                                      top = .SD[[hzd[1]]],
+                                      bottom = .SD[[hzd[2]]]),
                          
-                           # process custom aggregation functions (may return data.frames)
-                           do.call('c', lapply(colnames(.SD)[colnames(.SD) %in% naf], 
-                                               function(n, top, bottom) {
-                                                         out <- AGGFUN[[n]](.SD[[n]], top, bottom)
-                                                         if (!is.data.frame(out)) {
-                                                           out <- data.frame(out)
-                                                           colnames(out) <- n
-                                                         } else {
-                                                           colnames(out) <- paste0(n, ".", colnames(out))
-                                                         }
-                                                         out
-                                                                           },
-                                               top = .SD[[hzd[1]]], 
-                                               bottom = .SD[[hzd[2]]]))), 
+                             # process custom aggregation functions (may return data.frames)
+                             do.call('c', lapply(colnames(.SD)[colnames(.SD) %in% naf], 
+                                                 function(n, top, bottom) {
+                                                           out <- AGGFUN[[n]](.SD[[n]], top, bottom)
+                                                           if (!is.data.frame(out)) {
+                                                             out <- data.frame(out)
+                                                             colnames(out) <- n
+                                                           } else {
+                                                             colnames(out) <- paste0(n, ".", colnames(out))
+                                                           }
+                                                           out
+                                                                             },
+                                                 top = .SD[[hzd[1]]], 
+                                                 bottom = .SD[[hzd[2]]]))), 
                  by = g[gidx]]
         # remove grouping ID
         res$g <- NULL
@@ -237,9 +238,9 @@ collapseHz <- function(x,
       }
       
       # allow for replacing values as well as adding new values with data.frame AGGFUN
-      test1.idx <- na.omit(match(colnames(res), paste0(naf, ".", naf))) 
-      test2.idx <- na.omit(match(paste0(naf, ".", naf), colnames(res)))
-      colnames(res)[test2.idx] <- naf[test1.idx]
+      test1.idx <- na.omit(match(colnames(res), paste0(colnames(h), ".", colnames(h)))) 
+      test2.idx <- na.omit(match(paste0(colnames(h), ".", colnames(h)), colnames(res)))
+      colnames(res)[test2.idx] <- colnames(h)[test1.idx]
       
       # determine matches that are only a single layer (no aggregation applied)      
       res2 <- h[hidx & l, ]
diff --git a/tests/testthat/test-collapseHz.R b/tests/testthat/test-collapseHz.R
index 9ec843617..f0ada978d 100644
--- a/tests/testthat/test-collapseHz.R
+++ b/tests/testthat/test-collapseHz.R
@@ -11,15 +11,23 @@ test_that("collapseHz works", {
 
   # collapse that SPC based on genhz
   i <- collapseHz(jacobs2000_gen, hzdesgn = "genhz")
-  
   expect_equal(length(jacobs2000), length(i))
   expect_equal(nrow(i), 26)
   expect_equal(i[7, , .BOTTOM], c(15, 41, 61, 132, 140, 152))
 
+  # collapses adjacent horizons with same label
   i <- collapseHz(jacobs2000_gen, by = "genhz")
+  
+  # no effect, horizon designations are unique within profiles
+  j <- collapseHz(jacobs2000_gen, by = "name")
+  
   expect_equal(length(jacobs2000), length(i))
   expect_equal(nrow(i), 26)
+  expect_equal(nrow(j), 46)
   expect_equal(i[7, , .BOTTOM], c(15, 41, 61, 132, 140, 152))
+  expect_equal(j[7, , .BOTTOM], jacobs2000[7, , .BOTTOM])
+  expect_true(is.numeric(i$clay))
+  expect_true(is.numeric(j$clay))
   
   a_pattern <- c(`A` = "^A",
                  `E` = "E", 
@@ -29,6 +37,7 @@ test_that("collapseHz works", {
   x <- collapseHz(jacobs2000, a_pattern)
   expect_equal(length(jacobs2000), length(x))
   expect_equal(nrow(x), 29)
+  expect_true(is.numeric(x$clay))
   
   m <- collapseHz(jacobs2000,
                   pattern = a_pattern,

From 5bcac0734a6b941065f0c34e630b8e29c0c96bb4 Mon Sep 17 00:00:00 2001
From: Andrew Gene Brown <andrew.g.brown@usda.gov>
Date: Fri, 11 Oct 2024 13:54:19 -0700
Subject: [PATCH 15/22] add weighted average and dominant condition tests, with
 and without NA

---
 tests/testthat/test-collapseHz.R | 36 ++++++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/tests/testthat/test-collapseHz.R b/tests/testthat/test-collapseHz.R
index f0ada978d..5893ef955 100644
--- a/tests/testthat/test-collapseHz.R
+++ b/tests/testthat/test-collapseHz.R
@@ -9,6 +9,9 @@ test_that("collapseHz works", {
   # calculate a new SPC with genhz column based on patterns
   jacobs2000_gen <- generalizeHz(jacobs2000, new = new_labels, pattern = patterns)
 
+  # create a missing value
+  jacobs2000_gen$clay[19] <- NA
+  
   # collapse that SPC based on genhz
   i <- collapseHz(jacobs2000_gen, hzdesgn = "genhz")
   expect_equal(length(jacobs2000), length(i))
@@ -17,15 +20,44 @@ test_that("collapseHz works", {
 
   # collapses adjacent horizons with same label
   i <- collapseHz(jacobs2000_gen, by = "genhz")
+  ii <- collapseHz(jacobs2000_gen, by = "genhz", na.rm = TRUE)
   
   # no effect, horizon designations are unique within profiles
   j <- collapseHz(jacobs2000_gen, by = "name")
   
+  expect_equal(nrow(j), 46)
+  expect_equal(j[7, , .BOTTOM], jacobs2000[7, , .BOTTOM])
+  
+  # if using `by` argument, all values must not be NA
+  expect_error(collapseHz(jacobs2000_gen, by = "matrix_color_munsell"),
+               "Missing values are not allowed")
+  
+  # matches input number of profiles
   expect_equal(length(jacobs2000), length(i))
+  
+  # horizons have been collapsed
   expect_equal(nrow(i), 26)
-  expect_equal(nrow(j), 46)
+  
+  # weighted mean (no NA values) works as expected (clay=47.15)
+  expect_equal(i$clay[4],
+               weighted.mean(jacobs2000_gen$clay[6:7], (jacobs2000_gen$bottom - jacobs2000_gen$top)[6:7]))
+  
+  # weighted mean (contains NA values, na.rm=FALSE) (clay is NA)
+  expect_true(is.na(i$clay[11]))
+  
+  # weighted mean (contains NA values, na.rm=TRUE, clay=18.72414)
+  expect_equal(ii$clay[11],
+               weighted.mean(jacobs2000_gen$clay[17:20], (jacobs2000_gen$bottom - jacobs2000_gen$top)[17:20], na.rm = TRUE))
+  
+  # dominant condition (NA values retained)
+  expect_true(is.na(i$depletion_munsell[13]))
+  
+  # dominant condition (NA values removed)
+  expect_equal(ii$depletion_munsell[13], "10YR 8/2")
+  
+  plot(jacobs2000_gen, color = "concentration_pct")
+  
   expect_equal(i[7, , .BOTTOM], c(15, 41, 61, 132, 140, 152))
-  expect_equal(j[7, , .BOTTOM], jacobs2000[7, , .BOTTOM])
   expect_true(is.numeric(i$clay))
   expect_true(is.numeric(j$clay))
   

From 268497df13e5151ffc075beda411e7a6306d091c Mon Sep 17 00:00:00 2001
From: Andrew Gene Brown <andrew.g.brown@usda.gov>
Date: Fri, 11 Oct 2024 13:54:43 -0700
Subject: [PATCH 16/22] NA in `by` argument not allowed

---
 R/collapseHz.R | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/R/collapseHz.R b/R/collapseHz.R
index 868712f90..a7c9f8137 100644
--- a/R/collapseHz.R
+++ b/R/collapseHz.R
@@ -171,6 +171,11 @@ collapseHz <- function(x,
     # calculate matches
     if (!is.null(by) && length(pattern) == 1 && is.na(pattern)) {
       labels <- h[[by]]
+      
+      if (any(is.na(labels))) {
+        stop("Missing values are not allowed in `by` column argument", call. = FALSE)
+      }
+      
       r <- rle(paste0(h[[idn]], "-", as.character(labels)))
       l <- rep(TRUE, nrow(h))
     } else {

From 1a848832eb4c6b8b6b70d06eccc786b59a5f1749 Mon Sep 17 00:00:00 2001
From: Andrew Gene Brown <andrew.g.brown@usda.gov>
Date: Fri, 11 Oct 2024 13:55:20 -0700
Subject: [PATCH 17/22] handle NA values (when `na.rm=FALSE`) in aggregation of
 thickness for dominant condition

---
 R/collapseHz.R | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/R/collapseHz.R b/R/collapseHz.R
index a7c9f8137..5e24de0ac 100644
--- a/R/collapseHz.R
+++ b/R/collapseHz.R
@@ -209,9 +209,22 @@ collapseHz <- function(x,
                                             # take thickest value
                                             # v[which.max(bottom - top)[1]]
     
+                                            # convert factors etc to character
+                                            # results may not conform with existing factor levels
+                                            v <- as.character(v)
+                                            
+                                            # replace NA values for use in aggregate()
+                                            if (!na.rm) {
+                                              v[is.na(v)] <- "<collapseHZ-category-missing>"
+                                            } 
+                                            
                                             # take dominant condition (based on sum of thickness)
-                                            cond <- aggregate(bottom - top, by = list(as.character(v)), sum, na.rm = na.rm)
+                                            cond <- aggregate(bottom - top, by = list(v), sum, na.rm = na.rm)
                                             v <- cond[[1]][which.max(cond[[2]])[1]]
+                                            
+                                            if (!na.rm) {
+                                              v[v == "<collapseHZ-category-missing>"] <- NA
+                                            }
                                           }
                                         }
                                         out <- data.frame(v)

From 2902a8a8b922301e5868252ddf8792fdfb233e0d Mon Sep 17 00:00:00 2001
From: Andrew Gene Brown <andrew.g.brown@usda.gov>
Date: Fri, 11 Oct 2024 14:10:59 -0700
Subject: [PATCH 18/22] tests of empty SPC, filled SPC, missing `by` column

---
 R/collapseHz.R                   |  5 +++++
 tests/testthat/test-collapseHz.R | 16 ++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/R/collapseHz.R b/R/collapseHz.R
index 5e24de0ac..3e56dbb88 100644
--- a/R/collapseHz.R
+++ b/R/collapseHz.R
@@ -170,6 +170,11 @@ collapseHz <- function(x,
     
     # calculate matches
     if (!is.null(by) && length(pattern) == 1 && is.na(pattern)) {
+      
+      if (!by %in% horizonNames(x)) {
+        stop("Column name `by` (\"", by, ") is not a horizon-level variable.", call. = FALSE) 
+      }
+      
       labels <- h[[by]]
       
       if (any(is.na(labels))) {
diff --git a/tests/testthat/test-collapseHz.R b/tests/testthat/test-collapseHz.R
index 5893ef955..d186a8c05 100644
--- a/tests/testthat/test-collapseHz.R
+++ b/tests/testthat/test-collapseHz.R
@@ -32,6 +32,9 @@ test_that("collapseHz works", {
   expect_error(collapseHz(jacobs2000_gen, by = "matrix_color_munsell"),
                "Missing values are not allowed")
   
+  # `by` column must also be a horizon-level variable
+  expect_error(collapseHz(jacobs2000, by = "genhz"), "not a horizon-level variable")
+  
   # matches input number of profiles
   expect_equal(length(jacobs2000), length(i))
   
@@ -61,6 +64,19 @@ test_that("collapseHz works", {
   expect_true(is.numeric(i$clay))
   expect_true(is.numeric(j$clay))
   
+  # "works" on empty SPC ()
+  expect_equal(nrow(collapseHz(jacobs2000_gen[0,], by = "genhz")), 0)
+                
+  # works on SPC with filled profile (1 horizon with NA depths)
+  all_na <- subsetHz(jacobs2000_gen[1,], TRUE)
+  all_na$top <- NA
+  all_na$bottom <- NA
+  expect_warning(na_nonna <- c(all_na, jacobs2000_gen[2:5,]))
+  expect_silent(f <- collapseHz(all_na, by = "genhz"))
+  expect_silent(n <- collapseHz(na_nonna, by = "genhz"))
+  expect_equal(nrow(n), 14)
+
+  
   a_pattern <- c(`A` = "^A",
                  `E` = "E", 
                  `Bt` = "[ABC]+t", 

From 80dc40f9f70f7516dffdbf9876e266a061633d77 Mon Sep 17 00:00:00 2001
From: Andrew Gene Brown <andrew.g.brown@usda.gov>
Date: Fri, 11 Oct 2024 14:49:33 -0700
Subject: [PATCH 19/22] add depth screening function to guide user to
 `checkHzDepthLogic()`

---
 R/SoilProfileCollection-setters.R | 15 +++++++++++++++
 R/collapseHz.R                    |  2 ++
 tests/testthat/test-collapseHz.R  |  9 +++++----
 3 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/R/SoilProfileCollection-setters.R b/R/SoilProfileCollection-setters.R
index f05bc454d..0322fc002 100644
--- a/R/SoilProfileCollection-setters.R
+++ b/R/SoilProfileCollection-setters.R
@@ -90,6 +90,18 @@ setReplaceMethod("depths", "data.frame",
   return(depth)
 }
 
+.checkDepthOrder <- function(x, depthcols) {
+  # TRUE where a bottom depth is less than its top depth; NA comparisons are dropped by any()
+  inverted <- x[[depthcols[2]]] < x[[depthcols[1]]]
+  if (any(inverted, na.rm = TRUE)) warning("One or more horizon bottom depths are shallower than top depth. Check depth logic with aqp::checkHzDepthLogic()", call. = FALSE)
+}
+
+.screenDepths <- function(x, depthcols = horizonDepths(x)) {
+  # run .checkNAdepths() on top, then bottom, depths (return values unused), then the order check
+  for (i in 1:2) .checkNAdepths(x[[depthcols[i]]], c("top", "bottom")[i])
+  .checkDepthOrder(x, depthcols)
+}
+
 # create 0-length spc from id and horizon depth columns (`idn`, `hzd`)
 #  - allows template horizon (`hz`) and site (`st`) data to be provided (for additional columns)
 .prototypeSPC <- function(idn, hzd, 
@@ -178,6 +190,9 @@ setReplaceMethod("depths", "data.frame",
   data[[depthcols[1]]] <- .checkNAdepths(data[[depthcols[1]]], "top")
   data[[depthcols[2]]] <- .checkNAdepths(data[[depthcols[2]]], "bottom")
   
+  # warn if bottom depth shallower than top (old style O horizons, data entry issues, etc.)
+  .checkDepthOrder(data, depthcols)
+  
   tdep <- data[[depthcols[1]]]
 
   # calculate ID-top depth order, re-order input data
diff --git a/R/collapseHz.R b/R/collapseHz.R
index 3e56dbb88..2f6cc52a8 100644
--- a/R/collapseHz.R
+++ b/R/collapseHz.R
@@ -144,6 +144,8 @@ collapseHz <- function(x,
   idn <- idname(x)
   hzd <- horizonDepths(x)
   
+  .screenDepths(x, hzd)
+  
   # use exact match of existing genhz labels as default in lieu of pattern
   if (is.null(pattern) & missing(by)) {
     by <- GHL(x, required = TRUE)
diff --git a/tests/testthat/test-collapseHz.R b/tests/testthat/test-collapseHz.R
index d186a8c05..58c946234 100644
--- a/tests/testthat/test-collapseHz.R
+++ b/tests/testthat/test-collapseHz.R
@@ -69,11 +69,12 @@ test_that("collapseHz works", {
                 
   # works on SPC with filled profile (1 horizon with NA depths)
   all_na <- subsetHz(jacobs2000_gen[1,], TRUE)
-  all_na$top <- NA
-  all_na$bottom <- NA
+  all_na$top <- NA_real_
+  all_na$bottom <- NA_real_
   expect_warning(na_nonna <- c(all_na, jacobs2000_gen[2:5,]))
-  expect_silent(f <- collapseHz(all_na, by = "genhz"))
-  expect_silent(n <- collapseHz(na_nonna, by = "genhz"))
+  expect_warning(f <- collapseHz(all_na, by = "genhz"), "contain NA")
+  na_nonna$top[2] <- 19
+  expect_warning(n <- collapseHz(na_nonna, by = "genhz"), "bottom depths are shallower than top")
   expect_equal(nrow(n), 14)
 
   

From d9a567bcd28acff54de513b01716782aa0fd2e7b Mon Sep 17 00:00:00 2001
From: Beaudette <dylan.beaudette@usda.gov>
Date: Fri, 11 Oct 2024 15:04:48 -0700
Subject: [PATCH 20/22] test context

---
 tests/testthat/test-collapseHz.R | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/testthat/test-collapseHz.R b/tests/testthat/test-collapseHz.R
index 58c946234..2fbfb3e14 100644
--- a/tests/testthat/test-collapseHz.R
+++ b/tests/testthat/test-collapseHz.R
@@ -1,3 +1,5 @@
+context("collapseHz()")
+
 test_that("collapseHz works", {
   data("jacobs2000", package = "aqp")
   .BOTTOM <- NULL

From 73c848bfe95f9c8729ef126a162600c8958e9fcd Mon Sep 17 00:00:00 2001
From: Andrew Gene Brown <andrew.g.brown@usda.gov>
Date: Fri, 11 Oct 2024 15:14:21 -0700
Subject: [PATCH 21/22] Update NEWS

---
 NEWS.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index 1f8ab529d..a8ae9c372 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -3,7 +3,8 @@
  * `munsell2rgb()` now safely selects the closest Munsell value and chroma to those available in the package LUT 
  * new function `soilTextureColorPal()` for suggesting a color palette suitable for soil texture class
  * **Breaking Change**: `@sp` slot of the SoilProfileCollection object, and dependency on sp package, has been removed. 
-  * Any SoilProfileCollection objects previously written to file (.rda, .rds) with aqp <2.1.x will need to be rebuilt using `rebuildSPC()` due to changes to S4 object structure
+   * Any SoilProfileCollection objects previously written to file (.rda, .rds) with aqp <2.1.x will need to be rebuilt using `rebuildSPC()` due to changes to S4 object structure
+ * new function `collapseHz()` combines and aggregates data for adjacent horizons matching a pattern or sharing a common ID
 
 # aqp 2.0.4 (2024-07-30)
  * CRAN release

From 23b67f3a555204c5e06adc3ab2ce33e69ae90219 Mon Sep 17 00:00:00 2001
From: Beaudette <dylan.beaudette@usda.gov>
Date: Fri, 11 Oct 2024 15:34:42 -0700
Subject: [PATCH 22/22] Create collapseHz-mixMunsell-examples.R

---
 misc/sandbox/collapseHz-mixMunsell-examples.R | 94 +++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 misc/sandbox/collapseHz-mixMunsell-examples.R

diff --git a/misc/sandbox/collapseHz-mixMunsell-examples.R b/misc/sandbox/collapseHz-mixMunsell-examples.R
new file mode 100644
index 000000000..c22448a3d
--- /dev/null
+++ b/misc/sandbox/collapseHz-mixMunsell-examples.R
@@ -0,0 +1,94 @@
+library(aqp)
+
+# example data
+data("jacobs2000")
+
+# local copy, so the packaged data set is left untouched
+g <- jacobs2000
+
+# spike some horizon colors with green / blue / red hues
+g$matrix_color_munsell[4] <- '5G 4/6'
+g$matrix_color_munsell[29] <- '5B 4/6'
+g$matrix_color_munsell[36] <- '5R 4/6'
+
+# horizon correlation patterns (names are the new labels, values are regular expressions)
+# applied to horizon designation
+a_pattern <- c(`A` = "^A",
+               `E` = "E", 
+               `Bt` = "[B]+t", 
+               `Bh` = "[B]+h", 
+               `C` = "^C", 
+               `foo` = "bar")
+
+
+# NA-safe AGGFUN for Munsell colors: mix the non-NA colors of x, weighted by thickness
+mixFun <- function(x, top, bottom) {
+  # x: vector of Munsell notation; top / bottom: horizon depths, thickness is the mixing weight
+  w <- bottom - top
+
+  # index to non-NA colors
+  .idx <- which(!is.na(x))
+  .n <- length(.idx)
+
+  # all NA: nothing to mix, return a character NA so the result type is stable
+  if (.n < 1) {
+    return(NA_character_)
+  }
+
+  # a single color: return it as-is, no mixing required
+  if (.n == 1) {
+    return(x[.idx])
+  }
+
+  # mix colors by thickness, retain only the Munsell notation
+  .res <- mixMunsell(x[.idx], w[.idx], mixingMethod = 'exact')$munsell
+
+  .res
+}
+
+# collapse according to patterns; Munsell colors are aggregated with mixFun()
+m <- collapseHz(g,
+                pattern = a_pattern,
+                AGGFUN = list(
+                  matrix_color_munsell = mixFun
+                )
+)
+
+# new profile IDs so we can safely combine with source data
+profile_id(m) <- sprintf("%s-c", profile_id(m))
+
+# combine original and collapsed profiles into one collection
+z <- c(g, m)
+
+# convert Munsell colors -> sRGB in hex notation
+z$soilcolor <- parseMunsell(z$matrix_color_munsell)
+
+# plot combined collection
+par(mar = c(0, 0, 0, 3))
+plotSPC(z, color = 'soilcolor', name = 'name', name.style = 'center-center', width = 0.35, cex.names = 0.75)
+
+## start fresh
+
+# combine all horizons by profile
+# (a constant label in every horizon puts all horizons of a profile in one group)
+g <- jacobs2000
+horizons(g)$.all <- 'soil'
+collapseHz(g, by = '.all')
+
+# same grouping, but Munsell colors are aggregated with mixFun()
+m <- collapseHz(g,
+                by = '.all',
+                AGGFUN = list(
+                  matrix_color_munsell = mixFun
+                )
+)
+# new profile IDs, combine with source data, convert Munsell colors for plotting
+profile_id(m) <- sprintf("%s-c", profile_id(m))
+z <- c(g, m)
+z$soilcolor <- parseMunsell(z$matrix_color_munsell)
+
+# plot source profiles together with their collapsed versions
+par(mar = c(0, 0, 0, 3))
+plotSPC(z, color = 'soilcolor', name = 'name', name.style = 'center-center', width = 0.35, cex.names = 0.75)
+
+