From 960ff28b76387ee152d48b4bfc27198f53f070d3 Mon Sep 17 00:00:00 2001
From: Michael Chirico <chiricom@google.com>
Date: Tue, 24 Sep 2024 13:52:41 -0400
Subject: [PATCH] Refactor internal helpers which generate fragmented messages
 (#6534)

* Refacator internal helpers which generate fragmented messages

* Update test error strings

* deprecate internal group.desc argument
---
 R/fmelt.R             | 35 ++++++++++++++---------------------
 R/utils.R             | 10 ++++++++--
 inst/tests/tests.Rraw | 10 +++++-----
 man/measure.Rd        |  4 +---
 4 files changed, 28 insertions(+), 31 deletions(-)

diff --git a/R/fmelt.R b/R/fmelt.R
index 3f924c2e4..55a6fb0ad 100644
--- a/R/fmelt.R
+++ b/R/fmelt.R
@@ -64,14 +64,11 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") {
     }
     fun.list[[fun.i]] = fun
   }
-  measurev.args = c(
-    list(fun.list),
-    L[formal.i.vec],
-    list(group.desc="... arguments to measure"))
+  measurev.args = c(list(fun.list), L[formal.i.vec])
   do.call(measurev, measurev.args)
 }
 
-measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.name", group.desc="elements of fun.list"){
+measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.name"){
   # 1. basic error checking.
   if (!missing(sep) && !missing(pattern)) {
     stopf("both sep and pattern arguments used; must use either sep or pattern (not both)")
@@ -88,21 +85,11 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na
     which(!nzchar(names(fun.list)))
   }
   if (length(prob.i)) {
-    stopf("in measurev, %s must be named, problems: %s", group.desc, brackify(prob.i))
+    stopf("in measurev, elements of fun.list must be named, problems: %s", brackify(prob.i))
   }
-  err.names.unique = function(err.what, name.vec) {
-    name.tab = table(name.vec)
-    bad.counts = name.tab[1 < name.tab]
-    if (length(bad.counts)) {
-      stopf("%s should be uniquely named, problems: %s", err.what, brackify(names(bad.counts)))
-    }
-  }
-  err.args.groups = function(type, N){
-    if (N != length(fun.list)) {
-      stopf("number of %s =%d must be same as %s =%d", group.desc, length(fun.list), type, N)
-    }
+  if (length(dup.funs <- duplicated_values(names(fun.list)))) {
+    stopf("elements of fun.list should be uniquely named, problems: %s", brackify(dup.funs))
   }
-  err.names.unique(group.desc, names(fun.list))
   # 2. compute initial group data table, used as variable_table attribute.
   group.mat = if (!missing(pattern)) {
     if (!is.character(pattern)) {
@@ -117,7 +104,9 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na
     if (is.null(start)) {
       stopf("pattern must contain at least one capture group (parenthesized sub-pattern)")
     }
-    err.args.groups("number of capture groups in pattern", ncol(start))
+    if (ncol(start) != length(fun.list)) {
+      stopf("number of elements of fun.list (%d) must be the same as the number of capture groups in pattern (%d)", length(fun.list), ncol(start))
+    }
     end = attr(match.vec, "capture.length")[measure.vec.i,]+start-1L
     measure.vec <- cols[measure.vec.i]
     names.mat = matrix(measure.vec, nrow(start), ncol(start))
@@ -132,12 +121,16 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na
     if (n.groups == 1) {
       stopf("each column name results in only one item after splitting using sep, which means that all columns would be melted; to fix please either specify melt on all columns directly without using measure, or use a different sep/pattern specification")
     }
-    err.args.groups("max number of items after splitting column names", n.groups)
+    if (n.groups != length(fun.list)) {
+      stopf("number of elements of fun.list (%d) must be the same as the max number of items after splitting column names (%d)", length(fun.list), n.groups)
+    }
     measure.vec.i = which(vector.lengths==n.groups)
     measure.vec = cols[measure.vec.i]
     do.call(rbind, list.of.vectors[measure.vec.i])
   }
-  err.names.unique("measured columns", measure.vec)
+  if (length(dup.measures <- duplicated_values(measure.vec))) {
+    stopf("measured columns should be uniquely named, problems: %s", brackify(dup.measures))
+  }
   uniq.mat = unique(group.mat)
   if (nrow(uniq.mat) < nrow(group.mat)) {
     stopf("number of unique column IDs =%d is less than number of melted columns =%d; fix by changing pattern/sep", nrow(uniq.mat), nrow(group.mat))
diff --git a/R/utils.R b/R/utils.R
index 2f91491be..4a1aadd43 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -34,11 +34,17 @@ check_duplicate_names = function(x, table_name=deparse(substitute(x))) {
   if (!anyDuplicated(nm <- names(x))) return(invisible())
   duplicate_names = unique(nm[duplicated(nm)])
   stopf(ngettext(length(duplicate_names),
-                  "%s has duplicated column name %s. Please remove or rename the duplicate and try again.",
-                  "%s has duplicated column names %s. Please remove or rename the duplicates and try again."),
+                 "%s has duplicated column name %s. Please remove or rename the duplicate and try again.",
+                 "%s has duplicated column names %s. Please remove or rename the duplicates and try again."),
         table_name, brackify(duplicate_names), domain=NA)
 }
 
+duplicated_values = function(x) {
+  # fast anyDuplicated for the typical/non-error case; second duplicated() pass for (usually) error case
+  if (!anyDuplicated(x)) return(vector(typeof(x)))
+  unique(x[duplicated(x)])
+}
+
 # TODO(R>=4.0.0): Remove this workaround. From R 4.0.0, rep_len() dispatches rep.Date(), which we need.
 #   Before that, rep_len() strips attributes --> breaks data.table()'s internal recycle() helper.
 #   This also impacts test 2 in S4.Rraw, because the error message differs for rep.int() vs. rep_len().
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 2b3958acd..322530139 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -17301,11 +17301,11 @@ test(2183.00020, melt(iris.dt, measure.vars=measurev(value.name, dim, sep=".", p
 test(2183.000201, melt(iris.dt, measure.vars=measurev(list(NULL, dim=NULL), sep=".")), error="in measurev, elements of fun.list must be named, problems: [1]")
 test(2183.000202, melt(iris.dt, measure.vars=measurev(list(NULL, NULL), sep=".")), error="in measurev, elements of fun.list must be named, problems: [1, 2]")
 test(2183.00027, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, dim="bar"), sep=".")), error="in the measurev fun.list, each non-NULL element must be a function with at least one argument, problem: dim")
-test(2183.00028, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, dim=NULL, baz=NULL), sep=".")), error="number of elements of fun.list =3 must be same as max number of items after splitting column names =2")
+test(2183.00028, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, dim=NULL, baz=NULL), sep=".")), error="number of elements of fun.list (3) must be the same as the max number of items after splitting column names (2)")
 test(2183.00042, melt(DTid, measure.vars=measurev(list(value.name=NULL, istr=function()1), pattern="([ab])([12])")), error="in the measurev fun.list, each non-NULL element must be a function with at least one argument, problem: istr")
 test(2183.00043, melt(DTid, measure.vars=measurev(list(value.name=NULL, istr=interactive), pattern="([ab])([12])")), error="in the measurev fun.list, each non-NULL element must be a function with at least one argument, problem: istr")
 test(2183.00044, melt(DTid, measure.vars=measurev(list(value.name=NULL, istr=function(x)1), pattern="([ab])([12])")), error="each conversion function must return an atomic vector with same length as its first argument, problem: istr")
-test(2183.00045, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, dim=NULL, baz=NULL), pattern="(.*)[.](.*)")), error="number of elements of fun.list =3 must be same as number of capture groups in pattern =2")
+test(2183.00045, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, dim=NULL, baz=NULL), pattern="(.*)[.](.*)")), error="number of elements of fun.list (3) must be the same as the number of capture groups in pattern (2)")
 test(2183.00048, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, value.name=NULL), sep=".")), error="elements of fun.list should be uniquely named, problems: [value.name]")
 # measure with factor conversion.
 myfac = function(x)factor(x)#user-defined conversion function.
@@ -17348,7 +17348,7 @@ test(2183.24, names(melt(iris.dt, measure.vars=measure(value.name, dim, sep=".")
 test(2183.25, names(melt(iris.dt, measure.vars=measure(part, value.name, sep="."))), c("Species", "part", "Length", "Width"))
 test(2183.26, names(melt(iris.dt, measure.vars=measure(part, dim, sep="."))), c("Species", "part", "dim", "value"))
 test(2183.27, melt(iris.dt, measure.vars=measure(value.name, dim="bar", sep=".")), error="each ... argument to measure must be a function with at least one argument, problem: dim")
-test(2183.28, melt(iris.dt, measure.vars=measure(value.name, dim, baz, sep=".")), error="number of ... arguments to measure =3 must be same as max number of items after splitting column names =2")
+test(2183.28, melt(iris.dt, measure.vars=measure(value.name, dim, baz, sep=".")), error="number of elements of fun.list (3) must be the same as the max number of items after splitting column names (2)")
 test(2183.29, melt(iris.dt, measure.vars=measure()), error="each column name results in only one item after splitting using sep, which means that all columns would be melted; to fix please either specify melt on all columns directly without using measure, or use a different sep/pattern specification")
 # patterns with iris data.
 test(2183.40, names(melt(iris.dt, measure.vars=patterns("[.]"))), c("Species", "variable", "value"))
@@ -17357,10 +17357,10 @@ test(2183.41, melt(DTid, measure.vars=measure(value.name, istr="bar", pattern="(
 test(2183.42, melt(DTid, measure.vars=measure(value.name, istr=function()1, pattern="([ab])([12])")), error="each ... argument to measure must be a function with at least one argument, problem: istr")
 test(2183.43, melt(DTid, measure.vars=measure(value.name, istr=interactive, pattern="([ab])([12])")), error="each ... argument to measure must be a function with at least one argument, problem: istr")
 test(2183.44, melt(DTid, measure.vars=measure(value.name, istr=function(x)1, pattern="([ab])([12])")), error="each conversion function must return an atomic vector with same length as its first argument, problem: istr")
-test(2183.45, melt(iris.dt, measure.vars=measure(value.name, dim, baz, pattern="(.*)[.](.*)")), error="number of ... arguments to measure =3 must be same as number of capture groups in pattern =2")
+test(2183.45, melt(iris.dt, measure.vars=measure(value.name, dim, baz, pattern="(.*)[.](.*)")), error="number of elements of fun.list (3) must be the same as the number of capture groups in pattern (2)")
 test(2183.46, melt(iris.dt, measure.vars=measure(function(x)factor(x), dim, pattern="(.*)[.](.*)")), error="each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: [1]")
 test(2183.47, melt(iris.dt, measure.vars=measure(function(x)factor(x), pattern="(.*)[.](.*)")), error="each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: [1]")
-test(2183.48, melt(iris.dt, measure.vars=measure(value.name, value.name, sep=".")), error="... arguments to measure should be uniquely named, problems: [value.name]")
+test(2183.48, melt(iris.dt, measure.vars=measure(value.name, value.name, sep=".")), error="elements of fun.list should be uniquely named, problems: [value.name]")
 # measure with factor conversion.
 myfac = function(x)factor(x)#user-defined conversion function.
 test(2183.60, melt(DTid, measure.vars=measure(letter=myfac, value.name, pattern="([ab])([12])")), data.table(id=1, letter=factor(c("a","b")), "2"=c(2,2), "1"=c(NA,1)))
diff --git a/man/measure.Rd b/man/measure.Rd
index 73a315e00..b7933e17e 100644
--- a/man/measure.Rd
+++ b/man/measure.Rd
@@ -18,8 +18,7 @@
 }
 \usage{
 measure(\dots, sep, pattern, cols, multiple.keyword="value.name")
-measurev(fun.list, sep, pattern, cols, multiple.keyword="value.name",
-  group.desc="elements of fun.list")
+measurev(fun.list, sep, pattern, cols, multiple.keyword="value.name")
 }
 \arguments{
   \item{\dots}{One or more (1) symbols (without argument name; symbol
@@ -44,7 +43,6 @@ measurev(fun.list, sep, pattern, cols, multiple.keyword="value.name",
     value columns (with names defined by the unique values in that
     group). Otherwise if the string not used as a group name, then
     measure returns a vector and melt returns a single value column.}
-  \item{group.desc}{Internal, used in error messages.}
 }
 \seealso{
   \code{\link{melt}},