From 960ff28b76387ee152d48b4bfc27198f53f070d3 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 24 Sep 2024 13:52:41 -0400 Subject: [PATCH] Refactor internal helpers which generate fragmented messages (#6534) * Refacator internal helpers which generate fragmented messages * Update test error strings * deprecate internal group.desc argument --- R/fmelt.R | 35 ++++++++++++++--------------------- R/utils.R | 10 ++++++++-- inst/tests/tests.Rraw | 10 +++++----- man/measure.Rd | 4 +--- 4 files changed, 28 insertions(+), 31 deletions(-) diff --git a/R/fmelt.R b/R/fmelt.R index 3f924c2e4..55a6fb0ad 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -64,14 +64,11 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { } fun.list[[fun.i]] = fun } - measurev.args = c( - list(fun.list), - L[formal.i.vec], - list(group.desc="... arguments to measure")) + measurev.args = c(list(fun.list), L[formal.i.vec]) do.call(measurev, measurev.args) } -measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.name", group.desc="elements of fun.list"){ +measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.name"){ # 1. basic error checking. if (!missing(sep) && !missing(pattern)) { stopf("both sep and pattern arguments used; must use either sep or pattern (not both)") @@ -88,21 +85,11 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na which(!nzchar(names(fun.list))) } if (length(prob.i)) { - stopf("in measurev, %s must be named, problems: %s", group.desc, brackify(prob.i)) + stopf("in measurev, elements of fun.list must be named, problems: %s", brackify(prob.i)) } - err.names.unique = function(err.what, name.vec) { - name.tab = table(name.vec) - bad.counts = name.tab[1 < name.tab] - if (length(bad.counts)) { - stopf("%s should be uniquely named, problems: %s", err.what, brackify(names(bad.counts))) - } - } - err.args.groups = function(type, N){ - if (N != length(fun.list)) { - stopf("number of %s =%d must be same as %s =%d", group.desc, length(fun.list), type, N) - } + if (length(dup.funs <- duplicated_values(names(fun.list)))) { + stopf("elements of fun.list should be uniquely named, problems: %s", brackify(dup.funs)) } - err.names.unique(group.desc, names(fun.list)) # 2. compute initial group data table, used as variable_table attribute. group.mat = if (!missing(pattern)) { if (!is.character(pattern)) { @@ -117,7 +104,9 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na if (is.null(start)) { stopf("pattern must contain at least one capture group (parenthesized sub-pattern)") } - err.args.groups("number of capture groups in pattern", ncol(start)) + if (ncol(start) != length(fun.list)) { + stopf("number of elements of fun.list (%d) must be the same as the number of capture groups in pattern (%d)", length(fun.list), ncol(start)) + } end = attr(match.vec, "capture.length")[measure.vec.i,]+start-1L measure.vec <- cols[measure.vec.i] names.mat = matrix(measure.vec, nrow(start), ncol(start)) @@ -132,12 +121,16 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na if (n.groups == 1) { stopf("each column name results in only one item after splitting using sep, which means that all columns would be melted; to fix please either specify melt on all columns directly without using measure, or use a different sep/pattern specification") } - err.args.groups("max number of items after splitting column names", n.groups) + if (n.groups != length(fun.list)) { + stopf("number of elements of fun.list (%d) must be the same as the max number of items after splitting column names (%d)", length(fun.list), n.groups) + } measure.vec.i = which(vector.lengths==n.groups) measure.vec = cols[measure.vec.i] do.call(rbind, list.of.vectors[measure.vec.i]) } - err.names.unique("measured columns", measure.vec) + if (length(dup.measures <- duplicated_values(measure.vec))) { + stopf("measured columns should be uniquely named, problems: %s", brackify(dup.measures)) + } uniq.mat = unique(group.mat) if (nrow(uniq.mat) < nrow(group.mat)) { stopf("number of unique column IDs =%d is less than number of melted columns =%d; fix by changing pattern/sep", nrow(uniq.mat), nrow(group.mat)) diff --git a/R/utils.R b/R/utils.R index 2f91491be..4a1aadd43 100644 --- a/R/utils.R +++ b/R/utils.R @@ -34,11 +34,17 @@ check_duplicate_names = function(x, table_name=deparse(substitute(x))) { if (!anyDuplicated(nm <- names(x))) return(invisible()) duplicate_names = unique(nm[duplicated(nm)]) stopf(ngettext(length(duplicate_names), - "%s has duplicated column name %s. Please remove or rename the duplicate and try again.", - "%s has duplicated column names %s. Please remove or rename the duplicates and try again."), + "%s has duplicated column name %s. Please remove or rename the duplicate and try again.", + "%s has duplicated column names %s. Please remove or rename the duplicates and try again."), table_name, brackify(duplicate_names), domain=NA) } +duplicated_values = function(x) { + # fast anyDuplicated for the typical/non-error case; second duplicated() pass for (usually) error case + if (!anyDuplicated(x)) return(vector(typeof(x))) + unique(x[duplicated(x)]) +} + # TODO(R>=4.0.0): Remove this workaround. From R 4.0.0, rep_len() dispatches rep.Date(), which we need. # Before that, rep_len() strips attributes --> breaks data.table()'s internal recycle() helper. # This also impacts test 2 in S4.Rraw, because the error message differs for rep.int() vs. rep_len(). diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 2b3958acd..322530139 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17301,11 +17301,11 @@ test(2183.00020, melt(iris.dt, measure.vars=measurev(value.name, dim, sep=".", p test(2183.000201, melt(iris.dt, measure.vars=measurev(list(NULL, dim=NULL), sep=".")), error="in measurev, elements of fun.list must be named, problems: [1]") test(2183.000202, melt(iris.dt, measure.vars=measurev(list(NULL, NULL), sep=".")), error="in measurev, elements of fun.list must be named, problems: [1, 2]") test(2183.00027, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, dim="bar"), sep=".")), error="in the measurev fun.list, each non-NULL element must be a function with at least one argument, problem: dim") -test(2183.00028, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, dim=NULL, baz=NULL), sep=".")), error="number of elements of fun.list =3 must be same as max number of items after splitting column names =2") +test(2183.00028, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, dim=NULL, baz=NULL), sep=".")), error="number of elements of fun.list (3) must be the same as the max number of items after splitting column names (2)") test(2183.00042, melt(DTid, measure.vars=measurev(list(value.name=NULL, istr=function()1), pattern="([ab])([12])")), error="in the measurev fun.list, each non-NULL element must be a function with at least one argument, problem: istr") test(2183.00043, melt(DTid, measure.vars=measurev(list(value.name=NULL, istr=interactive), pattern="([ab])([12])")), error="in the measurev fun.list, each non-NULL element must be a function with at least one argument, problem: istr") test(2183.00044, melt(DTid, measure.vars=measurev(list(value.name=NULL, istr=function(x)1), pattern="([ab])([12])")), error="each conversion function must return an atomic vector with same length as its first argument, problem: istr") -test(2183.00045, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, dim=NULL, baz=NULL), pattern="(.*)[.](.*)")), error="number of elements of fun.list =3 must be same as number of capture groups in pattern =2") +test(2183.00045, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, dim=NULL, baz=NULL), pattern="(.*)[.](.*)")), error="number of elements of fun.list (3) must be the same as the number of capture groups in pattern (2)") test(2183.00048, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, value.name=NULL), sep=".")), error="elements of fun.list should be uniquely named, problems: [value.name]") # measure with factor conversion. myfac = function(x)factor(x)#user-defined conversion function. @@ -17348,7 +17348,7 @@ test(2183.24, names(melt(iris.dt, measure.vars=measure(value.name, dim, sep=".") test(2183.25, names(melt(iris.dt, measure.vars=measure(part, value.name, sep="."))), c("Species", "part", "Length", "Width")) test(2183.26, names(melt(iris.dt, measure.vars=measure(part, dim, sep="."))), c("Species", "part", "dim", "value")) test(2183.27, melt(iris.dt, measure.vars=measure(value.name, dim="bar", sep=".")), error="each ... argument to measure must be a function with at least one argument, problem: dim") -test(2183.28, melt(iris.dt, measure.vars=measure(value.name, dim, baz, sep=".")), error="number of ... arguments to measure =3 must be same as max number of items after splitting column names =2") +test(2183.28, melt(iris.dt, measure.vars=measure(value.name, dim, baz, sep=".")), error="number of elements of fun.list (3) must be the same as the max number of items after splitting column names (2)") test(2183.29, melt(iris.dt, measure.vars=measure()), error="each column name results in only one item after splitting using sep, which means that all columns would be melted; to fix please either specify melt on all columns directly without using measure, or use a different sep/pattern specification") # patterns with iris data. test(2183.40, names(melt(iris.dt, measure.vars=patterns("[.]"))), c("Species", "variable", "value")) @@ -17357,10 +17357,10 @@ test(2183.41, melt(DTid, measure.vars=measure(value.name, istr="bar", pattern="( test(2183.42, melt(DTid, measure.vars=measure(value.name, istr=function()1, pattern="([ab])([12])")), error="each ... argument to measure must be a function with at least one argument, problem: istr") test(2183.43, melt(DTid, measure.vars=measure(value.name, istr=interactive, pattern="([ab])([12])")), error="each ... argument to measure must be a function with at least one argument, problem: istr") test(2183.44, melt(DTid, measure.vars=measure(value.name, istr=function(x)1, pattern="([ab])([12])")), error="each conversion function must return an atomic vector with same length as its first argument, problem: istr") -test(2183.45, melt(iris.dt, measure.vars=measure(value.name, dim, baz, pattern="(.*)[.](.*)")), error="number of ... arguments to measure =3 must be same as number of capture groups in pattern =2") +test(2183.45, melt(iris.dt, measure.vars=measure(value.name, dim, baz, pattern="(.*)[.](.*)")), error="number of elements of fun.list (3) must be the same as the number of capture groups in pattern (2)") test(2183.46, melt(iris.dt, measure.vars=measure(function(x)factor(x), dim, pattern="(.*)[.](.*)")), error="each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: [1]") test(2183.47, melt(iris.dt, measure.vars=measure(function(x)factor(x), pattern="(.*)[.](.*)")), error="each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: [1]") -test(2183.48, melt(iris.dt, measure.vars=measure(value.name, value.name, sep=".")), error="... arguments to measure should be uniquely named, problems: [value.name]") +test(2183.48, melt(iris.dt, measure.vars=measure(value.name, value.name, sep=".")), error="elements of fun.list should be uniquely named, problems: [value.name]") # measure with factor conversion. myfac = function(x)factor(x)#user-defined conversion function. test(2183.60, melt(DTid, measure.vars=measure(letter=myfac, value.name, pattern="([ab])([12])")), data.table(id=1, letter=factor(c("a","b")), "2"=c(2,2), "1"=c(NA,1))) diff --git a/man/measure.Rd b/man/measure.Rd index 73a315e00..b7933e17e 100644 --- a/man/measure.Rd +++ b/man/measure.Rd @@ -18,8 +18,7 @@ } \usage{ measure(\dots, sep, pattern, cols, multiple.keyword="value.name") -measurev(fun.list, sep, pattern, cols, multiple.keyword="value.name", - group.desc="elements of fun.list") +measurev(fun.list, sep, pattern, cols, multiple.keyword="value.name") } \arguments{ \item{\dots}{One or more (1) symbols (without argument name; symbol @@ -44,7 +43,6 @@ measurev(fun.list, sep, pattern, cols, multiple.keyword="value.name", value columns (with names defined by the unique values in that group). Otherwise if the string not used as a group name, then measure returns a vector and melt returns a single value column.} - \item{group.desc}{Internal, used in error messages.} } \seealso{ \code{\link{melt}},