diff --git a/R/duplicated.R b/R/duplicated.R index efb656f74..3cbbc9ab8 100644 --- a/R/duplicated.R +++ b/R/duplicated.R @@ -90,8 +90,10 @@ anyDuplicated.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by= # simple straightforward helper function to get the number # of groups in a vector or data.table. Here by data.table, # we really mean `.SD` - used in a grouping operation -uniqueN <- function(x) { +uniqueN <- function(x, by = if (is.data.table(x)) key(x) else NULL) { if (!is.atomic(x) && !is.data.frame(x)) stop("x must be an atomic vector or data.frames/data.tables") - length(attr(forderv(x, retGrp=TRUE), 'starts')) + if (is.atomic(x)) x = as_list(x) + if (is.null(by)) by = seq_along(x) + length(attr(forderv(x, by=by, retGrp=TRUE), 'starts')) } diff --git a/README.md b/README.md index 349e8cdc8..ddac3336c 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,7 @@ 9. `rbindlist` gains `idcol` argument which can be used to generate an index column. If `idcol=TRUE`, the column is automatically named `.id`. Instead you can also provide a column name directly. If the input list has no names, indices are automatically generated. Closes [#591](https://github.com/Rdatatable/data.table/issues/591). Also thanks to @KevinUshey for filing [#356](https://github.com/Rdatatable/data.table/issues/356). 10. A new helper function `uniqueN` is now implemented. It is equivalent to `length(unique(x))` but much faster. It accepts `atomic vectors`, `data.frames` and `data.tables` as input and returns the number of unique rows. For example, `DT[, .(uN = uniqueN(.SD)), by=x]` returns the number of unique rows within each group of `x`. Thanks to @DavidArenburg as well for the FR. + * `uniqueN` gains a `by` argument which defaults to `key(x)` when `x` is a `data.table` so that the behaviour is identical to `duplicated()` and `unique` methods for `data.table`. Thanks to @kevinmistry for the report. Closes [#1080](https://github.com/Rdatatable/data.table/issues/1080). 11. 
Implemented `transpose()` to transpose a list and `tstrsplit` which is a wrapper for `transpose(strsplit(...))`. This is particularly useful in scenarios where a column has to be split and the resulting list has to be assigned to multiple columns. See `?transpose` and `?tstrsplit`, [#1025](https://github.com/Rdatatable/data.table/issues/1025) and [#1026](https://github.com/Rdatatable/data.table/issues/1026) for usage scenarios. Closes both #1025 and #1026 issues. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 07eab385f..cbc252026 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6203,6 +6203,12 @@ test(1502.1, dt1["a", z := NULL], error="When deleting columns, i should not be # this shouldn't segfault on 'dt1[...]' test(1502.2, dt1["a", z := 42L], dt2["a", z := 42L]) +# fix for #1080 +dt = data.table(col1 = c(1,2,3,2,5,3,2), col2 = c(0,9,8,9,6,5,4), key=c("col1")) +test(1503.1, uniqueN(dt), 4L) # default on key columns +test(1503.2, uniqueN(dt, by=NULL), 6L) # on all columns +test(1503.3, uniqueN(dt$col1), 4L) # on just that column + ########################## diff --git a/man/duplicated.Rd b/man/duplicated.Rd index d9ffac73e..e485c9960 100644 --- a/man/duplicated.Rd +++ b/man/duplicated.Rd @@ -26,7 +26,7 @@ \method{anyDuplicated}{data.table}(x, incomparables=FALSE, fromLast=FALSE, by=key(x), ...) -uniqueN(x) +uniqueN(x, by=if (is.data.table(x)) key(x) else NULL) } \arguments{ \item{x}{ A data.table. \code{uniqueN} accepts atomic vectors and data.frames as well.} @@ -58,6 +58,8 @@ uniqueN(x) \code{anyDuplicated} returns a integer value with the index of first duplicate. If none exists, 0L is returned. + \code{uniqueN} returns the number of unique elements in the vector, \code{data.frame} or \code{data.table}. 
+ } \seealso{ \code{\link{setNumericRounding}}, \code{\link{data.table}}, \code{\link{duplicated}}, \code{\link{unique}}, \code{\link{all.equal}} } \examples{ @@ -93,8 +95,10 @@ unique(DT, by="B", fromLast=TRUE) anyDuplicated(DT, by=c("A", "B")) # 3L any(duplicated(DT, by=c("A", "B"))) # TRUE -# uniqueN, total unique rows +# uniqueN, unique rows on key columns uniqueN(DT) +# uniqueN, unique rows on all columns +uniqueN(DT, by=NULL) # uniqueN while grouped by "A" DT[, .(uN=uniqueN(.SD)), by=A] }