Closes #1080. 'uniqueN' gains 'by' argument.

Rdatatable · Mar 16, 2015 · 9ef6a25 · 9ef6a25
1 parent b50163e
commit 9ef6a25
Show file tree

Hide file tree

Showing 4 changed files with 17 additions and 4 deletions.
diff --git a/R/duplicated.R b/R/duplicated.R
@@ -90,8 +90,10 @@ anyDuplicated.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=
 # simple straightforward helper function to get the number 
 # of groups in a vector or data.table. Here by data.table, 
 # we really mean `.SD` - used in a grouping operation
-uniqueN <- function(x) {
+uniqueN <- function(x, by = if (is.data.table(x)) key(x) else NULL) {
     if (!is.atomic(x) && !is.data.frame(x))
         stop("x must be an atomic vector or data.frames/data.tables")
-    length(attr(forderv(x, retGrp=TRUE), 'starts'))
+    if (is.atomic(x)) x = as_list(x)
+    if (is.null(by)) by = seq_along(x)
+    length(attr(forderv(x, by=by, retGrp=TRUE), 'starts'))
 }
diff --git a/README.md b/README.md
@@ -33,6 +33,7 @@
   9. `rbindlist` gains `idcol` argument which can be used to generate an index column. If `idcol=TRUE`, the column is automatically named `.id`. Instead you can also provide a column name directly. If the input list has no names, indices are automatically generated. Closes [#591](https://github.com/Rdatatable/data.table/issues/591). Also thanks to @KevinUshey for filing [#356](https://github.com/Rdatatable/data.table/issues/356).
 
   10. A new helper function `uniqueN` is now implemented. It is equivalent to `length(unique(x))` but much faster. It accepts `atomic vectors`, `data.frames` and `data.tables` as input and returns the number of unique rows. For example, `DT[, .(uN = uniqueN(.SD)), by=x]` returns the number of unique rows within each group of `x`. Thanks to @DavidArenburg as well for the FR.
+    * `uniqueN` gains a `by` argument which is equal to `key(x)` when `x` is a `data.table` so that the behaviour is identical to `duplicated()` and `unique` methods for `data.table`. Thanks to @kevinmistry for the report. Closes [#1080](https://github.com/Rdatatable/data.table/issues/1080).
 
   11. Implemented `transpose()` to transpose a list and `tstrsplit` which is a wrapper for `transpose(strsplit(...))`. This is particularly useful in scenarios where a column has to be split and the resulting list has to be assigned to multiple columns. See `?transpose` and `?tstrsplit`, [#1025](https://github.com/Rdatatable/data.table/issues/1025) and [#1026](https://github.com/Rdatatable/data.table/issues/1026) for usage scenarios. Closes both #1025 and #1026 issues.
 

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -6203,6 +6203,12 @@ test(1502.1, dt1["a", z := NULL], error="When deleting columns, i should not be
 # this shouldn't segfault on 'dt1[...]'
 test(1502.2, dt1["a", z := 42L], dt2["a", z := 42L])
 
+# fix for #1080
+dt = data.table(col1 = c(1,2,3,2,5,3,2), col2 = c(0,9,8,9,6,5,4), key=c("col1"))
+test(1503.1, uniqueN(dt), 4L) # default on key columns
+test(1503.2, uniqueN(dt, by=NULL), 6L) # on all columns
+test(1503.3, uniqueN(dt$col1), 4L) # on just that column
+
 ##########################
 
 

diff --git a/man/duplicated.Rd b/man/duplicated.Rd
@@ -26,7 +26,7 @@
 
 \method{anyDuplicated}{data.table}(x, incomparables=FALSE, fromLast=FALSE, by=key(x), ...)
 
-uniqueN(x)
+uniqueN(x, by=if (is.data.table(x)) key(x) else NULL)
 }
 \arguments{
   \item{x}{ A data.table. \code{uniqueN} accepts atomic vectors and data.frames as well.}
@@ -58,6 +58,8 @@ uniqueN(x)
 
      \code{anyDuplicated} returns an integer value with the index of first duplicate. If none exists, 0L is returned.
 
+     \code{uniqueN} returns the number of unique elements in the vector, \code{data.frame} or \code{data.table}.
+
 }
 \seealso{ \code{\link{setNumericRounding}}, \code{\link{data.table}}, \code{\link{duplicated}}, \code{\link{unique}}, \code{\link{all.equal}} }
 \examples{
@@ -93,8 +95,10 @@ unique(DT, by="B", fromLast=TRUE)
 anyDuplicated(DT, by=c("A", "B"))    # 3L
 any(duplicated(DT, by=c("A", "B")))  # TRUE
 
-# uniqueN, total unique rows
+# uniqueN, unique rows on key columns
 uniqueN(DT)
+# uniqueN, unique rows on all columns
+uniqueN(DT, by=NULL)
 # uniqueN while grouped by "A"
 DT[, .(uN=uniqueN(.SD)), by=A]
 }