Closes #884, partly #756,#1019. Implements uniqueN.

Rdatatable · Jan 25, 2015 · acc4290 · acc4290
1 parent f4a06eb
commit acc4290
Show file tree

Hide file tree

Showing 5 changed files with 29 additions and 1 deletion.
diff --git a/NAMESPACE b/NAMESPACE
@@ -25,6 +25,7 @@ export(.SD,.N,.I,.GRP,.BY,.EACHI)
 export(rleid)
 export(rleidv)
 export(as.xts.data.table)
+export(uniqueN)
 
 S3method("[", data.table)
 S3method("[<-", data.table)

diff --git a/R/duplicated.R b/R/duplicated.R
@@ -85,3 +85,13 @@ anyDuplicated.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=
     if (!length(idx)) idx=0L
     idx
 }
+
+
+# uniqueN: helper that returns the number of groups, i.e. the number of
+# distinct values of an atomic vector or distinct rows of a
+# data.frame/data.table. By data.table we really mean `.SD`, as used in a
+# grouping operation such as DT[, uniqueN(.SD), by=...].
+#
+# x : an atomic vector, a data.frame or a data.table. Any other input
+#     stops with an error.
+#
+# Returns the length of the 'starts' attribute of forderv(x, retGrp=TRUE).
+# NOTE(review): 'starts' presumably holds one start position per group of
+# the ordering -- confirm against forderv. The unique values/rows
+# themselves are never built; only that attribute is measured.
+uniqueN <- function(x) {
+    if (!is.atomic(x) && !is.data.frame(x))
+        stop("x must be an atomic vector or data.frames/data.tables")
+    length(attr(forderv(x, retGrp=TRUE), 'starts'))
+}
diff --git a/README.md b/README.md
@@ -32,6 +32,8 @@
 
   9. `rbindlist` gains `idcol` argument which can be used to generate an index column. If `idcol=TRUE`, the column is automatically named `.id`. Instead you can also provide a column name directly. If the input list has no names, indices are automatically generated. Closes [#591](https://github.com/Rdatatable/data.table/issues/591). Also thanks to @KevinUshey for filing [#356](https://github.com/Rdatatable/data.table/issues/356).
 
+  10. A new helper function `uniqueN` is now implemented. It is equivalent to `length(unique(x))` but much faster. It accepts `atomic vectors`, `data.frames` and `data.tables` as input and returns the number of unique rows. For example, `DT[, .(uN = uniqueN(.SD)), by=x]` returns the number of unique rows within each group of `x`.
+
 #### BUG FIXES
 
   1. `if (TRUE) DT[,LHS:=RHS]` no longer prints, [#869](https://github.com/Rdatatable/data.table/issues/869). Tests added. To get this to work we've had to live with one downside: if a `:=` is used inside a function with no `DT[]` before the end of the function, then the next time `DT` is typed at the prompt, nothing will be printed. A repeated `DT` will print. To avoid this: include a `DT[]` after the last `:=` in your function. If that is not possible (e.g., it's not a function you can change) then `print(DT)` and `DT[]` at the prompt are guaranteed to print. As before, adding an extra `[]` on the end of `:=` query is a recommended idiom to update and then print; e.g. `> DT[,foo:=3L][]`. Thanks to Jureiss for reporting.

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -5851,6 +5851,11 @@ test(1474.2, frank(DT, y, ties.method="dense"), frank(DT$y, ties.method="dense")
 test(1474.3, frankv(DT, "y", order=-1L, ties.method="dense"), frankv(-DT$y, ties.method="dense"))
 test(1474.4, frank(DT, -y, ties.method="dense"), frank(-DT$y, ties.method="dense"))
 
+# uniqueN, #884, part of #756 and part of #1019
+# DT has 12 rows; row 3 repeats row 1 and row 12 repeats row 10, so 10 rows
+# are unique overall. Within each group of A, .SD holds columns B and C,
+# giving 3, 4 and 3 unique rows for A = 1, 2 and 3 respectively.
+DT <- data.table(A = rep(1:3, each=4), B = rep(1:4, each=3), C = rep(1:2, 6))
+test(1475.1, uniqueN(DT), 10L)
+test(1475.2, DT[, .(uN=uniqueN(.SD)), by=A], data.table(A=1:3, uN=c(3L,4L,3L)))
+
 ##########################
 
 

diff --git a/man/duplicated.Rd b/man/duplicated.Rd
@@ -5,6 +5,7 @@
 \alias{unique.data.table}
 \alias{anyDuplicated}
 \alias{anyDuplicated.data.table}
+\alias{uniqueN}
 \title{ Determine Duplicate Rows }
 \description{
      \code{duplicated} returns a logical vector indicating which rows of a \code{data.table} (by 
@@ -14,6 +15,9 @@
      (when no key) duplicated rows by all columns removed.
 
      \code{anyDuplicated} returns the \emph{index} \code{i} of the first duplicated entry if there is one, and 0 otherwise. 
+
+     \code{uniqueN} is equivalent to \code{length(unique(x))} but much faster. It accepts \code{atomic vectors}, \code{data.frames} and \code{data.tables}. The number of unique rows is computed directly without materialising the intermediate unique data.table and is therefore memory efficient as well.
+
 }
 \usage{
 \method{duplicated}{data.table}(x, incomparables=FALSE, fromLast=FALSE, by=key(x), ...)
@@ -22,9 +26,10 @@
 
 \method{anyDuplicated}{data.table}(x, incomparables=FALSE, fromLast=FALSE, by=key(x), ...)
 
+uniqueN(x)
 }
 \arguments{
-  \item{x}{ A data.table. }
+  \item{x}{ A data.table. \code{uniqueN} accepts atomic vectors and data.frames as well.}
   \item{\dots}{ Not used at this time. }
   \item{incomparables}{ Not used. Here for S3 method consistency. }
   \item{fromLast}{ logical indicating if duplication should be considered from the reverse side, i.e., the last (or rightmost) of identical elements would correspond to \code{duplicated = FALSE}.}
@@ -87,6 +92,11 @@ unique(DT, by="B", fromLast=TRUE)
 # anyDuplicated
 anyDuplicated(DT, by=c("A", "B"))    # 3L
 any(duplicated(DT, by=c("A", "B")))  # TRUE
+
+# uniqueN, total unique rows
+uniqueN(DT)
+# uniqueN while grouped by "A"
+DT[, .(uN=uniqueN(.SD)), by=A]
 }
 \keyword{ data }