Closes #1282. merge gains 'sort' argument. Added tests.

Rdatatable · Aug 27, 2015 · 69e3411 · 69e3411
1 parent f8c9c86
commit 69e3411
Show file tree

Hide file tree

Showing 4 changed files with 22 additions and 5 deletions.
diff --git a/R/merge.R b/R/merge.R
@@ -1,5 +1,7 @@
 merge.data.table <- function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE, all.x = all,
-                             all.y = all, suffixes = c(".x", ".y"), allow.cartesian=getOption("datatable.allow.cartesian"), ...) {
+                             all.y = all, sort = TRUE, suffixes = c(".x", ".y"), allow.cartesian=getOption("datatable.allow.cartesian"), ...) {
+    if (!sort %in% c(TRUE, FALSE))
+      stop("Argument 'sort' should be logical TRUE/FALSE")
     if (!is.data.table(y)) {
         y = as.data.table(y)
         if (missing(by) && missing(by.x)) {
@@ -71,6 +73,8 @@ merge.data.table <- function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FA
     # fix for #1290, make sure by.y order is set properly before naming
     setcolorder(dt, c(by.y, setdiff(names(dt), c(by.y, newend)), newend))
     setnames(dt, c(by.x, start, end))
-    if (nrow(dt) > 0) setkeyv(dt, by.x)
+    if (nrow(dt) > 0L) {
+      setkeyv(dt, if (sort) by.x else NULL)
+    }
     dt
 }
diff --git a/README.md b/README.md
@@ -72,7 +72,7 @@
 
   25. data.tables can join now without having to set keys by using the new `on` argument. For example: `DT1[DT2, on=c(x = "y")]` would join column 'y' of `DT2` with 'x' of `DT1`. `DT1[DT2, on="y"]` would join on column 'y' on both data.tables. Closes [#1130](https://github.com/Rdatatable/data.table/issues/1130) partly.
 
-  26. `merge.data.table` gains arguments `by.x` and `by.y`. Closes [#637](https://github.com/Rdatatable/data.table/issues/637) and [#1130](https://github.com/Rdatatable/data.table/issues/1130). No copies are made even when the specified columns aren't key columns in data.tables, and therefore much more fast and memory efficient. Thanks to @blasern for the initial PRs.
+  26. `merge.data.table` gains arguments `by.x` and `by.y`. Closes [#637](https://github.com/Rdatatable/data.table/issues/637) and [#1130](https://github.com/Rdatatable/data.table/issues/1130). No copies are made even when the specified columns aren't key columns in data.tables, and it is therefore much faster and more memory efficient. Thanks to @blasern for the initial PRs. Also gains logical argument `sort` (like base R). Closes [#1282](https://github.com/Rdatatable/data.table/issues/1282).
 
   27. `fread()` gains `encoding` argument. Acceptable values are "unknown", "UTF-8" and "Latin-1" with default value of "unknown". Closes [#563](https://github.com/Rdatatable/data.table/issues/563). Thanks to @BenMarwick for the original report and to the many requests from others, and Q on SO.
 

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -6718,6 +6718,13 @@ ans1 <- suppressWarnings(merge(setDF(d2), setDF(d1), all=TRUE, by.x="B", by.y="A
 ans2 <- setDF(merge(setDT(d2), setDT(d1), all=TRUE, by.x="B", by.y="A"))
 test(1543.5, ans1, ans2)
 
+# test for sort=FALSE argument, #1282
+set.seed(1L)
+d1 <- data.table(A = sample(letters[1:10]), X = 1:10, total = TRUE)
+d2 <- data.table(A = sample(letters[5:14]), Y = 1:10, total = FALSE)
+test(1543.7, merge(setDT(d1), setDT(d2), by="A", sort=FALSE), 
+       setDT(merge(setDF(d1), setDF(d2), by="A", sort=FALSE)))
+
 # thinko in merge dupnames handling
 dt1 = data.table(x=1:5, y1=2L, y2=3L)
 dt2 = data.table(a=4:6, y2=TRUE, y1 = FALSE)

diff --git a/man/merge.Rd b/man/merge.Rd
@@ -15,7 +15,7 @@
 
 \usage{
 \method{merge}{data.table}(x, y, by = NULL, by.x = NULL, by.y = NULL,
-all = FALSE, all.x = all, all.y = all, suffixes = c(".x", ".y"),
+all = FALSE, all.x = all, all.y = all, sort = TRUE, suffixes = c(".x", ".y"),
 allow.cartesian=getOption("datatable.allow.cartesian"),  # default FALSE
 ...)
 }
@@ -53,7 +53,13 @@ allow.cartesian=getOption("datatable.allow.cartesian"),  # default FALSE
   \item{all.y}{
     logical; analogous to \code{all.x} above.
   }
-
+  
+  \item{sort}{
+    logical. If \code{TRUE} (default), the merged \code{data.table} is sorted 
+    by setting the key to the \code{by / by.x} columns. If \code{FALSE}, the 
+    result is not sorted.
+  }
+  
   \item{suffixes}{
     A \code{character(2)} specifying the suffixes to be used for making
     non-\code{by} column names unique. The suffix behavior works in a similar