Update group_by() algorithm to utilize vec_locate_sorted_groups() (…

…#6297) * Use `vec_locate_sorted_groups()` in `group_by()` * Update the `group_by()` documentation with a section on ordering * NEWS bullet * Don't mention that this is an implementation detail * Simplify code with `%||%` * Refine existing locale checker to work with `LC_COLLATE` * Switch to `dplyr.legacy_locale`
tidyverse · Jul 11, 2022 · 5e27ef1 · 5e27ef1
1 parent 971b73f
commit 5e27ef1
Show file tree

Hide file tree

Showing 10 changed files with 161 additions and 17 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,17 @@
 # dplyr (development version)
 
+* `group_by()` now uses a new algorithm for computing groups. It is often faster
+  than the previous approach (especially when there are many groups), and in
+  most cases there should be no changes. The exception is with character vector
+  group columns, which are now internally ordered in the C locale rather than
+  the system locale and may result in differently ordered results when you
+  follow up a `group_by()` with functions that use the group data, such as
+  `summarise()` or `group_split()` (#4406, #6297).
+
+  See the Ordering section of `?group_by()` for more information. For a full
+  explanation of this change, refer to this
+  [tidyup](https://github.com/tidyverse/tidyups/blob/main/006-dplyr-group-by-ordering.md).
+
 * `if_else()` has been rewritten to utilize vctrs. This comes with most of the
   same benefits as the `case_when()` rewrite. In particular, `if_else()` now
   takes the common type of `true`, `false`, and `missing` when determining what

diff --git a/R/all-equal.r b/R/all-equal.r
@@ -63,8 +63,8 @@ equal_data_frame <- function(x, y, ignore_col_order = TRUE, ignore_row_order = T
     y <- as_tibble(y, .name_repair = "universal")
   # })
 
-  x_split <- vec_split_id_order(x)
-  y_split <- vec_split_id_order(y[, names(x), drop = FALSE])
+  x_split <- dplyr_locate_sorted_groups(x)
+  y_split <- dplyr_locate_sorted_groups(y[, names(x), drop = FALSE])
 
   # keys must be identical
   msg <- ""

diff --git a/R/group-by.r b/R/group-by.r
@@ -35,6 +35,35 @@
 #'
 #' * `group_by()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("group_by")}.
 #' * `ungroup()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("ungroup")}.
+#'
+#' @section Ordering:
+#' Currently, `group_by()` internally orders the groups in ascending order. This
+#' results in ordered output from functions that aggregate groups, such as
+#' [summarise()].
+#'
+#' When used as grouping columns, character vectors are ordered in the C locale
+#' for performance and reproducibility across R sessions. If the resulting
+#' ordering of your grouped operation matters and is dependent on the locale,
+#' you should follow up the grouped operation with an explicit call to
+#' [arrange()] and set the `.locale` argument. For example:
+#'
+#' ```
+#' data %>%
+#'   group_by(chr) %>%
+#'   summarise(avg = mean(x)) %>%
+#'   arrange(chr, .locale = "en")
+#' ```
+#'
+#' This is often useful as a preliminary step before generating content intended
+#' for humans, such as an HTML table.
+#'
+#' Prior to dplyr 1.1.0, character vector grouping columns were ordered in the
+#' system locale. If you need to temporarily revert to this behavior, you can
+#' set the global option `dplyr.legacy_locale` to `TRUE`, but this should be
+#' used sparingly and you should expect this option to be removed in a future
+#' version of dplyr. It is better to update existing code to explicitly call
+#' `arrange()` instead.
+#'
 #' @export
 #' @examples
 #' by_cyl <- mtcars %>% group_by(cyl)

diff --git a/R/grouped-df.r b/R/grouped-df.r
@@ -40,7 +40,7 @@ compute_groups <- function(data, vars, drop = FALSE) {
 
   # Only train the dictionary based on selected columns
   group_vars <- as_tibble(data)[vars]
-  split_key_loc <- vec_split_id_order(group_vars)
+  split_key_loc <- dplyr_locate_sorted_groups(group_vars)
   old_keys <- split_key_loc$key
   old_rows <- split_key_loc$loc
 
@@ -294,11 +294,16 @@ expand_groups <- function(old_groups, positions, nr) {
   .Call(`dplyr_expand_groups`, old_groups, positions, nr)
 }
 
-vec_split_id_order <- function(x) {
-  split_id <- vec_group_loc(x)
-  split_id$loc <- new_list_of(split_id$loc, ptype = integer())
+dplyr_locate_sorted_groups <- function(x) {
+  out <- vec_locate_sorted_groups(x, nan_distinct = TRUE)
+  out$loc <- new_list_of(out$loc, ptype = integer())
 
-  vec_slice(split_id, vec_order_base(split_id$key))
+  if (dplyr_legacy_locale()) {
+    # Temporary legacy support for respecting the system locale
+    out <- vec_slice(out, vec_order_base(out$key))
+  }
+
+  out
 }
 
 group_intersect <- function(x, new) {

diff --git a/R/locale.R b/R/locale.R
@@ -94,3 +94,19 @@ dplyr_locale <- function() {
 has_minimum_stringi <- function() {
   is_installed("stringi", version = "1.5.3")
 }
+
+dplyr_legacy_locale <- function() {
+  # Used to determine if `group_by()` and `arrange()` should use
+  # base R's `order()` for sorting, which respects the system locale and was
+  # our sorting engine pre-1.1.0.
+  out <- peek_option("dplyr.legacy_locale") %||% FALSE
+
+  if (!is_bool(out)) {
+    abort(
+      "Global option `dplyr.legacy_locale` must be a single `TRUE` or `FALSE`.",
+      call = NULL
+    )
+  }
+
+  out
+}
diff --git a/man/group_by.Rd b/man/group_by.Rd
diff --git a/tests/testthat/_snaps/locale.md b/tests/testthat/_snaps/locale.md
@@ -6,3 +6,11 @@
       Error in `dplyr_locale()`:
       ! If set, the global option `dplyr.locale` must be a string.
 
+# `dplyr_legacy_locale()` respects `dplyr.legacy_locale`
+
+    Code
+      dplyr_legacy_locale()
+    Condition
+      Error:
+      ! Global option `dplyr.legacy_locale` must be a single `TRUE` or `FALSE`.
+
diff --git a/tests/testthat/helper-encoding.R b/tests/testthat/helper-encoding.R
@@ -28,17 +28,25 @@ get_alien_lang_string <- function() {
   lang_strings$different[[1L]]
 }
 
-try_encoding <- function(enc) {
-  orig_encoding <- Sys.getlocale("LC_CTYPE")
-  on.exit(Sys.setlocale("LC_CTYPE", orig_encoding), add = TRUE)
-  tryCatch({
-    Sys.setlocale("LC_CTYPE", enc)
-    TRUE
-  },
-  warning = function(w) FALSE,
-  error = function(e) FALSE
+has_locale <- function(locale, category) {
+  original <- Sys.getlocale(category = category)
+  on.exit(Sys.setlocale(category = category, locale = original), add = TRUE)
+
+  tryCatch(
+    expr = {
+      Sys.setlocale(category = category, locale = locale)
+      TRUE
+    },
+    warning = function(w) FALSE,
+    error = function(e) FALSE
   )
 }
+has_collate_locale <- function(locale) {
+  has_locale(locale = locale, category = "LC_COLLATE")
+}
+has_ctype_locale <- function(enc) {
+  has_locale(locale = enc, category = "LC_CTYPE")
+}
 
 non_utf8_encoding <- function(enc = NULL) {
   if (!l10n_info()$`UTF-8`) {
@@ -50,7 +58,7 @@ non_utf8_encoding <- function(enc = NULL) {
     "fr_CH.ISO8859-1",
     "fr_CH.ISO8859-15"
   )
-  available <- vapply(enc, try_encoding, logical(1))
+  available <- vapply(enc, has_ctype_locale, logical(1))
   if (any(available)) {
     enc[available][1]
   } else {

diff --git a/tests/testthat/test-grouped-df.r b/tests/testthat/test-grouped-df.r
@@ -189,3 +189,26 @@ test_that("helper gives meaningful error messages", {
     (expect_error(grouped_df(data.frame(x = 1), 1)))
   })
 })
+
+test_that("NA and NaN are in separate groups at the end", {
+  df <- tibble(x = c(NA, NaN, NA, 1))
+  result <- compute_groups(df, "x")
+  expect_identical(result$x, c(1, NaN, NA))
+})
+
+test_that("groups are ordered in the C locale", {
+  df <- tibble(x = c("a", "A", "Z", "b"))
+  result <- compute_groups(df, "x")
+  expect_identical(result$x, c("A", "Z", "a", "b"))
+})
+
+test_that("using the global option `dplyr.legacy_locale` forces the system locale", {
+  skip_if_not(has_collate_locale("en_US"), message = "Can't use 'en_US' locale")
+
+  local_options(dplyr.legacy_locale = TRUE)
+  withr::local_collate("en_US")
+
+  df <- tibble(x = c("a", "A", "Z", "b"))
+  result <- compute_groups(df, "x")
+  expect_identical(result$x, c("a", "A", "b", "Z"))
+})
diff --git a/tests/testthat/test-locale.R b/tests/testthat/test-locale.R
@@ -11,3 +11,17 @@ test_that("`dplyr_locale()` respects `dplyr.locale`", {
     dplyr_locale()
   })
 })
+
+test_that("`dplyr_legacy_locale()` is `FALSE` by default", {
+  expect_false(dplyr_legacy_locale())
+})
+
+test_that("`dplyr_legacy_locale()` respects `dplyr.legacy_locale`", {
+  local_options(dplyr.legacy_locale = TRUE)
+  expect_true(dplyr_legacy_locale())
+
+  local_options(dplyr.legacy_locale = 1)
+  expect_snapshot(error = TRUE, {
+    dplyr_legacy_locale()
+  })
+})