Closes #2644: Use all variables for extract_duplicate_records by default

* update `by_vars` to use all variables * add test for `by_vars = NULL` * update documentation * update NEWS
pharmaverse · Jan 22, 2025 · 27b6407 · 27b6407
1 parent 762612e
commit 27b6407
Show file tree

Hide file tree

Showing 5 changed files with 36 additions and 7 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -4,6 +4,8 @@
 
 ## Updates of Existing Functions
 
+- The function `extract_duplicate_records()` was updated to use all variables of the dataset if the `by_vars` argument is omitted. (#2644)
+
 ## Breaking Changes
 
 - The following function arguments are entering the next phase of the [deprecation process](https://pharmaverse.github.io/admiraldev/articles/programming_strategy.html#deprecation): (#2487) (#2595)

diff --git a/R/duplicates.R b/R/duplicates.R
@@ -39,6 +39,7 @@ get_duplicates_dataset <- function() {
 #' @param by_vars Grouping variables
 #'
 #'  Defines groups of records in which to look for duplicates.
+#'  If omitted, all variables of the dataset are used for defining groups.
 #'
 #' `r roxygen_param_by_vars()`
 #'
@@ -55,9 +56,14 @@ get_duplicates_dataset <- function() {
 #' adsl <- rbind(admiral_adsl[1L, ], admiral_adsl)
 #'
 #' extract_duplicate_records(adsl, exprs(USUBJID))
-extract_duplicate_records <- function(dataset, by_vars) {
-  assert_expr_list(by_vars)
-  assert_data_frame(dataset, required_vars = extract_vars(by_vars), check_is_grouped = FALSE)
+extract_duplicate_records <- function(dataset, by_vars = NULL) {
+  if (is.null(by_vars)) {
+    assert_data_frame(dataset, check_is_grouped = FALSE)
+    by_vars <- exprs(!!!parse_exprs(names(dataset)))
+  } else {
+    assert_expr_list(by_vars)
+    assert_data_frame(dataset, required_vars = extract_vars(by_vars), check_is_grouped = FALSE)
+  }
 
   data_by <- dataset %>%
     ungroup() %>%

diff --git a/man/extract_duplicate_records.Rd b/man/extract_duplicate_records.Rd
diff --git a/tests/testthat/_snaps/duplicates.md b/tests/testthat/_snaps/duplicates.md
@@ -1,4 +1,4 @@
-# signal_duplicate_records Test 2: dataset of duplicate records can be accessed using `get_duplicates_dataset()`
+# signal_duplicate_records Test 3: dataset of duplicate records can be accessed using `get_duplicates_dataset()`
 
     Code
       get_duplicates_dataset()

diff --git a/tests/testthat/test-duplicates.R b/tests/testthat/test-duplicates.R
@@ -18,9 +18,29 @@ test_that("extract_duplicate_records Test 1: duplicate records are extracted", {
   )
 })
 
+## Test 2: duplicate records for all variables ----
+test_that("extract_duplicate_records Test 2: duplicate records for all variables", {
+  input <- tibble::tribble(
+    ~USUBJID, ~COUNTRY, ~AAGE,
+    "P01",    "GER",    22,
+    "P01",    "JPN",    34,
+    "P02",    "CZE",    41,
+    "P03",    "AUS",    39,
+    "P04",    "BRA",    21,
+    "P04",    "BRA",    21
+  )
+  # `by_vars` is omitted on purpose: only rows that agree in every column
+  # (the two "P04" records) are expected to be reported as duplicates.
+  expected_output <- input[5:6, ]
+
+  # testthat convention: actual value first, expected value second, so that
+  # failure messages label the two sides correctly.
+  expect_equal(
+    extract_duplicate_records(input),
+    expected_output
+  )
+})
+
+
 # signal_duplicate_records ----
-## Test 2: dataset of duplicate records can be accessed using `get_duplicates_dataset()` ----
-test_that("signal_duplicate_records Test 2: dataset of duplicate records can be accessed using `get_duplicates_dataset()`", { # nolint
+## Test 3: dataset of duplicate records can be accessed using `get_duplicates_dataset()` ----
+test_that("signal_duplicate_records Test 3: dataset of duplicate records can be accessed using `get_duplicates_dataset()`", { # nolint
   input <- tibble::tribble(
     ~USUBJID, ~COUNTRY, ~AAGE,
     "P01",    "GER",    22,