Filling NA group values for "overall" calculations during `shuffle_ar…

…d()` (#306) **What changes are proposed in this pull request?** * Style this entry in a way that can be copied directly into `NEWS.md`. (#<issue number>, @<username>) Provide more detail here as needed. **Reference GitHub issue associated with pull request.** _e.g., 'closes #<issue number>'_ Closes #235 -------------------------------------------------------------------------------- Pre-review Checklist (if item does not apply, mark it as complete) - [x] **All** GitHub Action workflows pass with a ✅ - [x] PR branch has pulled the most recent updates from master branch: `usethis::pr_merge_main()` - [x] If a bug was fixed, a unit test was added. - [x] Code coverage is suitable for any new functions/features (generally, 100% coverage for new code): `devtools::test_coverage()` - [x] Request a reviewer Reviewer Checklist (if item does not apply, mark it as complete) - [x] If a bug was fixed, a unit test was added. - [x] Run `pkgdown::build_site()`. Check the R console for errors, and review the rendered website. - [x] Code coverage is suitable for any new functions/features: `devtools::test_coverage()` When the branch is ready to be merged: - [x] Update `NEWS.md` with the changes from this pull request under the heading "`# cards (development version)`". If there is an issue associated with the pull request, reference it in parentheses at the end of the update (see `NEWS.md` for examples). - [x] **All** GitHub Action workflows pass with a ✅ - [x] Approve Pull Request - [x] Merge the PR. Please use "Squash and merge" or "Rebase and merge". --------- Signed-off-by: Daniel Sjoberg <danield.sjoberg@gmail.com> Co-authored-by: Daniel Sjoberg <danield.sjoberg@gmail.com>
insightsengineering · Aug 30, 2024 · 35e1698 · 35e1698
1 parent 4c1b715
commit 35e1698
Show file tree

Hide file tree

Showing 6 changed files with 210 additions and 0 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -6,6 +6,8 @@
 
 * Added the `bind_ard(.quiet)` argument to suppress messaging. (#299)
 
+* Improved ability of `shuffle_ard()` to populate missing group values where possible. (#306)
+
 * Added `apply_fmt_fn(replace)` argument. Use `replace=FALSE` to retain any previously formatted statistics in the `stat_fmt` column. (#285)
 
 * Added `bind_ard(.distinct)` argument, which can remove non-distinct rows from the ARD across grouping variables, primary variables, context, statistic name and value. (#286)

diff --git a/R/shuffle_ard.R b/R/shuffle_ard.R
@@ -77,6 +77,7 @@ shuffle_ard <- function(x, trim = TRUE) {
   }
 
   dat_cards_out <- dat_cards_out |>
+    .fill_overall_grp_values(vars_protected) |>
     dplyr::rename(any_of(c(label = "variable_level"))) |>
     dplyr::arrange(".cards_idx") |>
     dplyr::select(-".cards_idx")
@@ -269,6 +270,65 @@ shuffle_ard <- function(x, trim = TRUE) {
     dplyr::mutate(variable = as.character(.data$variable))
 }
 
+
+#' Fill Overall Group Variables
+#'
+#' This function fills the missing values of grouping variables with "Overall
+#' <variable name>" where relevant. Specifically it will modify grouping values
+#' from rows with likely overall calculations present (e.g. non-missing
+#' variable/variable_level, 100% missing group variables, and evidence that the
+#' `variable` has been computed by group in other rows). "Overall" values will
+#' be populated only for grouping variables that have been used in other calculations
+#' of the same variable and statistics.
+#'
+#' @param x (`data.frame`)\cr a data frame; must contain a `.cards_idx` column
+#' @param vars_protected (`character`)\cr names of columns in `x` that are
+#'   never treated as grouping variables
+#' @return data frame: `x` with the group values of matched all-missing rows filled in
+#' @keywords internal
+#'
+#' @examples
+#' data <- dplyr::tibble(
+#'   grp = c("AA", "AA", NA, "BB", NA),
+#'   variable = c("A", "B", "A", "C", "C"),
+#'   variable_level = c(1, 2, 1, 3, 3),
+#'   A = rep(NA, 5),
+#'   B = rep(NA, 5),
+#'   .cards_idx = c(1:5)
+#' )
+#'
+#' cards:::.fill_overall_grp_values(data, vars_protected = ".cards_idx")
+.fill_overall_grp_values <- function(x, vars_protected) {
+  # id_vars (those present in x) are the merge keys; every other non-protected column is a grouping variable
+  id_vars <- c("variable", "variable_level", "stat_name", "stat_label")
+  id_vars <- id_vars[id_vars %in% names(x)]
+  grp_vars <- setdiff(names(x), unique(c(vars_protected, id_vars)))
+
+  # take rows whose grouping values are all NA and overwrite their group columns with "Overall <var>" templates
+  x_missing_by <- x |>
+    dplyr::filter(dplyr::if_all(all_of(grp_vars), ~ is.na(.))) |> # all NA grouping values
+    dplyr::rows_update( # second argument: template rows derived from the by-group rows of x
+      x |>
+        dplyr::filter(dplyr::if_any(all_of(grp_vars), ~ !is.na(.))) |> # rows with at least one non-NA group value
+        dplyr::mutate(dplyr::across(all_of(grp_vars), function(v, cur_col = dplyr::cur_column()) {
+          overall_val <- make.unique(c( # "Overall <col>", de-duplicated against the column's existing values
+            unique(v),
+            paste("Overall", cur_col)
+          )) |>
+            rev() %>% # the overall label was appended last, so it is first after reversing
+            .[1]
+          ifelse(!is.na(v), overall_val, v) # non-missing values become the label; NA stays NA
+        })) |>
+        dplyr::select(-any_of(c(setdiff(names(x), c(grp_vars, id_vars))))) |> # keep only grouping and id columns
+        dplyr::distinct(), # collapse identical template rows
+      by = id_vars, # NOTE(review): rows_update() expects unique keys in the template — confirm for variables computed under several different groupings
+      unmatched = "ignore" # template keys with no all-NA counterpart are skipped
+    )
+
+  # write the modified rows back into x, matching rows on the .cards_idx index column
+  dplyr::rows_update(x, x_missing_by, by = ".cards_idx")
+}
+
 #' List Column as a Vector Predicate
 #'
 #' A predicate function to check whether a column is a list and can be

diff --git a/man/dot-fill_overall_grp_values.Rd b/man/dot-fill_overall_grp_values.Rd
diff --git a/tests/testthat/_snaps/shuffle_ard.md b/tests/testthat/_snaps/shuffle_ard.md
@@ -432,3 +432,72 @@
       1 Overall ARM AGE      <NA>       p          0.05
       2 <NA>        AGE      continuous mean      75.1 
 
+---
+
+    Code
+      as.data.frame(shuffle_ard(bind_ard(ard_categorical(ADSL, by = ARM, variables = AGEGR1),
+      ard_categorical(ADSL, variables = AGEGR1), ard_continuous(ADSL, by = SEX, variables = AGE),
+      ard_continuous(ADSL, variables = AGE))))
+    Output
+                          ARM         SEX variable  label     context stat_name        stat
+      1               Placebo        <NA>   AGEGR1  65-80 categorical         n  42.0000000
+      2               Placebo        <NA>   AGEGR1  65-80 categorical         N  86.0000000
+      3               Placebo        <NA>   AGEGR1  65-80 categorical         p   0.4883721
+      4               Placebo        <NA>   AGEGR1    <65 categorical         n  14.0000000
+      5               Placebo        <NA>   AGEGR1    <65 categorical         N  86.0000000
+      6               Placebo        <NA>   AGEGR1    <65 categorical         p   0.1627907
+      7               Placebo        <NA>   AGEGR1    >80 categorical         n  30.0000000
+      8               Placebo        <NA>   AGEGR1    >80 categorical         N  86.0000000
+      9               Placebo        <NA>   AGEGR1    >80 categorical         p   0.3488372
+      10 Xanomeline High Dose        <NA>   AGEGR1  65-80 categorical         n  55.0000000
+      11 Xanomeline High Dose        <NA>   AGEGR1  65-80 categorical         N  84.0000000
+      12 Xanomeline High Dose        <NA>   AGEGR1  65-80 categorical         p   0.6547619
+      13 Xanomeline High Dose        <NA>   AGEGR1    <65 categorical         n  11.0000000
+      14 Xanomeline High Dose        <NA>   AGEGR1    <65 categorical         N  84.0000000
+      15 Xanomeline High Dose        <NA>   AGEGR1    <65 categorical         p   0.1309524
+      16 Xanomeline High Dose        <NA>   AGEGR1    >80 categorical         n  18.0000000
+      17 Xanomeline High Dose        <NA>   AGEGR1    >80 categorical         N  84.0000000
+      18 Xanomeline High Dose        <NA>   AGEGR1    >80 categorical         p   0.2142857
+      19  Xanomeline Low Dose        <NA>   AGEGR1  65-80 categorical         n  47.0000000
+      20  Xanomeline Low Dose        <NA>   AGEGR1  65-80 categorical         N  84.0000000
+      21  Xanomeline Low Dose        <NA>   AGEGR1  65-80 categorical         p   0.5595238
+      22  Xanomeline Low Dose        <NA>   AGEGR1    <65 categorical         n   8.0000000
+      23  Xanomeline Low Dose        <NA>   AGEGR1    <65 categorical         N  84.0000000
+      24  Xanomeline Low Dose        <NA>   AGEGR1    <65 categorical         p   0.0952381
+      25  Xanomeline Low Dose        <NA>   AGEGR1    >80 categorical         n  29.0000000
+      26  Xanomeline Low Dose        <NA>   AGEGR1    >80 categorical         N  84.0000000
+      27  Xanomeline Low Dose        <NA>   AGEGR1    >80 categorical         p   0.3452381
+      28          Overall ARM        <NA>   AGEGR1  65-80 categorical         n 144.0000000
+      29          Overall ARM        <NA>   AGEGR1  65-80 categorical         N 254.0000000
+      30          Overall ARM        <NA>   AGEGR1  65-80 categorical         p   0.5669291
+      31          Overall ARM        <NA>   AGEGR1    <65 categorical         n  33.0000000
+      32          Overall ARM        <NA>   AGEGR1    <65 categorical         N 254.0000000
+      33          Overall ARM        <NA>   AGEGR1    <65 categorical         p   0.1299213
+      34          Overall ARM        <NA>   AGEGR1    >80 categorical         n  77.0000000
+      35          Overall ARM        <NA>   AGEGR1    >80 categorical         N 254.0000000
+      36          Overall ARM        <NA>   AGEGR1    >80 categorical         p   0.3031496
+      37                 <NA>           F      AGE      N  continuous         N 143.0000000
+      38                 <NA>           F      AGE   Mean  continuous      mean  75.6503497
+      39                 <NA>           F      AGE     SD  continuous        sd   8.1933146
+      40                 <NA>           F      AGE Median  continuous    median  77.0000000
+      41                 <NA>           F      AGE     Q1  continuous       p25  72.0000000
+      42                 <NA>           F      AGE     Q3  continuous       p75  81.0000000
+      43                 <NA>           F      AGE    Min  continuous       min  54.0000000
+      44                 <NA>           F      AGE    Max  continuous       max  89.0000000
+      45                 <NA>           M      AGE      N  continuous         N 111.0000000
+      46                 <NA>           M      AGE   Mean  continuous      mean  74.3603604
+      47                 <NA>           M      AGE     SD  continuous        sd   8.2943494
+      48                 <NA>           M      AGE Median  continuous    median  77.0000000
+      49                 <NA>           M      AGE     Q1  continuous       p25  69.0000000
+      50                 <NA>           M      AGE     Q3  continuous       p75  81.0000000
+      51                 <NA>           M      AGE    Min  continuous       min  51.0000000
+      52                 <NA>           M      AGE    Max  continuous       max  88.0000000
+      53                 <NA> Overall SEX      AGE      N  continuous         N 254.0000000
+      54                 <NA> Overall SEX      AGE   Mean  continuous      mean  75.0866142
+      55                 <NA> Overall SEX      AGE     SD  continuous        sd   8.2462339
+      56                 <NA> Overall SEX      AGE Median  continuous    median  77.0000000
+      57                 <NA> Overall SEX      AGE     Q1  continuous       p25  70.0000000
+      58                 <NA> Overall SEX      AGE     Q3  continuous       p75  81.0000000
+      59                 <NA> Overall SEX      AGE    Min  continuous       min  51.0000000
+      60                 <NA> Overall SEX      AGE    Max  continuous       max  89.0000000
+
diff --git a/tests/testthat/test-ard_stack.R b/tests/testthat/test-ard_stack.R
@@ -108,6 +108,7 @@ test_that("ard_stack() adding overalls", {
 })
 
 
+
 test_that("ard_stack() adding missing/attributes", {
   expect_error(
     ard_test <- ard_stack(
@@ -188,8 +189,36 @@ test_that("ard_stack() .shuffle argument", {
     ) |>
       shuffle_ard()
   )
+
+
+  # with overalls
+  expect_error(
+    ard_test <- ard_stack(
+      data = mtcars,
+      .by = "cyl",
+      ard_continuous(variables = "mpg"),
+      ard_dichotomous(variables = "vs"),
+      .shuffle = TRUE,
+      .overall = TRUE
+    ),
+    NA
+  )
+
+  expect_equal(
+    ard_test,
+    bind_ard(
+      ard_continuous(data = mtcars, by = "cyl", variables = "mpg"),
+      ard_dichotomous(data = mtcars, by = "cyl", variables = "vs"),
+      ard_categorical(data = mtcars, variables = "cyl"),
+      ard_continuous(data = mtcars, variables = "mpg"),
+      ard_dichotomous(data = mtcars, variables = "vs"),
+      .order = TRUE
+    ) |>
+      shuffle_ard()
+  )
 })
 
+
 test_that("ard_stack() adding total N", {
   expect_equal(
     ard_stack(

diff --git a/tests/testthat/test-shuffle_ard.R b/tests/testthat/test-shuffle_ard.R
@@ -92,6 +92,19 @@ test_that("shuffle_ard fills missing group levels if the group is meaningful", {
     ) |>
       shuffle_ard()
   )
+
+  # mix of group variables - fills overall only if variable has been calculated by group elsewhere
+  withr::local_options(list(width = 90))
+  expect_snapshot(
+    bind_ard(
+      ard_categorical(ADSL, by = ARM, variables = AGEGR1),
+      ard_categorical(ADSL, variables = AGEGR1),
+      ard_continuous(ADSL, by = SEX, variables = AGE),
+      ard_continuous(ADSL, variables = AGE)
+    ) |>
+      shuffle_ard() |>
+      as.data.frame()
+  )
 })
 
 test_that("shuffle_ard doesn't trim off NULL/NA values", {