Closes #2480 derive pair of variables (#2503)

* initial draft * now using exprs * added unit tests * added manual and namespace * lint and style * udpate manual * styler * update manual * fix select * fix example * update news.md * fix VSTEST * fix example * fix documentation * update filter * update manual * added keyword and family * update ad_advs.R template * updated functionality allows for specifyying by_vars * show recursive function * fix bug and improve documentation * updated tests * add warning for forgotten by_vars * style & lint * update manual * added helper documentation * update * half way there :) * clean up ad_adpc * UPDATE VIGNETTES * fix error * styler * Update vignettes/adsl.Rmd Co-authored-by: Ben Straub <ben.x.straub@gsk.com> * Update vignettes/bds_exposure.Rmd Co-authored-by: Ben Straub <ben.x.straub@gsk.com> * Update R/derive_vars_cat.R Co-authored-by: Ben Straub <ben.x.straub@gsk.com> * Update R/derive_vars_cat.R * Update NEWS.md * Update R/derive_vars_cat.R Co-authored-by: Stefan Bundfuss <80953585+bundfussr@users.noreply.github.com> * Update inst/templates/ad_advs.R * update * Update vignettes/bds_finding.Rmd * update style of tibbles * update error message for definition * Simplify assertions. Update test formatting. 
* updated example * add alternative way * improve wording * Update R/derive_vars_cat.R Co-authored-by: Stefan Bundfuss <80953585+bundfussr@users.noreply.github.com> * Update NEWS.md * Update vignettes/generic.Rmd * update man * remove reliance on assertthat * styler * Update vignettes/bds_finding.Rmd Co-authored-by: Stefan Bundfuss <80953585+bundfussr@users.noreply.github.com> * Update vignettes/bds_exposure.Rmd Co-authored-by: Stefan Bundfuss <80953585+bundfussr@users.noreply.github.com> * Update vignettes/adsl.Rmd Co-authored-by: Stefan Bundfuss <80953585+bundfussr@users.noreply.github.com> * Update R/derive_vars_cat.R Co-authored-by: Stefan Bundfuss <80953585+bundfussr@users.noreply.github.com> * Update R/derive_vars_cat.R Co-authored-by: Stefan Bundfuss <80953585+bundfussr@users.noreply.github.com> * Update R/derive_vars_cat.R Co-authored-by: Stefan Bundfuss <80953585+bundfussr@users.noreply.github.com> * update format * update format * format * Update R/derive_vars_cat.R * style & spelling * update tests to tibbles * fix template * switch function * fix alignment * tryout * add ::: to internal function * Update R/derive_vars_cat.R Co-authored-by: Stefan Bundfuss <80953585+bundfussr@users.noreply.github.com> * Update R/derive_vars_cat.R Co-authored-by: Stefan Bundfuss <80953585+bundfussr@users.noreply.github.com> * Update R/derive_vars_cat.R Co-authored-by: Stefan Bundfuss <80953585+bundfussr@users.noreply.github.com> * aligned again * update manual * Update R/derive_vars_cat.R Co-authored-by: Ben Straub <ben.x.straub@gsk.com> * Update R/derive_vars_cat.R Co-authored-by: Ben Straub <ben.x.straub@gsk.com> * Update R/derive_vars_cat.R Co-authored-by: Ben Straub <ben.x.straub@gsk.com> * Update R/derive_vars_cat.R Co-authored-by: Ben Straub <ben.x.straub@gsk.com> * added test for error * lint * update to cli_warn * fix style --------- Co-authored-by: Ben Straub <ben.x.straub@gsk.com> Co-authored-by: Stefan Bundfuss <80953585+bundfussr@users.noreply.github.com>
pharmaverse · Sep 30, 2024 · 77666b1 · 77666b1
1 parent 680074a
commit 77666b1
Show file tree

Hide file tree

Showing 29 changed files with 1,162 additions and 353 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -100,6 +100,7 @@ export(derive_var_trtdurd)
 export(derive_var_trtemfl)
 export(derive_vars_aage)
 export(derive_vars_atc)
+export(derive_vars_cat)
 export(derive_vars_computed)
 export(derive_vars_crit_flag)
 export(derive_vars_dt)

diff --git a/NEWS.md b/NEWS.md
@@ -2,6 +2,8 @@
 
 ## New Features
 
+- New function `derive_vars_cat()` for deriving pairs of variables or more, e.g. 
+`AVALCATx` & `AVALCAxN`. (#2480)
 - New function `derive_vars_crit_flag()` for deriving criterion flag variables
 (`CRITy`, `CRITyFL`, `CRITyFLN`). (#2468)
 

diff --git a/R/derive_vars_cat.R b/R/derive_vars_cat.R
@@ -0,0 +1,258 @@
+#' Derive Categorization Variables Like `AVALCATy` and `AVALCAyN`
+#' @param dataset
+#' `r roxygen_param_dataset(expected_vars = c("by_vars", "definition"))`
+#' @param definition List of expressions created by `exprs()`.
+#' Must be in rectangular format and specified using the same syntax as when creating
+#' a `tibble` using the `tribble()` function.
+#' The `definition` object will be converted to a `tibble` using `tribble()` inside this function.
+#'
+#' Must contain:
+#'  - the column `condition` which will be converted to a logical expression and
+#'  will be used on the `dataset` input.
+#'  - at least one additional column with the new column name and
+#'  the category value(s) used by the logical expression.
+#'  - the column specified in `by_vars` (if `by_vars` is specified)
+#'
+#' e.g. if `by_vars` is not specified:
+#'
+#' ```{r}
+#' #| eval: false
+#' exprs(~condition,   ~AVALCAT1, ~AVALCA1N,
+#'       AVAL >= 140, ">=140 cm",         1,
+#'       AVAL < 140,   "<140 cm",         2)
+#' ```
+#'
+#' e.g. if `by_vars` is specified as `exprs(VSTEST)`:
+#'
+#' ```{r}
+#' #| eval: false
+#' exprs(~VSTEST,   ~condition,  ~AVALCAT1, ~AVALCA1N,
+#'       "Height", AVAL >= 140, ">=140 cm",         1,
+#'       "Height",  AVAL < 140,  "<140 cm",         2)
+#' ```
+#'
+#' @param by_vars list of expressions with one element. `NULL` by default.
+#' Allows for specifying by groups, e.g. `exprs(PARAMCD)`.
+#' Variable must be present in both `dataset` and `definition`.
+#' The conditions in `definition` are applied only to those records that match `by_vars`.
+#' The categorization variables are set to `NA` for records
+#' not matching any of the by groups in `definition`.
+#'
+#'
+#' @details
+#' If conditions are overlapping, the row order of `definition` must be carefully considered.
+#' The **first** match will determine the category.
+#' i.e. if
+#'
+#' `AVAL = 155`
+#'
+#' and the `definition` is:
+#'
+#' ```{r}
+#' #| eval: false
+#' definition <- exprs(
+#'   ~VSTEST,   ~condition,  ~AVALCAT1, ~AVALCA1N,
+#'   "Height",  AVAL > 170,  ">170 cm",         1,
+#'   "Height", AVAL <= 170, "<=170 cm",         2,
+#'   "Height", AVAL <= 160, "<=160 cm",         3
+#' )
+#' ```
+#' then `AVALCAT1` will be `"<=170 cm"`, as this is the first match for `AVAL`.
+#' If you specify:
+#'
+#' ```{r}
+#' #| eval: false
+#' definition <- exprs(
+#'   ~VSTEST,   ~condition,  ~AVALCAT1, ~AVALCA1N,
+#'   "Height", AVAL <= 160, "<=160 cm",         3,
+#'   "Height", AVAL <= 170, "<=170 cm",         2,
+#'   "Height",  AVAL > 170,  ">170 cm",         1
+#' )
+#' ```
+#'
+#' Then `AVAL <= 160` will lead to `AVALCAT1 == "<=160 cm"`,
+#' `AVAL` in-between `160` and `170` will lead to `AVALCAT1 == "<=170 cm"`,
+#' and `AVAL > 170` will lead to `AVALCAT1 == ">170 cm"`.
+#'
+#' However, we suggest to be more explicit when defining the `condition`, to avoid overlap.
+#' In this case, the middle condition should be:
+#' `AVAL <= 170 & AVAL > 160`
+#'
+#' @return The input dataset with the new variables defined in `definition` added
+#' @family der_gen
+#' @keywords der_gen
+#' @export
+#'
+#' @examples
+#' library(dplyr)
+#' library(tibble)
+#'
+#' advs <- tibble::tribble(
+#'   ~USUBJID,       ~VSTEST,  ~AVAL,
+#'   "01-701-1015", "Height", 147.32,
+#'   "01-701-1015", "Weight",  53.98,
+#'   "01-701-1023", "Height", 162.56,
+#'   "01-701-1023", "Weight",     NA,
+#'   "01-701-1028", "Height",     NA,
+#'   "01-701-1028", "Weight",     NA,
+#'   "01-701-1033", "Height", 175.26,
+#'   "01-701-1033", "Weight",  88.45
+#' )
+#'
+#' definition <- exprs(
+#'   ~condition,                        ~AVALCAT1, ~AVALCA1N,  ~NEWCOL,
+#'   VSTEST == "Height" & AVAL > 160,   ">160 cm",         1, "extra1",
+#'   VSTEST == "Height" & AVAL <= 160, "<=160 cm",         2, "extra2"
+#' )
+#' derive_vars_cat(
+#'   dataset = advs,
+#'   definition = definition
+#' )
+#'
+#' # Using by_vars:
+#' definition2 <- exprs(
+#'   ~VSTEST,   ~condition,  ~AVALCAT1, ~AVALCA1N,
+#'   "Height",  AVAL > 160,  ">160 cm",         1,
+#'   "Height", AVAL <= 160, "<=160 cm",         2,
+#'   "Weight",   AVAL > 70,   ">70 kg",         1,
+#'   "Weight",  AVAL <= 70,  "<=70 kg",         2
+#' )
+#'
+#' derive_vars_cat(
+#'   dataset = advs,
+#'   definition = definition2,
+#'   by_vars = exprs(VSTEST)
+#' )
+#'
+#' # With three conditions:
+#' definition3 <- exprs(
+#'   ~VSTEST,                ~condition,  ~AVALCAT1, ~AVALCA1N,
+#'   "Height",               AVAL > 170,  ">170 cm",         1,
+#'   "Height", AVAL <= 170 & AVAL > 160, "<=170 cm",         2,
+#'   "Height",              AVAL <= 160, "<=160 cm",         3
+#' )
+#'
+#' derive_vars_cat(
+#'   dataset = advs,
+#'   definition = definition3,
+#'   by_vars = exprs(VSTEST)
+#' )
+#'
+#' # Let's derive both the MCRITyML and the MCRITyMN variables
+#' adlb <- tibble::tribble(
+#'   ~USUBJID,     ~PARAM, ~AVAL, ~AVALU,  ~ANRHI,
+#'   "01-701-1015", "ALT",   150,  "U/L",      40,
+#'   "01-701-1023", "ALT",    70,  "U/L",      40,
+#'   "01-701-1036", "ALT",   130,  "U/L",      40,
+#'   "01-701-1048", "ALT",    30,  "U/L",      40,
+#'   "01-701-1015", "AST",    50,  "U/L",      35
+#' )
+#'
+#' definition_mcrit <- exprs(
+#'   ~PARAM,                      ~condition,    ~MCRIT1ML, ~MCRIT1MN,
+#'   "ALT",                    AVAL <= ANRHI,    "<=ANRHI",         1,
+#'   "ALT", ANRHI < AVAL & AVAL <= 3 * ANRHI, ">1-3*ANRHI",         2,
+#'   "ALT",                 3 * ANRHI < AVAL,   ">3*ANRHI",         3
+#' )
+#'
+#' adlb %>%
+#'   derive_vars_cat(
+#'     definition = definition_mcrit,
+#'     by_vars = exprs(PARAM)
+#'   )
+derive_vars_cat <- function(dataset,
+                            definition,
+                            by_vars = NULL) {
+  # validate inputs: `definition` must be a list of expressions and `by_vars`
+  # (if given) a list holding exactly one variable
+  assert_expr_list(definition)
+  assert_vars(by_vars, optional = TRUE)
+  if (length(by_vars) > 1) {
+    cli_abort("{.arg by_vars} must contain just one variable, e.g. {.code exprs(PARAMCD)}")
+  }
+
+  # every variable referenced in `definition` (and the by variable) must exist
+  # in `dataset`
+  assert_data_frame(dataset,
+    required_vars = c(
+      admiraldev::extract_vars(definition) %>% unique(),
+      by_vars
+    )
+  )
+
+  # transform definition to tibble
+  names(definition) <- NULL
+  definition <- tryCatch(
+    {
+      tibble::tribble(!!!definition)
+    },
+    error = function(e) {
+      # Re-signal the conversion error with guidance on the expected format
+      cli_abort(
+        c(
+          paste(
+            "Failed to convert {.arg definition} to {.cls tibble}.",
+            "{.arg definition} should be specified similarly to how you would",
+            "specify a {.cls tibble} using the {.fun tibble::tribble} function so it",
+            "can be converted to {.cls tibble} using {.fun tibble::tribble}."
+          ),
+          e$message
+        )
+      )
+    }
+  )
+  assert_data_frame(definition, required_vars = c(exprs(condition), by_vars))
+
+  if (!is.null(by_vars)) {
+    by_name <- as.character(by_vars)
+    by_sym <- sym(by_name)
+
+    # Restrict each condition to its by group. The combined condition is built
+    # as a call (`<condition> & <by variable> == <value>`) instead of pasting
+    # and re-parsing text: this keeps the original condition intact as one
+    # operand even if it contains `|`, and it works for by values containing
+    # quotes.
+    definition[["condition"]] <- map2(
+      definition[["condition"]],
+      definition[[by_name]],
+      function(cond, by_value) {
+        rlang::call2("&", cond, rlang::call2("==", by_sym, by_value))
+      }
+    )
+
+    # the by variable is not a new variable, so drop it from the definition
+    definition[[by_name]] <- NULL
+  }
+
+  # extract new variable names and conditions
+  new_col_names <- setdiff(names(definition), "condition")
+  condition <- definition[["condition"]]
+
+  # warn if new variables already exist
+  if (any(new_col_names %in% names(dataset))) {
+    cli_warn(paste(
+      "Column(s) in {.arg definition} already exist in {.arg dataset}.",
+      "Did you forget to specify {.arg by_vars},",
+      "or are you rerunning your code?"
+    ))
+  }
+
+  # (re)apply the function for each new variable name and iteratively derive the categories;
+  # the conditions are evaluated in row order, so the first matching row determines the value
+  reduce(new_col_names, function(.data, col_name) {
+    # category values belonging to the conditions
+    values <- definition[[col_name]]
+
+    .data %>%
+      mutate(!!sym(col_name) := eval(rlang::call2(
+        "case_when",
+        !!!map2(condition, values, ~ expr(!!.x ~ !!.y))
+      )))
+  }, .init = dataset)
+}
+
+#' Extend a condition string by adding a new condition based on a variable and its value
+#'
+#' This internal helper function extends a condition string by appending a new condition
+#' that checks if a variable equals a specific value. The existing condition is wrapped
+#' in parentheses so that it is combined as a whole with the appended check, even if it
+#' contains operators binding less tightly than `&` (e.g. `|`). Backslashes and single
+#' quotes in the value are escaped so that the resulting string can still be parsed.
+#'
+#' @param cond A character string representing an existing condition.
+#' @param var A character string representing the name of the variable to check.
+#' @param is A character string representing the value the variable should be equal to.
+#'
+#' @return A character string representing the extended condition, of the form
+#' `(<cond>) & <var> == '<is>'`.
+#' @examples
+#' # Extend an existing condition to include a check for 'AGE == "30"'
+#' admiral:::extend_condition("SEX == 'M'", "AGE", "30")
+#' @keywords internal
+extend_condition <- function(cond, var, is) {
+  # escape backslashes first, then single quotes, so the value survives being
+  # embedded in a single-quoted string literal
+  value <- gsub("\\", "\\\\", as.character(is), fixed = TRUE)
+  value <- gsub("'", "\\'", value, fixed = TRUE)
+
+  # parenthesise `cond`: without this, "A | B" would be parsed as
+  # A | (B & var == 'value') because `&` has higher precedence than `|`
+  paste0("(", cond, ") & ", var, " == '", value, "'")
+}
diff --git a/inst/templates/ad_adeg.R b/inst/templates/ad_adeg.R
@@ -31,67 +31,45 @@ eg <- convert_blanks_to_na(eg)
 
 # Assign PARAMCD, PARAM, and PARAMN
 param_lookup <- tibble::tribble(
-  ~EGTESTCD, ~PARAMCD, ~PARAM, ~PARAMN,
-  "ECGINT", "EGINTP", "ECG Interpretation", 1,
-  "HR", "HR", "Heart Rate (beats/min)", 2,
-  "RR", "RR", "RR Duration (msec)", 3,
-  "RRR", "RRR", "RR Duration Rederived (msec)", 4,
-  "QT", "QT", "QT Duration (msec)", 10,
-  "QTCBR", "QTCBR", "QTcB - Bazett's Correction Formula Rederived (msec)", 11,
-  "QTCFR", "QTCFR", "QTcF - Fridericia's Correction Formula Rederived (msec)", 12,
-  "QTLCR", "QTLCR", "QTlc - Sagie's Correction Formula Rederived (msec)", 13,
+  ~EGTESTCD, ~PARAMCD,                                                    ~PARAM, ~PARAMN,
+  "ECGINT",  "EGINTP",                                      "ECG Interpretation",       1,
+  "HR",          "HR",                                  "Heart Rate (beats/min)",       2,
+  "RR",          "RR",                                      "RR Duration (msec)",       3,
+  "RRR",        "RRR",                            "RR Duration Rederived (msec)",       4,
+  "QT",          "QT",                                      "QT Duration (msec)",      10,
+  "QTCBR",    "QTCBR",     "QTcB - Bazett's Correction Formula Rederived (msec)",      11,
+  "QTCFR",    "QTCFR", "QTcF - Fridericia's Correction Formula Rederived (msec)",      12,
+  "QTLCR",    "QTLCR",      "QTlc - Sagie's Correction Formula Rederived (msec)",      13,
 )
 
 range_lookup <- tibble::tribble(
   ~PARAMCD, ~ANRLO, ~ANRHI,
-  "EGINTP", NA, NA,
-  "HR", 40, 100,
-  "RR", 600, 1500,
-  "QT", 350, 450,
-  "RRR", 600, 1500,
-  "QTCBR", 350, 450,
-  "QTCFR", 350, 450,
-  "QTLCR", 350, 450,
+  "EGINTP",     NA,     NA,
+  "HR",         40,    100,
+  "RR",        600,   1500,
+  "QT",        350,    450,
+  "RRR",       600,   1500,
+  "QTCBR",     350,    450,
+  "QTCFR",     350,    450,
+  "QTLCR",     350,    450
 )
 
-# ASSIGN AVALCAT1
-avalcat_lookup <- tibble::tribble(
-  ~AVALCA1N, ~AVALCAT1,
-  1, "<= 450 msec",
-  2, ">450<=480 msec",
-  3, ">480<=500 msec",
-  4, ">500 msec"
+# Assign AVALCAx
+avalcax_lookup <- exprs(
+  ~condition,                                                  ~AVALCAT1, ~AVALCA1N,
+  startsWith(PARAMCD, "QT") & AVAL <= 450,                 "<= 450 msec",         1,
+  startsWith(PARAMCD, "QT") & AVAL > 450 & AVAL <= 480, ">450<=480 msec",         2,
+  startsWith(PARAMCD, "QT") & AVAL > 480 & AVAL <= 500, ">480<=500 msec",         3,
+  startsWith(PARAMCD, "QT") & AVAL > 500,                    ">500 msec",         4
 )
-
-# ASSIGN CHGCAT1
-chgcat_lookup <- tibble::tribble(
-  ~CHGCAT1N, ~CHGCAT1,
-  1, "<= 30 msec",
-  2, ">30<=60 msec",
-  3, ">60 msec"
+# Assign CHGCAx
+chgcax_lookup <- exprs(
+  ~condition,                                             ~CHGCAT1, ~CHGCAT1N,
+  startsWith(PARAMCD, "QT") & CHG <= 30,              "<= 30 msec",         1,
+  startsWith(PARAMCD, "QT") & CHG > 30 & CHG <= 60, ">30<=60 msec",         2,
+  startsWith(PARAMCD, "QT") & CHG > 60,                 ">60 msec",         3
 )
 
-# Here are some examples of how you can create your own functions that
-#  operates on vectors, which can be used in `mutate()`. Info then used for
-# lookup table
-format_avalca1n <- function(paramcd, aval) {
-  case_when(
-    str_detect(paramcd, "QT") & aval <= 450 ~ 1,
-    str_detect(paramcd, "QT") & aval > 450 & aval <= 480 ~ 2,
-    str_detect(paramcd, "QT") & aval > 480 & aval <= 500 ~ 3,
-    str_detect(paramcd, "QT") & aval > 500 ~ 4
-  )
-}
-
-format_chgcat1n <- function(paramcd, chg) {
-  case_when(
-    str_detect(paramcd, "QT") & chg <= 30 ~ 1,
-    str_detect(paramcd, "QT") & chg > 30 & chg <= 60 ~ 2,
-    str_detect(paramcd, "QT") & chg > 60 ~ 3
-  )
-}
-
-
 # Derivations ----
 
 # Get list of ADSL vars required for derivations
@@ -316,14 +294,13 @@ adeg <- adeg %>%
     check_type = "error"
   ) %>%
   # Derive AVALCA1N and AVALCAT1
-  mutate(AVALCA1N = format_avalca1n(param = PARAMCD, aval = AVAL)) %>%
-  derive_vars_merged(
-    dataset_add = avalcat_lookup,
-    by_vars = exprs(AVALCA1N)
+  derive_vars_cat(
+    definition = avalcax_lookup
   ) %>%
   # Derive CHGCAT1N and CHGCAT1
-  mutate(CHGCAT1N = format_chgcat1n(param = PARAMCD, chg = CHG)) %>%
-  derive_vars_merged(dataset_add = chgcat_lookup, by_vars = exprs(CHGCAT1N)) %>%
+  derive_vars_cat(
+    definition = chgcax_lookup
+  ) %>%
   # Derive PARAM and PARAMN
   derive_vars_merged(
     dataset_add = select(param_lookup, -EGTESTCD),