Merge branch 'main' into set-default-pipe-in-project-settings

UCD-SERG · Nov 16, 2024 · ef2e993 · ef2e993
2 parents 1268d8c + 9d08110
commit ef2e993
Show file tree

Hide file tree

Showing 12 changed files with 452 additions and 99 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -29,4 +29,5 @@ allpopsamples_hlye.csv$
 ^vignettes/\.quarto$
 ^vignettes/methodology\.qmd$
 ^\.quarto$
+^man/check_strata\.Rd$
 ^man/df_to_array\.Rd$
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: serocalculator
 Title: Estimating Infection Rates from Serological Data
-Version: 1.2.0.9021
+Version: 1.2.0.9022
 Authors@R: c(
     person("Peter", "Teunis", , "p.teunis@emory.edu", role = c("aut", "cph"),
            comment = "Author of the method and original code."),
@@ -39,7 +39,10 @@ Imports:
     tidyr,
     tidyselect,
     utils,
-    purrr
+    purrr,
+    and,
+    glue,
+    stringr
 Suggests:
     bookdown,
     devtag (>= 0.0.0.9000),

diff --git a/NEWS.md b/NEWS.md
@@ -28,6 +28,7 @@
 ## Internal changes
 * Change default pipe setting (#312)
 
+* Add test for missing strata in `est.incidence.by` (#227)
 * Added `snapshot_value` test for `est.incidence()` (#315)
 
 * Sped up `lint-changed-files` GitHub Action (#317)

diff --git a/R/check_strata.R b/R/check_strata.R
@@ -0,0 +1,64 @@
+#' @title Check a `pop_data` object for requested strata variables
+#' @description Aborts with a classed error if `strata` is not a character
+#' vector or names variables absent from `pop_data` (with spelling hints).
+#' @param pop_data a `pop_data` object; only its names are inspected
+#' @param strata a [character] vector of stratifying-variable names
+#' @returns [NULL], invisibly
+#' @examples
+#' sees_pop_data_pk_100 |>
+#'   check_strata(strata = c("ag", "catch", "Count"))
+#' @dev
+check_strata <- function(pop_data, strata) {
+  if (!is.character(strata)) {
+    cli::cli_abort(
+      class = "strata are not strings",
+      message = c(
+        "x" = "Argument `strata` is not a character vector.",
+        "i" = "Provide a character vector with names of stratifying variables."
+      )
+    )
+  }
+
+  present_strata_vars <- intersect(strata, names(pop_data))
+  missing_strata_vars <- setdiff(strata, present_strata_vars)
+
+  if (length(missing_strata_vars) > 0) {
+    message0 <- c(
+      "Can't stratify provided {.arg pop_data}
+       with the provided {.arg strata}:",
+      "i" = "variable {.var {missing_strata_vars}}
+             {?is/are} missing in {.arg pop_data}."
+    )
+
+    # Suggest columns whose names contain each missing input. `fixed()` keeps
+    # the match literal, so regex metacharacters in `strata` cannot error here.
+    partial_matches <- purrr::map(missing_strata_vars, function(x) {
+      stringr::str_subset(names(pop_data), pattern = stringr::fixed(x)) |>
+        glue::backtick() |>
+        and::or()
+    }) |>
+      rlang::set_names(missing_strata_vars) |>
+      purrr::keep(~ length(.x) > 0)
+    inputs_with_partial_matches <- names(partial_matches) # nolint: object_usage_linter
+
+    if (length(partial_matches) > 0) {
+      partial_matches <-
+        glue::glue("\"{names(partial_matches)}\": {partial_matches}")
+      message0 <- c(
+        message0,
+        "i" = "The following input{?s} to {.arg strata}
+                  might be misspelled:
+                  {.str {inputs_with_partial_matches}}",
+        "i" = "Did you mean:",
+        partial_matches |> rlang::set_names("*")
+      )
+    }
+
+    cli::cli_abort(
+      class = "missing_var",
+      call = rlang::caller_env(),
+      message = message0
+    )
+  }
+  invisible(NULL)
+}
diff --git a/R/est.incidence.by.R b/R/est.incidence.by.R
@@ -1,14 +1,22 @@
 #' Estimate Seroincidence
-#'
 #' @description
-#' Function to estimate seroincidences based on cross-section serology data and longitudinal
+#' Function to estimate seroincidences based on cross-sectional
+#' serology data and longitudinal
 #' response model.
 #'
-#' @param pop_data a [data.frame] with cross-sectional serology data per antibody and age, and additional columns corresponding to each element of the `strata` input
-#' @param strata a [character] vector of stratum-defining variables. Values must be variable names in `pop_data`.
-#' @param curve_strata_varnames A subset of `strata`. Values must be variable names in `curve_params`. Default = "".
-#' @param noise_strata_varnames A subset of `strata`. Values must be variable names in `noise_params`. Default = "".
-#' @param num_cores Number of processor cores to use for calculations when computing by strata. If set to more than 1 and package \pkg{parallel} is available, then the computations are executed in parallel. Default = 1L.
+#' @param pop_data a [data.frame] with cross-sectional serology data per
+#' antibody and age, and additional columns corresponding to
+#' each element of the `strata` input
+#' @param strata a [character] vector of stratum-defining variables.
+#' Values must be variable names in `pop_data`.
+#' @param curve_strata_varnames A subset of `strata`.
+#' Values must be variable names in `curve_params`. Default = "".
+#' @param noise_strata_varnames A subset of `strata`.
+#' Values must be variable names in `noise_params`. Default = "".
+#' @param num_cores Number of processor cores to use for
+#' calculations when computing by strata. If set to
+#' more than 1 and package \pkg{parallel} is available,
+#' then the computations are executed in parallel. Default = 1L.
 
 #' @details
 #'
@@ -17,7 +25,8 @@
 #' and then the data will be passed to [est.incidence()].
 #' If for some reason you want to use [est.incidence.by()]
 #' with no strata instead of calling [est.incidence()],
-#' you may use `NA`, `NULL`, or `""` as the `strata` argument to avoid that warning.
+#' you may use `NA`, `NULL`, or `""` as the `strata`
+#' argument to avoid that warning.
 #'
 #'
 #' @inheritParams est.incidence
@@ -26,7 +35,9 @@
 #'
 #' @return
 #' * if `strata` has meaningful inputs:
-#' An object of class `"seroincidence.by"`; i.e., a list of `"seroincidence"` objects from [est.incidence()], one for each stratum, with some meta-data attributes.
+#' An object of class `"seroincidence.by"`; i.e., a list of
+#' `"seroincidence"` objects from [est.incidence()], one for each stratum,
+#' with some meta-data attributes.
 #' * if `strata` is missing, `NULL`, `NA`, or `""`:
 #' An object of class `"seroincidence"`.
 #'
@@ -39,7 +50,8 @@
 #'
 #' curve <- load_curve_params("https://osf.io/download/rtw5k/") %>%
 #'   filter(antigen_iso %in% c("HlyE_IgA", "HlyE_IgG")) %>%
-#'   slice(1:100, .by = antigen_iso) # Reduce dataset for the purposes of this example
+#'  # Reduce dataset for the purposes of this example:
+#'   slice(1:100, .by = antigen_iso)
 #'
 #' noise <- load_noise_params("https://osf.io/download//hqy4v/")
 #'
@@ -49,7 +61,7 @@
 #'   curve_params = curve,
 #'   noise_params = noise %>% filter(Country == "Pakistan"),
 #'   antigen_isos = c("HlyE_IgG", "HlyE_IgA"),
-#'   #num_cores = 8 # Allow for parallel processing to decrease run time
+#'   # num_cores = 8 # Allow for parallel processing to decrease run time
 #'   iterlim = 5 # limit iterations for the purpose of this example
 #' )
 #'
@@ -71,22 +83,27 @@ est.incidence.by <- function(
     verbose = FALSE,
     print_graph = FALSE,
     ...) {
-  if (missing(strata)) {
-    warning(
-      "The `strata` argument to `est.incidence.by()` is missing.",
-      "\n\n  If you do not want to stratify your data, ",
-      "consider using the `est.incidence()` function to simplify your code and avoid this warning.",
-      "\n\n Since the `strata` argument is empty, `est.incidence.by()` will return a `seroincidence` object, instead of a `seroincidence.by` object.\n"
-    )
-  }
 
   strata_is_empty <-
     missing(strata) ||
-      is.null(strata) ||
-      setequal(strata, NA) ||
-      setequal(strata, "")
+    is.null(strata) ||
+    setequal(strata, NA) ||
+    setequal(strata, "")
 
   if (strata_is_empty) {
+    cli::cli_warn(
+      class = "strata_empty",
+      c(
+        "The {.arg strata} argument to {.fn est.incidence.by} is missing.",
+        "i" = "If you do not want to stratify your data,
+               consider using the {.fn est.incidence} function to
+               simplify your code and avoid this warning.",
+        "i" = "Since the {.arg strata} argument is empty,
+               {.fn est.incidence.by} will return a {.cls seroincidence} object,
+               instead of a {.cls seroincidence.by} object."
+      )
+    )
+
     to_return <-
       est.incidence(
         pop_data = pop_data,
@@ -101,7 +118,7 @@ est.incidence.by <- function(
     return(to_return)
   }
 
-  .checkStrata(data = pop_data, strata = strata)
+  check_strata(pop_data, strata = strata)
 
   .errorCheck(
     data = pop_data,
@@ -110,7 +127,7 @@ est.incidence.by <- function(
   )
 
   # Split data per stratum
-  stratumDataList <- stratify_data(
+  stratum_data_list <- stratify_data(
     antigen_isos = antigen_isos,
     data = pop_data %>% filter(.data$antigen_iso %in% antigen_isos),
     curve_params = curve_params %>% filter(.data$antigen_iso %in% antigen_isos),
@@ -120,18 +137,25 @@ est.incidence.by <- function(
     noise_strata_varnames = noise_strata_varnames
   )
 
-  strata_table <- stratumDataList %>% attr("strata")
+  strata_table <- stratum_data_list %>% attr("strata")
 
   if (verbose) {
-    message("Data has been stratified.")
-    message("Here are the strata that will be analyzed:")
-    print(strata_table)
+    cli::cli_inform(
+      c(
+        "i" = "Data has been stratified.",
+        "i" = "Here are the strata that will be analyzed:",
+        ""
+      ),
+      body = strata_table |> capture.output()
+    )
   }
 
   if (num_cores > 1L && !requireNamespace("parallel", quietly = TRUE)) {
-    warning(
-      "The `parallel` package is not installed, so `num_cores > 1` has no effect.",
-      "To install `parallel`, run `install.packages('parallel')` in the console."
+    cli::cli_warn(c( # single vector; a 2nd positional arg would be `class`
+      "The `parallel` package is not installed,
+      so `num_cores > 1` has no effect.",
+      "i" = "To install `parallel`, run `install.packages('parallel')`
+      in the console."
+    ))
   }
 
@@ -142,11 +166,11 @@ est.incidence.by <- function(
     num_cores <- num_cores %>% check_parallel_cores()
 
     if (verbose) {
-      message("Setting up parallel processing with `num_cores` = ", num_cores, ".")
+      cli::cli_inform("Setting up parallel processing with
+              `num_cores` = {num_cores}.")
     }
 
-
-    libPaths <- .libPaths()
+    lib_paths <- .libPaths()
     cl <-
       num_cores %>%
       parallel::makeCluster() %>%
@@ -155,17 +179,22 @@ est.incidence.by <- function(
       parallel::stopCluster(cl)
     })
 
-    parallel::clusterExport(cl, c("libPaths"), envir = environment())
+    # Export library paths to the cluster
+    parallel::clusterExport(cl, "lib_paths", envir = environment())
+
+    # Evaluate library loading on the cluster
     parallel::clusterEvalQ(cl, {
-      .libPaths(libPaths)
-      require(serocalculator) # note - this gets out of sync when using load_all() in development
+      .libPaths(lib_paths)
+      # note - this gets out of sync when using load_all() in development
+      require(serocalculator)
       require(dplyr)
     })
 
-    {
+    # Perform parallel computation and record execution time
+    time <- system.time({
       fits <- parallel::parLapplyLB(
         cl = cl,
-        X = stratumDataList,
+        X = stratum_data_list,
         fun = function(x) {
           do.call(
             what = est.incidence,
@@ -183,63 +212,59 @@ est.incidence.by <- function(
           )
         }
       )
-    } %>% system.time() -> time
+    })
 
     if (verbose) {
-      message("Elapsed time for parallelized code: ")
-      print(time)
+      cli::cli_inform(c("i" = "Elapsed time for parallelized code:"),
+        body = capture.output(time)
+      )
     }
   } else {
-    # fits <- lapply(
-    #   X = stratumDataList,
-    #   FUN = function(x) est.incidence(dataList = x, verbose = verbose, ...))
-
-    fits <- list()
+    # Time progress:
+    time <- system.time({
+      fits <- list() # Initialize an empty list for fits
 
-    { # time progress
-
-      for (cur_stratum in names(stratumDataList))
-      {
-        cur_stratum_vars <-
-          strata_table %>%
+      for (cur_stratum in names(stratum_data_list)) {
+        cur_stratum_vars <- strata_table %>%
           dplyr::filter(.data$Stratum == cur_stratum)
 
         if (verbose) {
-          message("starting new stratum: ", cur_stratum)
+          cli::cli_inform("starting new stratum: {cur_stratum}")
           print(cur_stratum_vars)
         }
 
-        fits[[cur_stratum]] <-
-          do.call(
-            what = est.incidence,
-            args = c(
-              stratumDataList[[cur_stratum]],
-              list(
-                lambda_start = lambda_start,
-                antigen_isos = antigen_isos,
-                build_graph = build_graph,
-                print_graph = print_graph,
-                verbose = verbose,
-                ...
-              )
+        fits[[cur_stratum]] <- do.call(
+          what = est.incidence,
+          args = c(
+            stratum_data_list[[cur_stratum]],
+            list(
+              lambda_start = lambda_start,
+              antigen_isos = antigen_isos,
+              build_graph = build_graph,
+              print_graph = print_graph,
+              verbose = verbose,
+              ...
             )
           )
+        )
       }
-    } %>% system.time() -> time
+    })
 
     if (verbose) {
-      message("Elapsed time for loop over strata: ")
-      print(time)
+      cli::cli_inform(
+        c("i" = "Elapsed time for loop over strata: "),
+        body = capture.output(time)
+      )
     }
   }
 
-  incidenceData <- structure(
+  incidence_data <- structure(
     fits,
     antigen_isos = antigen_isos,
     Strata = strata_table,
     graphs_included = build_graph,
     class = "seroincidence.by" %>% union(class(fits))
   )
 
-  return(incidenceData)
+  return(incidence_data)
 }