Skip to content

Commit

Permalink
Merge pull request #227 from UCD-SERG/add-user-error-message-for-miss…
Browse files Browse the repository at this point in the history
…ing-strata

add error message for missing strata
  • Loading branch information
d-morrison authored Nov 16, 2024
2 parents abb4ffd + 1cddf51 commit 9d08110
Show file tree
Hide file tree
Showing 12 changed files with 452 additions and 99 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,5 @@ allpopsamples_hlye.csv$
^vignettes/\.quarto$
^vignettes/methodology\.qmd$
^\.quarto$
^man/check_strata\.Rd$
^man/df_to_array\.Rd$
7 changes: 5 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: serocalculator
Title: Estimating Infection Rates from Serological Data
Version: 1.2.0.9021
Version: 1.2.0.9022
Authors@R: c(
person("Peter", "Teunis", , "p.teunis@emory.edu", role = c("aut", "cph"),
comment = "Author of the method and original code."),
Expand Down Expand Up @@ -39,7 +39,10 @@ Imports:
tidyr,
tidyselect,
utils,
purrr
purrr,
and,
glue,
stringr
Suggests:
bookdown,
devtag (>= 0.0.0.9000),
Expand Down
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

## Internal changes

* Add test for missing strata in `est.incidence.by` (#227)
* Added `snapshot_value` test for `est.incidence()` (#315)

* Sped up `lint-changed-files` GitHub Action (#317)
Expand Down
64 changes: 64 additions & 0 deletions R/check_strata.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#' @title Check a `pop_data` object for requested strata variables
#' @description
#' Verifies that every name in `strata` is a column name of `pop_data`.
#' If any requested name is absent, an error of class `"missing_var"` is
#' raised. When a missing name occurs as a literal substring of one or more
#' existing column names, those columns are listed in the error message as
#' possible intended spellings.
#' @param pop_data a `pop_data` object
#' @param strata a [character] vector
#' @returns [NULL], invisibly
#' @examples
#' sees_pop_data_pk_100 |>
#'   check_strata(strata = c("ag", "catch", "Count"))
#' @dev
check_strata <- function(pop_data, strata) {
  if (!is.character(strata)) {
    cli::cli_abort(
      class = "strata are not strings",
      message = c(
        "x" = "Argument `strata` is not a character vector.",
        "i" = "Provide a character vector with names of stratifying variables."
      )
    )
  }

  present_strata_vars <- intersect(strata, names(pop_data))
  missing_strata_vars <- setdiff(strata, present_strata_vars)

  if (length(missing_strata_vars) > 0) {
    message0 <- c(
      "Can't stratify provided {.arg pop_data}
      with the provided {.arg strata}:",
      "i" = "variable {.var {missing_strata_vars}}
      {?is/are} missing in {.arg pop_data}."
    )

    # For each missing name, find the columns of `pop_data` whose names
    # contain it. The name is matched literally (`stringr::fixed()`), not as
    # a regular expression, so names containing metacharacters such as "(",
    # "." or "+" neither fail to compile nor match unrelated columns.
    candidate_columns <-
      purrr::map(missing_strata_vars, function(x) {
        # An `NA` or empty name cannot be a misspelling of a column name
        # and is not a usable search pattern, so it gets no suggestions.
        if (is.na(x) || !nzchar(x)) {
          return(character(0))
        }
        stringr::str_subset(
          string = names(pop_data),
          pattern = stringr::fixed(x)
        )
      }) |>
      rlang::set_names(missing_strata_vars) |>
      # Drop inputs without any candidate *before* formatting, so this
      # filter does not depend on how the formatter treats empty input.
      purrr::keep(\(cols) length(cols) > 0)

    # Collapse each set of candidates into one string: "`a`, `b` or `c`"
    partial_matches <-
      purrr::map(candidate_columns, function(cols) {
        cols |>
          glue::backtick() |>
          and::or()
      })

    # Referenced only inside the cli template below, hence the nolint.
    inputs_with_partial_matches <- names(partial_matches) # nolint: object_usage_linter

    if (length(partial_matches) > 0) {
      partial_matches <-
        glue::glue("\"{names(partial_matches)}\": {partial_matches}")

      message0 <- c(
        message0,
        "i" = "The following input{?s} to {.arg strata}
        might be misspelled:
        {.str {inputs_with_partial_matches}}",
        "i" = "Did you mean:",
        # one bullet per suggestion
        partial_matches |> rlang::set_names("*")
      )
    }

    # Attribute the error to the caller's frame rather than to this helper.
    cli::cli_abort(
      class = "missing_var",
      call = rlang::caller_env(),
      message = message0
    )
  }

  invisible(NULL)
}
169 changes: 97 additions & 72 deletions R/est.incidence.by.R
Original file line number Diff line number Diff line change
@@ -1,14 +1,22 @@
#' Estimate Seroincidence
#'
#' @description
#' Function to estimate seroincidences based on cross-section serology data and longitudinal
#' Function to estimate seroincidences based on cross-sectional
#' serology data and longitudinal
#' response model.
#'
#' @param pop_data a [data.frame] with cross-sectional serology data per antibody and age, and additional columns corresponding to each element of the `strata` input
#' @param strata a [character] vector of stratum-defining variables. Values must be variable names in `pop_data`.
#' @param curve_strata_varnames A subset of `strata`. Values must be variable names in `curve_params`. Default = "".
#' @param noise_strata_varnames A subset of `strata`. Values must be variable names in `noise_params`. Default = "".
#' @param num_cores Number of processor cores to use for calculations when computing by strata. If set to more than 1 and package \pkg{parallel} is available, then the computations are executed in parallel. Default = 1L.
#' @param pop_data a [data.frame] with cross-sectional serology data per
#' antibody and age, and additional columns corresponding to
#' each element of the `strata` input
#' @param strata a [character] vector of stratum-defining variables.
#' Values must be variable names in `pop_data`.
#' @param curve_strata_varnames A subset of `strata`.
#' Values must be variable names in `curve_params`. Default = "".
#' @param noise_strata_varnames A subset of `strata`.
#' Values must be variable names in `noise_params`. Default = "".
#' @param num_cores Number of processor cores to use for
#' calculations when computing by strata. If set to
#' more than 1 and package \pkg{parallel} is available,
#' then the computations are executed in parallel. Default = 1L.

#' @details
#'
Expand All @@ -17,7 +25,8 @@
#' and then the data will be passed to [est.incidence()].
#' If for some reason you want to use [est.incidence.by()]
#' with no strata instead of calling [est.incidence()],
#' you may use `NA`, `NULL`, or `""` as the `strata` argument to avoid that warning.
#' you may use `NA`, `NULL`, or `""` as the `strata`
#' argument to avoid that warning.
#'
#'
#' @inheritParams est.incidence
Expand All @@ -26,7 +35,9 @@
#'
#' @return
#' * if `strata` has meaningful inputs:
#' An object of class `"seroincidence.by"`; i.e., a list of `"seroincidence"` objects from [est.incidence()], one for each stratum, with some meta-data attributes.
#' An object of class `"seroincidence.by"`; i.e., a list of
#' `"seroincidence"` objects from [est.incidence()], one for each stratum,
#' with some meta-data attributes.
#' * if `strata` is missing, `NULL`, `NA`, or `""`:
#' An object of class `"seroincidence"`.
#'
Expand All @@ -39,7 +50,8 @@
#'
#' curve <- load_curve_params("https://osf.io/download/rtw5k/") %>%
#' filter(antigen_iso %in% c("HlyE_IgA", "HlyE_IgG")) %>%
#' slice(1:100, .by = antigen_iso) # Reduce dataset for the purposes of this example
#' # Reduce dataset for the purposes of this example:
#' slice(1:100, .by = antigen_iso)
#'
#' noise <- load_noise_params("https://osf.io/download//hqy4v/")
#'
Expand All @@ -49,7 +61,7 @@
#' curve_params = curve,
#' noise_params = noise %>% filter(Country == "Pakistan"),
#' antigen_isos = c("HlyE_IgG", "HlyE_IgA"),
#' #num_cores = 8 # Allow for parallel processing to decrease run time
#' # num_cores = 8 # Allow for parallel processing to decrease run time
#' iterlim = 5 # limit iterations for the purpose of this example
#' )
#'
Expand All @@ -71,22 +83,27 @@ est.incidence.by <- function(
verbose = FALSE,
print_graph = FALSE,
...) {
if (missing(strata)) {
warning(
"The `strata` argument to `est.incidence.by()` is missing.",
"\n\n If you do not want to stratify your data, ",
"consider using the `est.incidence()` function to simplify your code and avoid this warning.",
"\n\n Since the `strata` argument is empty, `est.incidence.by()` will return a `seroincidence` object, instead of a `seroincidence.by` object.\n"
)
}

strata_is_empty <-
missing(strata) ||
is.null(strata) ||
setequal(strata, NA) ||
setequal(strata, "")
is.null(strata) ||
setequal(strata, NA) ||
setequal(strata, "")

if (strata_is_empty) {
cli::cli_warn(
class = "strata_empty",
c(
"The {.arg strata} argument to {.fn est.incidence.by} is missing.",
"i" = "If you do not want to stratify your data,
consider using the {.fn est.incidence} function to
simplify your code and avoid this warning.",
"i" = "Since the {.arg strata} argument is empty,
{.fn est.incidence.by} will return a {.cls seroincidence} object,
instead of a {.cls seroincidence.by} object."
)
)

to_return <-
est.incidence(
pop_data = pop_data,
Expand All @@ -101,7 +118,7 @@ est.incidence.by <- function(
return(to_return)
}

.checkStrata(data = pop_data, strata = strata)
check_strata(pop_data, strata = strata)

.errorCheck(
data = pop_data,
Expand All @@ -110,7 +127,7 @@ est.incidence.by <- function(
)

# Split data per stratum
stratumDataList <- stratify_data(
stratum_data_list <- stratify_data(
antigen_isos = antigen_isos,
data = pop_data %>% filter(.data$antigen_iso %in% antigen_isos),
curve_params = curve_params %>% filter(.data$antigen_iso %in% antigen_isos),
Expand All @@ -120,18 +137,25 @@ est.incidence.by <- function(
noise_strata_varnames = noise_strata_varnames
)

strata_table <- stratumDataList %>% attr("strata")
strata_table <- stratum_data_list %>% attr("strata")

if (verbose) {
message("Data has been stratified.")
message("Here are the strata that will be analyzed:")
print(strata_table)
cli::cli_inform(
c(
"i" = "Data has been stratified.",
"i" = "Here are the strata that will be analyzed:",
""
),
body = strata_table |> capture.output()
)
}

if (num_cores > 1L && !requireNamespace("parallel", quietly = TRUE)) {
warning(
"The `parallel` package is not installed, so `num_cores > 1` has no effect.",
"To install `parallel`, run `install.packages('parallel')` in the console."
cli::cli_warn(
"The `parallel` package is not installed,
so `num_cores > 1` has no effect.",
"To install `parallel`, run `install.packages('parallel')`
in the console."
)
}

Expand All @@ -142,11 +166,11 @@ est.incidence.by <- function(
num_cores <- num_cores %>% check_parallel_cores()

if (verbose) {
message("Setting up parallel processing with `num_cores` = ", num_cores, ".")
cli::cli_inform("Setting up parallel processing with
`num_cores` = {num_cores}.")
}


libPaths <- .libPaths()
lib_paths <- .libPaths()
cl <-
num_cores %>%
parallel::makeCluster() %>%
Expand All @@ -155,17 +179,22 @@ est.incidence.by <- function(
parallel::stopCluster(cl)
})

parallel::clusterExport(cl, c("libPaths"), envir = environment())
# Export library paths to the cluster
parallel::clusterExport(cl, "lib_paths", envir = environment())

# Evaluate library loading on the cluster
parallel::clusterEvalQ(cl, {
.libPaths(libPaths)
require(serocalculator) # note - this gets out of sync when using load_all() in development
.libPaths(lib_paths)
# note - this gets out of sync when using load_all() in development
require(serocalculator)
require(dplyr)
})

{
# Perform parallel computation and record execution time
time <- system.time({
fits <- parallel::parLapplyLB(
cl = cl,
X = stratumDataList,
X = stratum_data_list,
fun = function(x) {
do.call(
what = est.incidence,
Expand All @@ -183,63 +212,59 @@ est.incidence.by <- function(
)
}
)
} %>% system.time() -> time
})

if (verbose) {
message("Elapsed time for parallelized code: ")
print(time)
cli::cli_inform(c("i" = "Elapsed time for parallelized code:"),
body = capture.output(time)
)
}
} else {
# fits <- lapply(
# X = stratumDataList,
# FUN = function(x) est.incidence(dataList = x, verbose = verbose, ...))

fits <- list()
# Time progress:
time <- system.time({
fits <- list() # Initialize an empty list for fits

{ # time progress

for (cur_stratum in names(stratumDataList))
{
cur_stratum_vars <-
strata_table %>%
for (cur_stratum in names(stratum_data_list)) {
cur_stratum_vars <- strata_table %>%
dplyr::filter(.data$Stratum == cur_stratum)

if (verbose) {
message("starting new stratum: ", cur_stratum)
cli::cli_inform("starting new stratum: {cur_stratum}")
print(cur_stratum_vars)
}

fits[[cur_stratum]] <-
do.call(
what = est.incidence,
args = c(
stratumDataList[[cur_stratum]],
list(
lambda_start = lambda_start,
antigen_isos = antigen_isos,
build_graph = build_graph,
print_graph = print_graph,
verbose = verbose,
...
)
fits[[cur_stratum]] <- do.call(
what = est.incidence,
args = c(
stratum_data_list[[cur_stratum]],
list(
lambda_start = lambda_start,
antigen_isos = antigen_isos,
build_graph = build_graph,
print_graph = print_graph,
verbose = verbose,
...
)
)
)
}
} %>% system.time() -> time
})

if (verbose) {
message("Elapsed time for loop over strata: ")
print(time)
cli::cli_inform(
c("i" = "Elapsed time for loop over strata: "),
body = capture.output(time)
)
}
}

incidenceData <- structure(
incidence_data <- structure(
fits,
antigen_isos = antigen_isos,
Strata = strata_table,
graphs_included = build_graph,
class = "seroincidence.by" %>% union(class(fits))
)

return(incidenceData)
return(incidence_data)
}
Loading

0 comments on commit 9d08110

Please sign in to comment.