insightsengineering · shajoezhu · Jan 5, 2024 · Jan 2, 2024 · Jan 2, 2024 · Jan 3, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -38,6 +38,7 @@ Imports:
     labeling,
     lifecycle (>= 0.2.0),
     magrittr (>= 1.5),
+    MASS (>= 7.3-60),
     methods,
     Rdpack (>= 2.4),
     rlang (>= 1.1.0),

diff --git a/NEWS.md b/NEWS.md
@@ -2,6 +2,7 @@
 
 ### New Features
 * Refactored `g_forest` to output a `ggplot` object instead of a `grob` object.
+* Added `h_glm_negbin` to the `h_glm_count` helpers to enable count data analysis using a negative binomial model (`distribution = "negbin"`).
 
 ### Bug Fixes
 * Fixed nested column split label overlay issue in `rtable2gg` to clean up appearance of text labels.

diff --git a/R/summarize_glm_count.R b/R/summarize_glm_count.R
@@ -131,6 +131,50 @@ h_glm_quasipoisson <- function(.var,
   )
 }
 
+#' @describeIn h_glm_count Helper function that fits a negative binomial model and
+#'   computes the estimated marginal means of the arm variable.
+#'
+#' @inheritParams summarize_glm_count
+#'
+#' @return
+#' * `h_glm_negbin()` returns a named list holding the fitted negative binomial model
+#'   (`glm_fit`) and the estimated marginal means by arm (`emmeans_fit`).
+#'
+#' @keywords internal
+h_glm_negbin <- function(.var,
+                         .df_row,
+                         variables,
+                         weights) {
+  # Only `arm` and `covariates` are read from `variables`; an `offset` element,
+  # if supplied, is not used by this helper.
+  arm_var <- variables$arm
+  covariate_terms <- paste(variables$covariates, collapse = " + ")
+
+  # Formula text has the shape "<.var> ~  + <covariates> + <arm>".
+  formula_text <- paste0(.var, " ~ ", " + ", covariate_terms, " + ", arm_var)
+  formula <- stats::as.formula(formula_text)
+
+  # Negative binomial regression with a log link.
+  glm_fit <- MASS::glm.nb(
+    formula = formula,
+    data = .df_row,
+    link = "log"
+  )
+
+  # Estimated marginal means per arm, requested on the response scale.
+  marginal_means <- emmeans::emmeans(
+    glm_fit,
+    specs = arm_var,
+    data = .df_row,
+    type = "response",
+    offset = 0,
+    weights = weights
+  )
+
+  list(glm_fit = glm_fit, emmeans_fit = marginal_means)
+}
+
+
 #' @describeIn h_glm_count Helper function to return the results of the
 #'   selected model (poisson, quasipoisson, negative binomial).
 #'
@@ -145,7 +189,7 @@ h_glm_quasipoisson <- function(.var,
 #'     `"X1"`), and/or interaction terms indicated by `"X1 * X2"`.
 #'   * `offset` (`numeric`)\cr a numeric vector or scalar adding an offset.
 #' @param distribution (`character`)\cr a character value specifying the distribution
-#'   used in the regression (poisson, quasipoisson).
+#'   used in the regression (`"poisson"`, `"quasipoisson"`, or `"negbin"` for negative binomial).
 #'
 #' @return
 #' * `h_glm_count()` returns the results of the selected model.
@@ -156,13 +200,11 @@ h_glm_count <- function(.var,
                         variables,
                         distribution,
                         weights) {
-  if (distribution == "negbin") {
-    stop("negative binomial distribution is not currently available.")
-  }
+  checkmate::assert_subset(distribution, c("poisson", "quasipoisson", "negbin"), empty.ok = FALSE)
   switch(distribution,
     poisson = h_glm_poisson(.var, .df_row, variables, weights),
     quasipoisson = h_glm_quasipoisson(.var, .df_row, variables, weights),
-    negbin = list() # h_glm_negbin(.var, .df_row, variables, weights) # nolint
+    negbin = h_glm_negbin(.var, .df_row, variables, weights)
   )
 }
 

diff --git a/man/h_glm_count.Rd b/man/h_glm_count.Rd
diff --git a/man/summarize_glm_count.Rd b/man/summarize_glm_count.Rd
diff --git a/tests/testthat/_snaps/summarize_glm_count.md b/tests/testthat/_snaps/summarize_glm_count.md
@@ -18,6 +18,31 @@
       2 ARM B 9.155436 0.5997925 Inf    1  33.80055 1.935734e-250
       3 ARM C 7.855107 0.5871181 Inf    1  27.57650 2.129731e-167
 
+# h_glm_poisson glm-fit works with healthy input with covariates
+
+    Code
+      res
+    Output
+           Estimate         SE    z_value           Pr                coefs
+      1  2.01065582 0.18541942 10.8438255 2.133586e-27          (Intercept)
+      2  0.07631174 0.17896220  0.4264126 6.698072e-01          REGION1Asia
+      3  0.64425750 0.22389462  2.8775033 4.008358e-03       REGION1Eurasia
+      4  2.13096720 0.36521976  5.8347533 5.387022e-09        REGION1Europe
+      5 -0.07449500 0.20314837 -0.3667024 7.138410e-01 REGION1North America
+      6  0.38101695 0.21554753  1.7676703 7.711605e-02 REGION1South America
+      7  0.11047866 0.09872549  1.1190490 2.631192e-01        ARMB: Placebo
+      8 -0.17694419 0.10873176 -1.6273459 1.036637e-01    ARMC: Combination
+
+# h_glm_poisson emmeans-fit works with healthy input with covariates
+
+    Code
+      res
+    Output
+        ARMCD     rate std.error  df null statistic       p.value
+      1 ARM A 12.64167 1.2378669 Inf    1  25.90902 5.270655e-148
+      2 ARM B 14.11838 1.2848735 Inf    1  29.09088 4.682722e-186
+      3 ARM C 10.59153 0.9708089 Inf    1  25.74821 3.375733e-146
+
 # h_glm_quasipoisson glm-fit works with healthy input
 
     Code
@@ -43,6 +68,31 @@
       2     B: Placebo 14.11838  5.392442 Inf    1  6.931571 4.161914e-12
       3 C: Combination 10.59153  4.074355 Inf    1  6.135104 8.510352e-10
 
+# h_glm_negbin glm-fit works with healthy input
+
+    Code
+      res
+    Output
+           Estimate        SE    z_value           Pr                coefs
+      1 1.005041594 0.1992268 5.04471149 4.542062e-07          (Intercept)
+      2 0.007741431 0.1919877 0.04032253 9.678360e-01          REGION1Asia
+      3 0.317703043 0.2360653 1.34582686 1.783584e-01       REGION1Eurasia
+      4 0.591541717 0.4058327 1.45759983 1.449509e-01        REGION1Europe
+      5 0.117240049 0.2196300 0.53380718 5.934749e-01 REGION1North America
+      6 0.139971334 0.2348685 0.59595610 5.512046e-01 REGION1South America
+      7 0.113082781 0.1056295 1.07056107 2.843668e-01        ARMB: Placebo
+      8 0.026817451 0.1131811 0.23694292 8.127011e-01    ARMC: Combination
+
+# h_glm_negbin emmeans-fit works with healthy input
+
+    Code
+      res
+    Output
+                   ARM response std.error  df null statistic      p.value
+      1      A: Drug X 3.322579 0.3367532 Inf    1  11.84712 2.227054e-32
+      2     B: Placebo 3.720373 0.3782682 Inf    1  12.92183 3.390023e-38
+      3 C: Combination 3.412887 0.3424577 Inf    1  12.23369 2.054037e-34
+
 # h_glm_count glm-fit works with healthy input
 
     Code

diff --git a/tests/testthat/test-summarize_glm_count.R b/tests/testthat/test-summarize_glm_count.R
@@ -53,6 +53,43 @@ testthat::test_that("h_glm_poisson fails wrong inputs", {
   )
 })
 
+testthat::test_that("h_glm_poisson glm-fit works with healthy input with covariates", {
+  # Keep only the rows of `tern_ex_adtte` with `PARAMCD == "TNE"`.
+  anl <- filter(tern_ex_adtte, PARAMCD == "TNE")
+  anl[["AVAL_f"]] <- as.factor(anl[["AVAL"]])
+
+  fit <- h_glm_poisson(
+    .var = "AVAL",
+    .df_row = anl,
+    variables = list(arm = "ARM", offset = "lgTMATRSK", covariates = c("REGION1"))
+  )
+
+  # Coefficient table with the term names moved from the row names into `coefs`.
+  coef_tbl <- as.data.frame(summary(fit$glm_fit)$coefficients)
+  coef_tbl[["coefs"]] <- rownames(coef_tbl)
+  row.names(coef_tbl) <- NULL
+  colnames(coef_tbl) <- c("Estimate", "SE", "z_value", "Pr", "coefs")
+
+  # The snapshot stores the expression text, so the snapshotted object stays named `res`.
+  res <- testthat::expect_silent(coef_tbl)
+  testthat::expect_snapshot(res)
+})
+
+testthat::test_that("h_glm_poisson emmeans-fit works with healthy input with covariates", {
+  # Keep only the rows of `tern_ex_adtte` with `PARAMCD == "TNE"`.
+  anl <- filter(tern_ex_adtte, PARAMCD == "TNE")
+  anl[["AVAL_f"]] <- as.factor(anl[["AVAL"]])
+
+  # The Poisson fit is obtained through the `h_glm_count()` dispatcher.
+  fit <- h_glm_count(
+    .var = "AVAL",
+    .df_row = anl,
+    variables = list(arm = "ARMCD", offset = "lgTMATRSK", covariates = c("REGION1")),
+    distribution = "poisson"
+  )
+  emm_tbl <- as.data.frame(broom::tidy(fit$emmeans_fit))
+
+  # The snapshot stores the expression text, so the snapshotted object stays named `res`.
+  res <- testthat::expect_silent(emm_tbl)
+  testthat::expect_snapshot(res)
+})
+
 testthat::test_that("h_glm_quasipoisson glm-fit works with healthy input", {
   anl <- tern_ex_adtte %>%
     filter(PARAMCD == "TNE")
@@ -107,6 +144,60 @@ testthat::test_that("h_glm_quasipoisson fails wrong inputs", {
   )
 })
 
+testthat::test_that("h_glm_negbin glm-fit works with healthy input", {
+  # Keep only the rows of `tern_ex_adtte` with `PARAMCD == "TNE"`.
+  anl <- filter(tern_ex_adtte, PARAMCD == "TNE")
+  anl[["AVAL_f"]] <- as.factor(anl[["AVAL"]])
+
+  fit <- h_glm_negbin(
+    .var = "AVAL",
+    .df_row = anl,
+    variables = list(arm = "ARM", offset = "lgTMATRSK", covariates = c("REGION1"))
+  )
+
+  # Coefficient table with the term names moved from the row names into `coefs`.
+  coef_tbl <- as.data.frame(summary(fit$glm_fit)$coefficients)
+  coef_tbl[["coefs"]] <- rownames(coef_tbl)
+  row.names(coef_tbl) <- NULL
+  colnames(coef_tbl) <- c("Estimate", "SE", "z_value", "Pr", "coefs")
+
+  # The snapshot stores the expression text, so the snapshotted object stays named `res`.
+  res <- testthat::expect_silent(coef_tbl)
+  testthat::expect_snapshot(res)
+})
+
+testthat::test_that("h_glm_negbin emmeans-fit works with healthy input", {
+  # Keep only the rows of `tern_ex_adtte` with `PARAMCD == "TNE"`.
+  anl <- filter(tern_ex_adtte, PARAMCD == "TNE")
+  anl[["AVAL_f"]] <- as.factor(anl[["AVAL"]])
+
+  fit <- h_glm_negbin(
+    .var = "AVAL",
+    .df_row = anl,
+    variables = list(arm = "ARM", offset = "lgTMATRSK", covariates = c("REGION1"))
+  )
+  emm_tbl <- as.data.frame(broom::tidy(fit$emmeans_fit))
+
+  # The snapshot stores the expression text, so the snapshotted object stays named `res`.
+  res <- testthat::expect_silent(emm_tbl)
+  testthat::expect_snapshot(res)
+})
+
+testthat::test_that("h_glm_negbin fails wrong inputs", {
+  # Build the analysis data inside the test so it does not depend on an `anl`
+  # object defined outside this block. If `anl` were missing, both expectations
+  # would pass on an "object not found" error for `anl` itself instead of on the
+  # invalid variable names under test.
+  anl <- tern_ex_adtte %>%
+    filter(PARAMCD == "TNE")
+
+  # Response variable that is not a column of the data.
+  # The error is required to mention the offending name so the test cannot
+  # succeed on an unrelated failure.
+  testthat::expect_error(
+    h_glm_negbin(
+      .var = "wrong.var",
+      .df_row = anl,
+      variables = list(arm = "ARM", offset = "lgTMATRSK", covariates = NULL)
+    ),
+    regexp = "wrong.var",
+    fixed = TRUE
+  )
+
+  # Covariate that is not a column of the data.
+  testthat::expect_error(
+    h_glm_negbin(
+      .var = "AVAL",
+      .df_row = anl,
+      variables = list(arm = "ARM", offset = "lgTMATRSK", covariates = c("wrong.var"))
+    ),
+    regexp = "wrong.var",
+    fixed = TRUE
+  )
+})
+
 testthat::test_that("h_glm_count glm-fit works with healthy input", {
   anl <- tern_ex_adtte %>%
     filter(PARAMCD == "TNE")
@@ -258,20 +349,6 @@ testthat::test_that("s_glm_count fails wrong inputs", {
   ))
 })
 
-testthat::test_that("glm_count fails when negative binomial distribution is selected.", {
-  testthat::expect_error(glm_count(
-    df = anl %>%
-      filter(ARMCD == "ARM B"),
-    .df_row = anl,
-    .var = "AVAL",
-    .in_ref_col = FALSE,
-    variables = list(arm = "ARMCD", offset = "lgTMATRSK", covariates = c("REGION1")),
-    conf_level = 0.95,
-    distribution = "negbin",
-    rate_mean_method = "ppmeans"
-  ))
-})
-
 testthat::test_that("summarize_glm_count works with healthy inputs", {
   anl <- tern_ex_adtte %>%
     filter(PARAMCD == "TNE")