Merge pull request #257 from Gilead-BioStats/release-v0.2.0
Release v0.2.0
jwildfire authored Mar 15, 2022
2 parents 178814e + 353d221 commit 9a2d654
Showing 75 changed files with 2,056 additions and 2,750 deletions.
4 changes: 3 additions & 1 deletion DESCRIPTION
@@ -1,6 +1,6 @@
Package: gsm
Title: Gilead Statistical Monitoring
Version: 0.1.0
Version: 0.2.0
Authors@R: c(
person("George", "Wu", email="george.wu@gilead.com", role = c("aut", "cre")),
person("Jeremy", "Wildfire", email="jeremy.wildfire@gilead.com", role = c("aut")))
@@ -11,6 +11,8 @@ Imports:
broom,
dplyr,
lubridate,
ggplot2,
lamW,
magrittr,
purrr,
tidyr
8 changes: 7 additions & 1 deletion NAMESPACE
@@ -7,6 +7,7 @@ export(AE_Map_Raw)
export(Analyze_Chisq)
export(Analyze_Fisher)
export(Analyze_Poisson)
export(Analyze_Poisson_PredictBounds)
export(Analyze_Wilcoxon)
export(Consent_Assess)
export(Consent_Map_Raw)
@@ -18,10 +19,14 @@ export(PD_Assess)
export(PD_Map_Raw)
export(Summarize)
export(Transform_EventCount)
export(Visualize_Count)
export(Visualize_Scatter)
import(dplyr)
import(lubridate)
import(ggplot2)
importFrom(broom,augment)
importFrom(broom,glance)
importFrom(lamW,lambertW0)
importFrom(lamW,lambertWm1)
importFrom(magrittr,"%>%")
importFrom(purrr,map)
importFrom(purrr,map_df)
@@ -33,5 +38,6 @@ importFrom(stats,median)
importFrom(stats,offset)
importFrom(stats,pnorm)
importFrom(stats,poisson)
importFrom(stats,reorder)
importFrom(stats,wilcox.test)
importFrom(tidyr,unnest)
29 changes: 16 additions & 13 deletions R/AE_Assess.R
@@ -13,6 +13,7 @@
#' - `SiteID` - Site ID
#' - `Count` - Number of Adverse Events
#' - `Exposure` - Number of days of exposure
#' - `Rate` - Rate of Adverse Events (Count / Exposure)
#'
#' The Assessment
#' - \code{\link{Transform_EventCount}} creates `dfTransformed`.
@@ -22,37 +23,38 @@
#'
#' @section Statistical Assumptions:
#'
#' A Poisson or Wilcoxon model is used to generate estimates and p-values for each site (as specified with the `cMethod` parameter). Those model outputs are then used to flag possible outliers using the thresholds specified in `vThreshold`. In the Poisson model, sites with an estimand less than -5 are flagged as -1 and greater than 5 are flagged as 1 by default. For Wilcoxon, sites with p-values less than 0.0001 are flagged by default.
#' A Poisson or Wilcoxon model is used to generate estimates and p-values for each site (as specified with the `strMethod` parameter). Those model outputs are then used to flag possible outliers using the thresholds specified in `vThreshold`. In the Poisson model, sites with an estimand less than -5 are flagged as -1 and greater than 5 are flagged as 1 by default. For Wilcoxon, sites with p-values less than 0.0001 are flagged by default.
#'
#' See \code{\link{Analyze_Poisson}} and \code{\link{Analyze_Wilcoxon}} for additional details about the statistical methods and their assumptions.
#'
#' @param dfInput input data with one record per person and the following required columns: SubjectID, SiteID, Count, Exposure
#' @param vThreshold numeric vector with 2 threshold values. Defaults to c(-5,5) for strMethod = "poisson" and c(0.0001,NA) for strMethod = "wilcoxon".
#' @param cLabel Assessment label
#' @param cMethod valid methods are "poisson" (the default), or "wilcoxon"
#' @param strLabel Assessment label
#' @param strMethod valid methods are "poisson" (the default), or "wilcoxon"
#' @param bDataList Should all assessment datasets be returned as a list? If False (the default), only the Summary data frame is returned
#'
#' @examples
#' dfInput <- AE_Map_Adam( safetyData::adam_adsl, safetyData::adam_adae )
#' SafetyAE <- AE_Assess( dfInput )
#' SafetyAE_Wilk <- AE_Assess( dfInput, cMethod="wilcoxon")
#' SafetyAE_Wilk <- AE_Assess( dfInput, strMethod="wilcoxon")
#'
#' @return If `bDataList` is false (the default), the summary data frame (`dfSummary`) is returned. If `bDataList` is true, a list containing all data in the standard data pipeline (`dfInput`, `dfTransformed`, `dfAnalyzed`, `dfFlagged` and `dfSummary`) is returned.
#'
#' @export

AE_Assess <- function( dfInput, vThreshold=NULL, cLabel="", cMethod="poisson",bDataList=FALSE){
AE_Assess <- function( dfInput, vThreshold=NULL, strLabel="", strMethod="poisson",bDataList=FALSE){
stopifnot(
"dfInput is not a data.frame" = is.data.frame(dfInput),
"cLabel is not character" = is.character(cLabel),
"cMethod is not 'poisson' or 'wilcoxon'" = cMethod %in% c("poisson","wilcoxon"),
"strLabel is not character" = is.character(strLabel),
"strMethod is not 'poisson' or 'wilcoxon'" = strMethod %in% c("poisson","wilcoxon"),
"bDataList is not logical" = is.logical(bDataList),
"One or more of these columns: SubjectID, SiteID, Count, Exposure, and Rate not found in dfInput"=all(c("SubjectID","SiteID", "Count","Exposure", "Rate") %in% names(dfInput))
"One or more of these columns: SubjectID, SiteID, Count, Exposure, and Rate not found in dfInput"=all(c("SubjectID","SiteID", "Count","Exposure", "Rate") %in% names(dfInput)),
"strMethod must be length 1" = length(strMethod) == 1
)
lAssess <- list()
lAssess$dfInput <- dfInput
lAssess$dfTransformed <- gsm::Transform_EventCount( lAssess$dfInput, cCountCol = 'Count', cExposureCol = "Exposure" )
if(cMethod == "poisson"){
lAssess$dfTransformed <- gsm::Transform_EventCount( lAssess$dfInput, strCountCol = 'Count', strExposureCol = "Exposure" )
if(strMethod == "poisson"){
if(is.null(vThreshold)){
vThreshold = c(-5,5)
}else{
@@ -64,7 +66,9 @@ AE_Assess <- function( dfInput, vThreshold=NULL, cLabel="", cMethod="poisson",bD
}
lAssess$dfAnalyzed <- gsm::Analyze_Poisson( lAssess$dfTransformed)
lAssess$dfFlagged <- gsm::Flag( lAssess$dfAnalyzed , strColumn = 'Residuals', vThreshold =vThreshold)
} else if(cMethod=="wilcoxon"){
lAssess$dfSummary <- gsm::Summarize( lAssess$dfFlagged, strScoreCol = 'Residuals', strAssessment="Safety", strLabel= strLabel)

} else if(strMethod=="wilcoxon"){
if(is.null(vThreshold)){
vThreshold = c(0.0001,NA)
}else{
@@ -77,10 +81,9 @@ AE_Assess <- function( dfInput, vThreshold=NULL, cLabel="", cMethod="poisson",bD
}
lAssess$dfAnalyzed <- gsm::Analyze_Wilcoxon( lAssess$dfTransformed)
lAssess$dfFlagged <- gsm::Flag( lAssess$dfAnalyzed , strColumn = 'PValue', vThreshold =vThreshold, strValueColumn = 'Estimate')
lAssess$dfSummary <- gsm::Summarize( lAssess$dfFlagged, strAssessment="Safety", strLabel= strLabel)
}

lAssess$dfSummary <- gsm::Summarize( lAssess$dfFlagged, cAssessment="Safety", cLabel= cLabel)

if(bDataList){
return(lAssess)
} else {
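The roxygen above documents the renamed arguments (cLabel to strLabel, cMethod to strMethod) and the standard data pipeline (Transform_EventCount, Analyze_*, Flag, Summarize). A minimal usage sketch, assuming the safetyData package used in the examples is installed; the label and the Wilcoxon threshold below are illustrative values, not package defaults:

library(gsm)

dfInput <- AE_Map_Adam(safetyData::adam_adsl, safetyData::adam_adae)

# Default Poisson method with default residual thresholds c(-5, 5)
dfSummary <- AE_Assess(dfInput)

# Wilcoxon method; c(0.01, NA) is an illustrative p-value threshold
dfSummaryWilcoxon <- AE_Assess(dfInput, vThreshold = c(0.01, NA), strMethod = "wilcoxon")

# Return the full pipeline: dfInput, dfTransformed, dfAnalyzed, dfFlagged, dfSummary
lAssess <- AE_Assess(dfInput, strLabel = "Safety AE Assessment", bDataList = TRUE)
names(lAssess)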
7 changes: 4 additions & 3 deletions R/AE_Map_Raw.R
@@ -20,8 +20,8 @@
#'
#' Note that the function can generate data summaries for specific types of AEs by passing filtered AE data to dfAE.
#'
#' @param dfAE AE dataset with columns SUBJID and rows for each AE record
#' @param dfRDSL Subject-level Raw Data (RDSL) required columns: SubjectID, SiteID, value specified in strExposureCol
#' @param dfAE AE dataset with required column SUBJID and rows for each AE record
#' @param dfRDSL Subject-level Raw Data (RDSL) with required columns: SubjectID, SiteID, value specified in strExposureCol
#' @param strExposureCol Name of exposure column. 'TimeOnTreatment' by default
#'
#' @return Data frame with one record per person with columns: SubjectID, SiteID, Count (number of AEs), Exposure (Time on Treatment in Days), Rate (AE/Day)
@@ -40,7 +40,8 @@ AE_Map_Raw <- function( dfAE, dfRDSL, strExposureCol="TimeOnTreatment"){
"SUBJID column not found in dfAE"="SUBJID" %in% names(dfAE),
"strExposureCol is not character"=is.character(strExposureCol),
"SubjectID, SiteID and strExposureCol columns not found in dfRDSL"=all(c("SubjectID","SiteID",strExposureCol) %in% names(dfRDSL)),
"NAs found in Subject ID column of dfAE" = all(!is.na(dfAE$SUBJID))
"NAs found in SUBJID column of dfAE" = all(!is.na(dfAE$SUBJID)),
"NAs found in Subject ID column of dfRDSL" = all(!is.na(dfRDSL$SubjectID))
)

dfInput <- dfRDSL %>%
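A minimal sketch of the AE_Map_Raw() interface documented above, using made-up input data; only the required columns (SUBJID in dfAE; SubjectID, SiteID, and the exposure column in dfRDSL) are shown, and the values are illustrative only:

# One row per AE record; SUBJID is the only required column in dfAE
dfAE <- data.frame(SUBJID = c("001", "001", "002"))

# Subject-level RDSL data with the required SubjectID, SiteID, and exposure columns
dfRDSL <- data.frame(
  SubjectID = c("001", "002", "003"),
  SiteID = c("S01", "S01", "S02"),
  TimeOnTreatment = c(120, 90, 45)
)

# One record per subject with Count, Exposure, and Rate (Count / Exposure)
dfInput <- AE_Map_Raw(dfAE, dfRDSL, strExposureCol = "TimeOnTreatment")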
2 changes: 1 addition & 1 deletion R/Analyze_Chisq.R
@@ -29,7 +29,7 @@
#'
#' @examples
#' dfInput <- Disp_Map(dfDisp = safetyData::adam_adsl, strCol = "DCREASCD",strReason = "Adverse Event")
#' dfTransformed <- Transform_EventCount( dfInput, cCountCol = 'Count' )
#' dfTransformed <- Transform_EventCount( dfInput, strCountCol = 'Count' )
#' dfAnalyzed <- Analyze_Chisq( dfTransformed )
#'
#' @export
22 changes: 13 additions & 9 deletions R/Analyze_Fisher.R
@@ -2,7 +2,7 @@
#'
#' Creates Analysis results data for count data using the Fisher's exact test
#'
#' @details
#' @details
#'
#' Analyzes count data using the Fisher's exact test
#'
@@ -12,16 +12,17 @@
#'
#' @section Data Specification:
#'
#' The input data (` dfTransformed`) for the Analyze_Fisher is typically created using \code{\link{Transform_EventCount}} and should be one record per Site with columns for:
#' The input data (`dfTransformed`) for Analyze_Fisher is typically created using \code{\link{Transform_EventCount}} and should be one record per site with required columns for:
#' - `SiteID` - Site ID
#' - `N` - Total number of participants at site
#' - `Count` - Total number of participants at site with event of interest
#' - `TotalCount` - Total number of participants at site with event of interest
#'
#'
#' @param dfTransformed data.frame in format produced by \code{\link{Transform_EventCount}}
#' @param strOutcome required, name of column in dfTransformed dataset to perform Fisher test on
#' @param strOutcome required, name of column in dfTransformed dataset to perform Fisher test on. Default is "TotalCount".
#'
#' @importFrom stats fisher.test as.formula
#' @import dplyr
#' @importFrom stats fisher.test
#' @importFrom purrr map
#' @importFrom broom glance
#' @importFrom tidyr unnest
@@ -30,16 +31,19 @@
#'
#' @examples
#' dfInput <- Disp_Map(dfDisp = safetyData::adam_adsl, strCol = "DCREASCD",strReason = "Adverse Event")
#' dfTransformed <- Transform_EventCount( dfInput, cCountCol = 'Count' )
#' dfTransformed <- Transform_EventCount( dfInput, strCountCol = 'Count' )
#' dfAnalyzed <- Analyze_Fisher( dfTransformed )
#'
#' @export

Analyze_Fisher <- function( dfTransformed , strOutcome = "TotalCount") {

stopifnot(
is.data.frame(dfTransformed),
all(c("SiteID", "N", strOutcome) %in% names(dfTransformed))
"dfTransformed is not a data.frame" = is.data.frame(dfTransformed),
"One or more of these columns: SiteID, N, or the value in strOutcome not found in dfTransformed" = all(c("SiteID", "N", strOutcome) %in% names(dfTransformed)),
"NA value(s) found in SiteID" = all(!is.na(dfTransformed[["SiteID"]])),
"strOutcome must be length 1" = length(strOutcome) == 1,
"strOutcome is not character" = is.character(strOutcome)
)

fisher_model<- function(site){
@@ -58,7 +62,7 @@ Analyze_Fisher <- function( dfTransformed , strOutcome = "TotalCount") {
dfAnalyzed <- dfTransformed %>%
mutate(model = map(.data$SiteID, fisher_model)) %>%
mutate(summary = map(.data$model, broom::glance)) %>%
unnest(summary) %>%
tidyr::unnest(summary) %>%
rename(
Estimate = .data$estimate,
PValue = .data[['p.value']]
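A minimal sketch of the updated Analyze_Fisher() interface, following the roxygen example above and assuming safetyData is installed; the last call illustrates one of the new stopifnot() messages:

dfInput <- Disp_Map(dfDisp = safetyData::adam_adsl, strCol = "DCREASCD", strReason = "Adverse Event")
dfTransformed <- Transform_EventCount(dfInput, strCountCol = "Count")

# Default outcome column ("TotalCount")
dfAnalyzed <- Analyze_Fisher(dfTransformed)

# The stricter checks now fail fast with readable messages, e.g. a length-2
# strOutcome stops with "strOutcome must be length 1"
try(Analyze_Fisher(dfTransformed, strOutcome = c("TotalCount", "N")))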
60 changes: 0 additions & 60 deletions R/Analyze_Poisson.R

This file was deleted.

60 changes: 60 additions & 0 deletions R/Analyze_Poisson_PredictBounds.R
@@ -0,0 +1,60 @@
#' Poisson Analysis - Predicted Boundaries
#'
#' @details
#'
#' Fits a Poisson model to site-level data and then calculates predicted count values and upper and lower bounds across the full range of exposure values.
#'
#' @section Statistical Methods:
#'
#' This function fits a Poisson model to site-level data and then calculates residuals for each site. The Poisson model is run using standard methods in the `stats` package by fitting a `glm` model with family set to `poisson` using a "log" link. Upper and lower boundary values are then calculated using the method described here TODO: Add link. In short, the residual thresholds in `vThreshold` are converted to count boundaries at each exposure value using the Lambert W function.
#'
#' @section Data Specification:
#'
#' The input data (`dfTransformed`) for Analyze_Poisson_PredictBounds is typically created using \code{\link{Transform_EventCount}} and should be one record per site with columns for:
#' - `SubjectID` - Unique subject ID
#' - `SiteID` - Site ID
#' - `TotalCount` - Number of Events
#' - `TotalExposure` - Number of days of exposure
#'
#' @param dfTransformed data.frame in format produced by \code{\link{Transform_EventCount}}. Must include SubjectID, SiteID, TotalCount and TotalExposure.
#' @param vThreshold upper and lower boundaries in residual space. Should be identical to the thresholds used in AE_Assess().
#'
#' @importFrom stats glm offset poisson pnorm
#' @importFrom broom augment
#' @importFrom lamW lambertW0 lambertWm1
#'
#' @return data frame containing predicted boundary values with upper and lower bounds across the range of observed values
#'
#' @examples
#' dfInput <- AE_Map_Adam( safetyData::adam_adsl, safetyData::adam_adae )
#' dfTransformed <- Transform_EventCount( dfInput, strCountCol = 'Count', strExposureCol = "Exposure" )
#' dfBounds <- Analyze_Poisson_PredictBounds(dfTransformed, c(-5,5))
#'
#' @export
Analyze_Poisson_PredictBounds <- function( dfTransformed, vThreshold=c(-5,5)){
dfTransformed$LogExposure <- log(dfTransformed$TotalExposure)
cModel <- glm(
TotalCount ~ stats::offset(LogExposure),
family=poisson(link="log"),
data=dfTransformed
)

dfBounds <- data.frame(
LogExposure = seq(
min(dfTransformed$LogExposure)-0.05,
max(dfTransformed$LogExposure)+0.05,
by=0.05
)) %>%
mutate( vMu = as.numeric( exp( .data$LogExposure * cModel$coefficients[2] + cModel$coefficients[1] ))) %>%
mutate( vWHi = (vThreshold[2]^2 - 2 * .data$vMu) / ( 2 * exp(1) * .data$vMu )) %>%
mutate( vWLo = (vThreshold[1]^2 - 2 * .data$vMu) / ( 2 * exp(1) * .data$vMu )) %>%
mutate( PredictYHigh = ( vThreshold[2]^2-2* .data$vMu) / (2*lamW::lambertW0( .data$vWHi ))) %>%
mutate( PredictYLo = ( vThreshold[1]^2-2* .data$vMu) / (2*lamW::lambertWm1( .data$vWLo ))) %>%
mutate( MeanCount = exp( .data$LogExposure * cModel$coefficients[2] + cModel$coefficients[1])) %>%
mutate( LowerCount = if_else(is.nan( .data$PredictYLo ), 0, .data$PredictYLo )) %>%
mutate( UpperCount = if_else(is.nan( .data$PredictYHigh ), 0, .data$PredictYHigh )) %>%
select( .data$LogExposure, .data$MeanCount, .data$LowerCount, .data$UpperCount )


return( dfBounds )
}
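The reference for the boundary calculation is still a TODO in the roxygen above, but the computation is consistent with inverting the Poisson deviance at a fixed residual threshold T: solving 2*(y*log(y/mu) - (y - mu)) = T^2 for the count y at a fitted mean mu gives y = (T^2 - 2*mu) / (2 * W((T^2 - 2*mu) / (2*exp(1)*mu))), where W is the Lambert W function (the principal branch lambertW0 for the upper bound, the lambertWm1 branch for the lower bound), hence the lamW imports. A plotting sketch under that reading, assuming safetyData is available; the ggplot2 code is illustrative only and is not the package's Visualize_* implementation:

library(ggplot2)

dfInput <- AE_Map_Adam(safetyData::adam_adsl, safetyData::adam_adae)
dfTransformed <- Transform_EventCount(dfInput, strCountCol = "Count", strExposureCol = "Exposure")
dfBounds <- Analyze_Poisson_PredictBounds(dfTransformed, vThreshold = c(-5, 5))

# Overlay the predicted mean and bounds (returned on the log-exposure scale) on site-level data
ggplot() +
  geom_line(data = dfBounds, aes(x = exp(LogExposure), y = MeanCount)) +
  geom_line(data = dfBounds, aes(x = exp(LogExposure), y = LowerCount), linetype = "dashed") +
  geom_line(data = dfBounds, aes(x = exp(LogExposure), y = UpperCount), linetype = "dashed") +
  geom_point(data = dfTransformed, aes(x = TotalExposure, y = TotalCount)) +
  labs(x = "Total exposure (days)", y = "Event count")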