Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor Flag functions #1920

Merged
merged 17 commits into from
Jan 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ export(CombineSpecs)
export(FilterByFlags)
export(FilterByLatestSnapshotDate)
export(Flag)
export(Flag_Fisher)
export(Flag_NormalApprox)
export(Flag_Poisson)
export(GetStrFunctionIfNamespaced)
Expand All @@ -37,7 +36,6 @@ export(MakeParamLabelsList)
export(MakeStudyInfo)
export(MakeWideGroups)
export(MakeWorkflowList)
export(Make_Timeline)
export(ParseThreshold)
export(RenderRmd)
export(Report_FlagOverTime)
Expand Down
123 changes: 75 additions & 48 deletions R/Flag.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,76 +29,103 @@
#'
#' @param dfAnalyzed `data.frame` where flags should be added.
#' @param strColumn `character` Name of the column to use for thresholding. Default: `"Score"`
#' @param vThreshold `numeric` Vector of 2 numeric values representing lower and upper threshold values. All
#' values in `strColumn` are compared to `vThreshold` using strict comparisons. Values less than
#' the lower threshold or greater than the upper threshold are flagged. Values equal to the
#' threshold values are set to 0 (i.e., not flagged). If NA is provided for either threshold value,
#' it is ignored and no values are flagged based on the threshold. NA and NaN values in `strColumn`
#' are given NA flag values.
#' @param strValueColumn `character` Name of the column to use for sign of `Flag.` If the value for
#' that row is higher than the median of `strValueColumn`, then `Flag` is set to 1. Similarly, if
#' the value for that row is lower than the median of `strValueColumn`, then Flag is set to -1.
#'
#' @return `data.frame` with one row per site with columns: `GroupID`, `TotalCount`, `Metric`, `Score`, `Flag`
#' @param vThreshold `numeric` Vector of numeric values representing threshold values. Default is `c(-3,-2,2,3)` which is typical for z-scores.
#' @param vFlag `numeric` Vector of flag values. There must be one more item in Flag than thresholds - that is `length(vThreshold)+1 == length(vFlagValues)`. Default is `c(-2,-1,0,1,2)`, which is typical for z-scores.
#' @param vFlagOrder `numeric` Vector of ordered flag values. Output data.frame will be sorted based on flag column using the order provided. NULL (or values that don't match vFlag) will leave the data unsorted. Must have identical values to vFlag. Default is `c(2,-2,1,-1,0)` which puts largest z-score outliers first in the data set.
#'
#' @examples
#' dfTransformed <- Transform_Count(analyticsInput, strCountCol = "Numerator")
#' @return `data.frame` dfAnalyzed is returned with an additional `Flag` column.
#'
#' dfAnalyzed <- Analyze_Identity(dfTransformed)
#' @examples
#'
#' dfFlagged <- Flag(dfAnalyzed, vThreshold = c(0.001, 0.01))
#' dfTransformed <- Transform_Rate(analyticsInput)
#' dfAnalyzed <- Analyze_NormalApprox(dfTransformed)
#' dfFlagged <- Flag(dfAnalyzed)
#'
#' @export

Flag <- function(
dfAnalyzed,
strColumn = "Score",
vThreshold = NULL,
strValueColumn = NULL
vThreshold = c(-3,-2,2,3),
vFlag = c(-2,-1,0,1,2),
vFlagOrder = c(2,-2,1,-1,0)
) {

stop_if(cnd = !is.data.frame(dfAnalyzed), message = "dfAnalyzed is not a data frame")
stop_if(cnd = !is.character(strColumn), message = "strColumn is not character")
stop_if(cnd = !is.numeric(vThreshold), message = "vThreshold is not numeric")
stop_if(cnd = length(vThreshold) != 2, message = "vThreshold must be length of 2")
stop_if(cnd = !all(vThreshold == sort(vThreshold)), message = "vThreshold is not in ascending order")
stop_if(cnd = is.null(vThreshold), message = "vThreshold cannot be NULL")
stop_if(cnd = length(strColumn) != 1, message = "strColumn must be length of 1")
stop_if(cnd = !(strColumn %in% names(dfAnalyzed)), message = "strColumn not found in dfAnalyzed")
stop_if(cnd = !("GroupID" %in% names(dfAnalyzed)), message = "GroupID not found in dfAnalyzed")
stop_if(cnd = !is.numeric(vFlag), message = "vFlag must be numeric")
stop_if(cnd = length(vFlag) != length(vThreshold)+1, message = "Improper number of Flag values provided")
stop_if(cnd = !is.numeric(vFlagOrder) & !is.null(vFlagOrder), message = "vFlagOrder must be numeric or NULL")

if (all(!is.na(vThreshold))) {
stop_if(cnd = vThreshold[2] <= vThreshold[1], "vThreshold must contain a minimum and maximum value (i.e., vThreshold = c(1, 2))")
}
dfFlagged <- dfAnalyzed

# Flag values outside the specified threshold.
dfFlagged <- dfAnalyzed %>%
mutate(
Flag = case_when(
!is.na(vThreshold[1]) & (.data[[strColumn]] < vThreshold[1]) ~ -1,
!is.na(vThreshold[2]) & (.data[[strColumn]] > vThreshold[2]) ~ 1,
!is.na(.data[[strColumn]]) & !is.nan(.data[[strColumn]]) ~ 0
# generate flag values for dfAnalyzed[strColumn] based on vThresold and vFlag
dfFlagged$Flag <- cut(
dfFlagged[[strColumn]],
breaks = c(-Inf, vThreshold, Inf),
labels = vFlag,
right = FALSE
) %>% as.character() %>% as.numeric() #Parse from factor to numeric

# Apply custom sort order using vFlagOrder
if(!is.null(vFlagOrder)){
#all values in vFlag should be included in vFlagOrder
if(identical(sort(vFlag), sort(vFlagOrder))){
dfFlagged <- dfFlagged %>% arrange(match(.data$Flag, vFlagOrder))
LogMessage(
level = "info",
message = "Sorted dfFlagged using custom Flag order: {vFlagOrder}.",
cli_detail = "alert_info"
)
} else {
LogMessage(
level = "info",
message = "Mismatch in vFlagOrder and vFlag values. Aborting Sort and returning unsorted data.",
cli_detail = "alert_info"
)
}
}

# If strValueColumn is supplied, it can only affect sign of Flag (1 or -1).
if (!is.null(strValueColumn)) {
stop_if(cnd = !(strValueColumn %in% names(dfAnalyzed)), message = "strValueColumn not found in dfAnalyzed")
return(dfFlagged)
}

#' Flag_NormalApprox
#'
#' #' @description
#' `r lifecycle::badge("stable")`
#'
#' Alias for `Flag()`
#'
#' @param dfAnalyzed `data.frame` where flags should be added.
#' @param strColumn `character` Name of the column to use for thresholding. Default: `"Score"`
#' @param vThreshold `numeric` Vector of numeric values representing threshold values. Default is `c(-3,-2,2,3)` which is typical for z-scores.
#' @param vFlag `numeric` Vector of flag values. There must be one more item in Flag than thresholds - that is `length(vThreshold)+1 == length(vFlagValues)`. Default is `c(-2,-1,0,1,2)`, which is typical for z-scores.
#' @param vFlagOrder `numeric` Vector of ordered flag values. Output data.frame will be sorted based on flag column using the order provided. NULL (or values that don't match vFlag) will leave the data unsorted. Must have identical values to vFlag. Default is `c(2,-2,1,-1,0)` which puts largest z-score outliers first in the data set.
#'
#'
#' @export

nMedian <- dfFlagged %>%
pull(strValueColumn) %>%
stats::median(na.rm = TRUE)
Flag_NormalApprox <- Flag

dfFlagged <- dfFlagged %>%
mutate(
Flag = case_when(
Flag != 0 & .data[[strValueColumn]] >= nMedian ~ 1,
Flag != 0 & .data[[strValueColumn]] < nMedian ~ -1,
TRUE ~ Flag
)
)
}

dfFlagged <- dfFlagged %>%
arrange(match(.data$Flag, c(1, -1, 0)))
#' Flag_Poisson
#'
#' #' @description
#' `r lifecycle::badge("stable")`
#'
#' Alias for `Flag()`
#'
#' @param dfAnalyzed `data.frame` where flags should be added.
#' @param strColumn `character` Name of the column to use for thresholding. Default: `"Score"`
#' @param vThreshold `numeric` Vector of numeric values representing threshold values. Default is `c(-3,-2,2,3)` which is typical for z-scores.
#' @param vFlag `numeric` Vector of flag values. There must be one more item in Flag than thresholds - that is `length(vThreshold)+1 == length(vFlagValues)`. Default is `c(-2,-1,0,1,2)`, which is typical for z-scores.
#' @param vFlagOrder `numeric` Vector of ordered flag values. Output data.frame will be sorted based on flag column using the order provided. NULL (or values that don't match vFlag) will leave the data unsorted. Must have identical values to vFlag. Default is `c(2,-2,1,-1,0)` which puts largest z-score outliers first in the data set.
#'
#' @export

return(dfFlagged)
}
Flag_Poisson <- Flag
68 changes: 0 additions & 68 deletions R/Flag_Fisher.R

This file was deleted.

85 changes: 0 additions & 85 deletions R/Flag_NormalApprox.R

This file was deleted.

75 changes: 0 additions & 75 deletions R/Flag_Poisson.R

This file was deleted.

Loading
Loading