Skip to content

Commit

Permalink
Merge pull request #35 from spriyansh/main
Browse files Browse the repository at this point in the history
Object Updates with Link, Diagnostics and Offset
  • Loading branch information
spriyansh authored Feb 21, 2024
2 parents df67ec5 + 333267d commit 3a0e041
Show file tree
Hide file tree
Showing 16 changed files with 330 additions and 13 deletions.
6 changes: 6 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ export(eSparse)
export(m3_select_path)
export(pathAssign)
export(plotBinTile)
export(plotDiagnostics)
export(plotIntersect)
export(plotTrend)
export(plotTrendCluster)
Expand Down Expand Up @@ -149,6 +150,7 @@ importFrom(stats,cor)
importFrom(stats,cutree)
importFrom(stats,dist)
importFrom(stats,family)
importFrom(stats,fitted)
importFrom(stats,gaussian)
importFrom(stats,glm)
importFrom(stats,glm.control)
Expand All @@ -159,8 +161,12 @@ importFrom(stats,median)
importFrom(stats,na.omit)
importFrom(stats,p.adjust)
importFrom(stats,poisson)
importFrom(stats,predict)
importFrom(stats,residuals)
importFrom(stats,rstandard)
importFrom(stats,sd)
importFrom(stats,setNames)
importFrom(stringr,str_remove)
importFrom(stringr,str_remove_all)
importFrom(stringr,str_split)
importFrom(stringr,str_split_i)
Expand Down
15 changes: 12 additions & 3 deletions R/ScMaSigProClass.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#' model.
#' @slot mt_correction A character string specifying the p-value correction
#' method.
#' @slot link Type of link function to use in the model. Default is "log".
#' @slot epsilon Model convergence tolerance.
#' @slot selection_method A character string specifying the method for stepwise
#' regression.
Expand Down Expand Up @@ -73,6 +74,7 @@ setClass(
p_value = "numeric",
min_na = "numeric",
mt_correction = "character",
link = "character",
epsilon = "numeric",
offset = "logical",
log_offset = "logical",
Expand All @@ -91,7 +93,7 @@ setClass(
"bin_ptime_col", "path_prefix", "root_label", "ptime_col", "bin_method",
"path_col", "bin_col", "bin_size_col", "bin_mem_col",
"mt_correction", "selection_method", "anno_col", "cluster_method",
"use_dim", "fill_na"
"use_dim", "fill_na", "link"
)

for (slot_name in char_slots) {
Expand Down Expand Up @@ -146,6 +148,7 @@ setClass(
bin_mem_col = "scmp_bin_members",
mt_correction = "BH",
selection_method = "backward",
link = "log",
epsilon = 1e-8,
offset = TRUE,
log_offset = FALSE,
Expand All @@ -172,6 +175,7 @@ setClass(
#' @slot assignment_matrix A matrix containing binary assignment to branching paths
#' Additionally has two columns for assignment of binned Pseudotime and
#' replicate.
#' @slot offset A numeric vector containing the offset values.
#'
#' @name MatrixDesign
#' @aliases MatrixDesign-class
Expand All @@ -183,12 +187,14 @@ setClass(
representation(
predictor_matrix = "matrix",
groups.vector = "character",
assignment_matrix = "matrix"
assignment_matrix = "matrix",
offset = "numeric"
),
prototype = list(
predictor_matrix = matrix(NA, nrow = 0, ncol = 0),
groups.vector = character(),
assignment_matrix = matrix(NA, nrow = 0, ncol = 0)
assignment_matrix = matrix(NA, nrow = 0, ncol = 0),
offset = numeric()
),
validity = function(object) {
if (!validObject(object@predictor_matrix)) {
Expand All @@ -200,6 +206,9 @@ setClass(
if (!validObject(object@assignment_matrix)) {
stop("assignment_matrix slot is not a valid matrix.")
}
if (!validObject(object@offset)) {
stop("offset slot is not a valid vector.")
}
TRUE
}
)
Expand Down
235 changes: 235 additions & 0 deletions R/plotDiagnostics.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
#' @title Plot Model Diagnostics
#'
#' @description
#' Refits a GLM for the requested gene and plots its residual diagnostics.
#' Can be used for model diagnostics of the optimized model, the full model
#' or the intercept-only model.
#'
#' @importFrom stats residuals fitted rstandard predict
#' @importFrom stringr str_remove
#'
#' @param scmpObj An object of class \code{\link{ScMaSigPro}}.
#' @param feature_id Name of the gene to be plotted (a single gene name that
#' is present in the significant genes list).
#' @param model Type of model to be used for plotting. Can be one of
#' 'optimized', 'intercept' or 'full'. Default is 'optimized'.
#'
#' @return ggplot2 plot object combining four panels: (A) normal Q-Q plot of
#' the standardized residuals, (B) standardized residuals vs fitted values,
#' (C) scale-location plot and (D) observed vs predicted values.
#'
#' @author Priyansh Srivastava \email{spriyansh29@@gmail.com}
#'
#' @export
#'
plotDiagnostics <- function(scmpObj,
                            feature_id,
                            model = "optimized") {
  # Check if the results exist
  assert_that(!isEmpty(scmpObj@Significant@genes),
    msg = paste("No Significant genes found, please run the workflow first")
  )

  # Check model type; it must be a single string because it drives scalar
  # `if` conditions below
  assert_that(
    is.character(model), length(model) == 1,
    model %in% c("optimized", "full", "intercept"),
    msg = "Invalid 'model': must be one of 'optimized', 'full' or 'intercept'"
  )

  # Only one gene can be diagnosed per call (a single response column 'y'
  # is built below)
  assert_that(length(feature_id) == 1,
    msg = "'feature_id' must be a single gene name"
  )

  # Get all the available genes
  avail_genes <- unique(unlist(scmpObj@Significant@genes))

  # Check if the requested gene is available as a significant gene
  assert_that(all(feature_id %in% avail_genes),
    msg = paste("The requested gene is not available in the significant genes list")
  )

  # Get the counts for the genes
  y_mat <- as.matrix(scmpObj@Dense@assays@data$bulk.counts)

  # Extract gene counts as a one-column data frame named 'y'
  y_df <- y_mat[feature_id, , drop = TRUE] %>%
    as.data.frame()
  colnames(y_df) <- "y"

  # Set theme for the plot
  theme_set <- theme_minimal(base_size = 12) +
    theme(
      legend.position = "bottom",
      panel.grid.major = element_line(
        color = "grey90", linewidth = 0.3, linetype = "dashed"
      ),
      panel.grid.minor = element_blank()
    )

  # Based on model
  if (model == "full" || model == "intercept") {
    # Create Model input
    regression_matrix <- scmpObj@Design@predictor_matrix
  } else {
    # Get coefficients
    coeff_matrix <- showCoeff(scmpObj) %>%
      data.frame()

    # Subset
    coeff_matrix_sub <- coeff_matrix[feature_id, , drop = FALSE]
    all_terms <- colnames(coeff_matrix_sub)[-1]

    # Remove any occurrence of beta
    all_terms <- str_remove(string = all_terms, pattern = "beta")

    # Remove intercept
    all_terms <- all_terms[-1]

    # NOTE(review): every remaining term is used as a predictor, including
    # terms whose coefficient is zero for this gene. A non-zero filter was
    # previously computed here but never applied; confirm whether the
    # optimized model should be restricted to non-zero terms.

    # Extract the prediction
    prediction_matrix <- scmpObj@Design@predictor_matrix

    # Create Model input
    regression_matrix <- prediction_matrix[, all_terms, drop = FALSE]
  }

  # Combine regression matrix with counts
  model_df <- cbind(y_df, regression_matrix)

  # Extract offsets
  if (scmpObj@Parameters@offset) {
    offset_data <- scmpObj@Design@offset
  } else {
    offset_data <- NULL
  }

  # The intercept-only model ignores the predictors; the other models use
  # every column of the regression matrix
  model_formula <- if (model == "intercept") y ~ 1 else y ~ .

  # Fit Model
  glm_s3 <- glm(model_formula,
    data = as.data.frame(model_df),
    family = scmpObj@Parameters@distribution,
    epsilon = scmpObj@Parameters@epsilon,
    offset = offset_data,
    weights = NULL,
    maxit = scmpObj@Parameters@max_it
  )

  # Extract standardized residuals
  standardized_residuals <- rstandard(glm_s3)

  # Extract fitted values
  fitted_values <- fitted(glm_s3)

  # Calculate predicted responses
  predicted_responses <- predict(glm_s3, type = "response")

  # Q-Q plot
  # This plot of the standardized residuals allows you to check if the
  # residuals are approximately normally distributed.

  # Create a dataframe for plotting
  qq_df <- data.frame(std_residuals = standardized_residuals)

  # Create the Normal Q-Q plot
  qq_plot <- ggplot(qq_df, aes(sample = .data$std_residuals)) +
    stat_qq(color = "#EE446F", size = 2) +
    stat_qq_line(
      color = "#15918A", alpha = 1,
      linewidth = 1, linetype = "solid"
    ) +
    labs(
      subtitle = feature_id,
      title = "A. Normal Q-Q Plot of Standardized Residuals",
      x = "Theoretical Quantiles",
      y = "Standardized Residuals"
    ) +
    theme_set

  # Residuals vs Fitted Plot:
  # This plot helps to detect non-linearity, unequal error variances,
  # and outliers. It is used to check the assumption of homoscedasticity.

  # Create a dataframe for plotting
  res_vs_fit_df <- data.frame(
    Fitted = fitted_values,
    StdResiduals = standardized_residuals
  )

  # Create the Residuals vs Fitted plot
  stdRes_vs_fitted_plot <- ggplot(
    res_vs_fit_df,
    aes(x = .data$Fitted, y = .data$StdResiduals)
  ) +
    geom_point(color = "#EE446F", size = 2) + # Plot the points
    geom_hline(
      yintercept = 0, linetype = "dashed", color = "#9F7BB8"
    ) + # Add a horizontal line at 0
    labs(
      subtitle = feature_id,
      title = "B. Standardized Residuals vs Fitted Plot",
      x = "Fitted Values",
      y = "Standardized Residuals"
    ) +
    theme_set +
    geom_smooth(
      method = "loess", color = "#15918A", alpha = 0.4, formula = y ~ x,
      linewidth = 1, linetype = "solid", se = TRUE, fill = "#FDC659"
    ) # Add a loess smoothed line

  # Scale-Location Plot (or Spread-Location Plot): This shows if residuals
  # are spread equally along the range of fitted values. This is a way to
  # check the assumption of equal variance (homoscedasticity).
  # Calculate square root of absolute standardized residuals
  sqrt_abs_std_residuals <- sqrt(abs(standardized_residuals))

  # Create a dataframe for plotting
  scale_loc_df <- data.frame(
    Fitted = fitted_values,
    SqrtAbsStdResiduals = sqrt_abs_std_residuals
  )

  # Create the Scale-Location plot
  scale_loc_plot <- ggplot(
    scale_loc_df,
    aes(x = .data$Fitted, y = .data$SqrtAbsStdResiduals)
  ) +
    geom_point(color = "#EE446F", size = 2) + # Plot the points
    geom_smooth(
      method = "loess", color = "#15918A", formula = y ~ x,
      linewidth = 1, linetype = "solid", se = TRUE,
      fill = "#FDC659", alpha = 0.4
    ) +
    labs(
      subtitle = feature_id,
      title = "C. Scale-Location Plot",
      x = "Fitted Values",
      y = "Sqrt of Absolute Standardized Residuals"
    ) +
    theme_set

  # Observed vs Predicted plot: a basic check of how well the fitted model
  # (family and link) reproduces the observed responses. Points close to the
  # identity line indicate good agreement.

  # Create a dataframe for plotting
  dfun_df <- data.frame(Observed = model_df$y, Predicted = predicted_responses)

  # Create the Observed vs Predicted plot
  defunc_plot <- ggplot(dfun_df, aes(x = .data$Observed, y = .data$Predicted)) +
    geom_point(color = "#EE446F", size = 2) +
    geom_abline(
      intercept = 0, slope = 1, linetype = "solid", color = "#15918A",
      linewidth = 1, alpha = 1
    ) +
    labs(
      subtitle = feature_id,
      title = "D. Observed vs Predicted",
      x = "Observed",
      y = "Predicted"
    ) +
    theme_set

  # Return Plots arranged in a 2 x 2 layout
  return((qq_plot + stdRes_vs_fitted_plot) / (scale_loc_plot + defunc_plot))
}
16 changes: 14 additions & 2 deletions R/sc.p.vector.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#' @param min_na Minimum values needed per gene across cells to estimate the
#' model.
#' @param family Distribution of the error term.
#' @param link Type of link function to use in the model. Default is "log".
#' @param epsilon Model convergence tolerance.
#' @param offset logical value specifying whether to use offset during fitting.
#' @param log_offset A logical value specifying whether to take the logarithm of
Expand Down Expand Up @@ -48,7 +49,8 @@ sc.p.vector <- function(scmpObj, p_value = 0.05, mt_correction = "BH",
offset = TRUE,
parallel = FALSE,
log_offset = FALSE,
max_it = 100) {
max_it = 100,
link = "log") {
# Check the type of the 'design' parameter and set the corresponding variables
assert_that(is(scmpObj, "ScMaSigPro"),
msg = "Please provide object of class 'ScMaSigPro'"
Expand All @@ -64,6 +66,14 @@ sc.p.vector <- function(scmpObj, p_value = 0.05, mt_correction = "BH",
dat <- dat[, as.character(rownames(dis))]
G <- nrow(dat)

# Check for the link function
assert_that(link %in% c("log", "identity"),
msg = "link function should be either 'log' or 'identity'"
)

# Update the family
family[["link"]] <- link

# Add check
# assert_that((dat@Dim[1] > 1), msg = paste(min_na, "for 'min_na' is too high. Try lowering the threshold."))
assert_that(min_na <= ncol(dat),
Expand Down Expand Up @@ -109,8 +119,9 @@ sc.p.vector <- function(scmpObj, p_value = 0.05, mt_correction = "BH",
if (log_offset) {
offsetData <- log(offsetData)
}
scmpObj@Design@offset <- offsetData
} else {
offsetData <- NULL
offsetData <- NULL # scmpObj@Design@offset
}

if (parallel) {
Expand Down Expand Up @@ -257,6 +268,7 @@ sc.p.vector <- function(scmpObj, p_value = 0.05, mt_correction = "BH",
scmpObj@Parameters@mt_correction <- mt_correction
scmpObj@Parameters@epsilon <- epsilon
scmpObj@Parameters@distribution <- family
scmpObj@Parameters@link <- link

return(scmpObj)
}
Expand Down
Loading

0 comments on commit 3a0e041

Please sign in to comment.