Merge pull request #35 from spriyansh/main

Object Updates with Link, Diagnostics and Offset
BioBam · Feb 21, 2024 · 3a0e041 · 3a0e041
2 parents df67ec5 + 333267d
commit 3a0e041
Show file tree

Hide file tree

Showing 16 changed files with 330 additions and 13 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -16,6 +16,7 @@ export(eSparse)
 export(m3_select_path)
 export(pathAssign)
 export(plotBinTile)
+export(plotDiagnostics)
 export(plotIntersect)
 export(plotTrend)
 export(plotTrendCluster)
@@ -149,6 +150,7 @@ importFrom(stats,cor)
 importFrom(stats,cutree)
 importFrom(stats,dist)
 importFrom(stats,family)
+importFrom(stats,fitted)
 importFrom(stats,gaussian)
 importFrom(stats,glm)
 importFrom(stats,glm.control)
@@ -159,8 +161,12 @@ importFrom(stats,median)
 importFrom(stats,na.omit)
 importFrom(stats,p.adjust)
 importFrom(stats,poisson)
+importFrom(stats,predict)
+importFrom(stats,residuals)
+importFrom(stats,rstandard)
 importFrom(stats,sd)
 importFrom(stats,setNames)
+importFrom(stringr,str_remove)
 importFrom(stringr,str_remove_all)
 importFrom(stringr,str_split)
 importFrom(stringr,str_split_i)

diff --git a/R/ScMaSigProClass.R b/R/ScMaSigProClass.R
@@ -36,6 +36,7 @@
 #' model.
 #' @slot mt_correction A character string specifying the p-value correction
 #' method.
+#' @slot link Type of link function to use in the model. Default is "log".
 #' @slot epsilon Model convergence tolerance.
 #' @slot selection_method A character string specifying the method for stepwise
 #' regression.
@@ -73,6 +74,7 @@ setClass(
     p_value = "numeric",
     min_na = "numeric",
     mt_correction = "character",
+    link = "character",
     epsilon = "numeric",
     offset = "logical",
     log_offset = "logical",
@@ -91,7 +93,7 @@ setClass(
       "bin_ptime_col", "path_prefix", "root_label", "ptime_col", "bin_method",
       "path_col", "bin_col", "bin_size_col", "bin_mem_col",
       "mt_correction", "selection_method", "anno_col", "cluster_method",
-      "use_dim", "fill_na"
+      "use_dim", "fill_na", "link"
     )
 
     for (slot_name in char_slots) {
@@ -146,6 +148,7 @@ setClass(
     bin_mem_col = "scmp_bin_members",
     mt_correction = "BH",
     selection_method = "backward",
+    link = "log",
     epsilon = 1e-8,
     offset = TRUE,
     log_offset = FALSE,
@@ -172,6 +175,7 @@ setClass(
 #' @slot assignment_matrix A matrix containing binary assignment to branching paths
 #' Additionally has two columns for assignment of binned Pseudotime and
 #' replicate.
+#' @slot offset A numeric vector containing the offset values.
 #'
 #' @name MatrixDesign
 #' @aliases MatrixDesign-class
@@ -183,12 +187,14 @@ setClass(
   representation(
     predictor_matrix = "matrix",
     groups.vector = "character",
-    assignment_matrix = "matrix"
+    assignment_matrix = "matrix",
+    offset = "numeric"
   ),
   prototype = list(
     predictor_matrix = matrix(NA, nrow = 0, ncol = 0),
     groups.vector = character(),
-    assignment_matrix = matrix(NA, nrow = 0, ncol = 0)
+    assignment_matrix = matrix(NA, nrow = 0, ncol = 0),
+    offset = numeric()
   ),
   validity = function(object) {
     if (!validObject(object@predictor_matrix)) {
@@ -200,6 +206,9 @@ setClass(
     if (!validObject(object@assignment_matrix)) {
       stop("assignment_matrix slot is not a valid matrix.")
     }
+    if (!validObject(object@offset)) {
+      stop("offset slot is not a valid vector.")
+    }
     TRUE
   }
 )

diff --git a/R/plotDiagnostics.R b/R/plotDiagnostics.R
@@ -0,0 +1,235 @@
+#' @title Plot Model Diagnostics
+#'
+#' @description
+#' Refits a GLM for one significant gene and plots residual diagnostics.
+#' Can be used for model diagnostics of the optimized, full or intercept-only
+#' model.
+#'
+#' @importFrom stats residuals fitted rstandard predict
+#' @importFrom stringr str_remove
+#'
+#' @param scmpObj An object of class \code{\link{ScMaSigPro}}.
+#' @param feature_id Name of the gene to be plotted (a single gene).
+#' @param model Type of model to be used for plotting. Can be either 'optimized',
+#' 'intercept' or 'full'. Default is 'optimized'.
+#'
+#' @return A composite plot with four panels: (A) normal Q-Q plot of the
+#' standardized residuals, (B) standardized residuals vs fitted values,
+#' (C) scale-location plot and (D) observed vs predicted values.
+#'
+#' @author Priyansh Srivastava \email{spriyansh29@@gmail.com}
+#'
+#' @export
+#'
+plotDiagnostics <- function(scmpObj,
+                            feature_id,
+                            model = "optimized") {
+  # Check if the results exist
+  assert_that(!isEmpty(scmpObj@Significant@genes),
+    msg = paste("No Significant genes found, please run the workflow first")
+  )
+
+  # Check model type; must be a single label since `||` below needs a scalar
+  assert_that(
+    is.character(model) && length(model) == 1 &&
+      model %in% c("optimized", "full", "intercept"),
+    msg = "'model' should be one of 'optimized', 'full' or 'intercept'"
+  )
+
+  # Exactly one gene can be plotted per call
+  assert_that(length(feature_id) == 1,
+    msg = "'feature_id' should be a single gene name"
+  )
+
+  # Index rows by name even if a factor is supplied
+  feature_id <- as.character(feature_id)
+
+  # Get all the available genes
+  avail_genes <- unique(unlist(scmpObj@Significant@genes))
+
+  # Check if the requested gene is available as significant gene
+  assert_that(feature_id %in% avail_genes,
+    msg = paste("The requested gene is not available in the significant genes list")
+  )
+
+  # Get the counts for the genes
+  y_mat <- as.matrix(scmpObj@Dense@assays@data$bulk.counts)
+
+  # Extract gene counts
+  y_df <- y_mat[feature_id, , drop = TRUE] %>%
+    as.data.frame()
+  colnames(y_df) <- "y"
+
+  # Set theme for the plot
+  theme_set <- theme_minimal(base_size = 12) +
+    theme(
+      legend.position = "bottom",
+      panel.grid.major = element_line(
+        color = "grey90", linewidth = 0.3, linetype = "dashed"
+      ),
+      panel.grid.minor = element_blank()
+    )
+
+  # Based on model
+  if (model == "full" || model == "intercept") {
+    # Create Model input
+    regression_matrix <- scmpObj@Design@predictor_matrix
+  } else {
+    # Get coefficients
+    coeff_matrix <- showCoeff(scmpObj) %>%
+      data.frame()
+
+    # Subset
+    coeff_matrix_sub <- coeff_matrix[feature_id, , drop = FALSE]
+    all_terms <- colnames(coeff_matrix_sub)[-1]
+
+    # Remove any occurrence of beta
+    all_terms <- str_remove(string = all_terms, pattern = "beta")
+
+    # Remove intercept
+    all_terms <- all_terms[-1]
+
+    # Extract the prediction
+    prediction_matrix <- scmpObj@Design@predictor_matrix
+
+    # Create Model input; NOTE(review): zero coefficients are not filtered out
+    regression_matrix <- prediction_matrix[, all_terms, drop = FALSE]
+  }
+
+  # Combine regression matrix with counts
+  model_df <- cbind(y_df, regression_matrix)
+
+  # Extract offsets
+  if (scmpObj@Parameters@offset) {
+    offset_data <- scmpObj@Design@offset
+  } else {
+    offset_data <- NULL
+  }
+
+  # The intercept-only model ignores the predictors; otherwise use all of them
+  if (model == "intercept") {
+    model_formula <- y ~ 1
+  } else {
+    model_formula <- y ~ .
+  }
+
+  # Fit Model
+  glm_s3 <- glm(model_formula,
+    data = as.data.frame(model_df),
+    family = scmpObj@Parameters@distribution,
+    epsilon = scmpObj@Parameters@epsilon,
+    offset = offset_data,
+    weights = NULL,
+    maxit = scmpObj@Parameters@max_it
+  )
+
+  # Extract standardized residuals
+  standardized_residuals <- rstandard(glm_s3)
+
+  # Extract fitted values
+  fitted_values <- fitted(glm_s3)
+
+  # Calculate predicted responses
+  predicted_responses <- predict(glm_s3, type = "response")
+
+  # Q-Q plot
+  # This plot of the standardized residuals allows you to check if the
+  # residuals are approximately normally distributed.
+
+  # Create a dataframe for plotting
+  qq_df <- data.frame(std_residuals = standardized_residuals)
+
+  # Create the Normal Q-Q plot
+  qq_plot <- ggplot(qq_df, aes(sample = .data$std_residuals)) +
+    stat_qq(color = "#EE446F", size = 2) +
+    stat_qq_line(
+      color = "#15918A", alpha = 1,
+      linewidth = 1, linetype = "solid"
+    ) +
+    labs(
+      subtitle = feature_id,
+      title = "A. Normal Q-Q Plot of Standardized Residuals",
+      x = "Theoretical Quantiles",
+      y = "Standardized Residuals"
+    ) +
+    theme_set
+
+  # Residuals vs Fitted Plot:
+  # This plot helps to detect non-linearity, unequal error variances,
+  # and outliers. It is used to check the assumption of homoscedasticity.
+
+  # Create a dataframe for plotting
+  res_vs_fit_df <- data.frame(
+    Fitted = fitted_values,
+    StdResiduals = standardized_residuals
+  )
+
+  # Create the Residuals vs Fitted plot
+  stdRes_vs_fitted_plot <- ggplot(res_vs_fit_df, aes(x = .data$Fitted, y = .data$StdResiduals)) +
+    geom_point(color = "#EE446F", size = 2) + # Plot the points
+    geom_hline(yintercept = 0, linetype = "dashed", color = "#9F7BB8") + # Add a horizontal line at 0
+    labs(
+      subtitle = feature_id,
+      title = "B. Standardized Residuals vs Fitted Plot",
+      x = "Fitted Values",
+      y = "Standardized Residuals"
+    ) +
+    theme_set +
+    geom_smooth(
+      method = "loess", color = "#15918A", alpha = 0.4, formula = y ~ x,
+      linewidth = 1, linetype = "solid", se = TRUE, fill = "#FDC659"
+    ) # Add a loess smoothed line
+
+  # Scale-Location Plot (or Spread-Location Plot): This shows if residuals
+  # are spread equally along the ranges of predictors. This is a way to check
+  # the assumption of equal variance (homoscedasticity).
+  # Calculate square root of absolute standardized residuals
+  sqrt_abs_std_residuals <- sqrt(abs(standardized_residuals))
+
+  # Create a dataframe for plotting
+  scale_loc_df <- data.frame(
+    Fitted = fitted_values,
+    SqrtAbsStdResiduals = sqrt_abs_std_residuals
+  )
+
+  # Create the Scale-Location plot
+  scale_loc_plot <- ggplot(scale_loc_df, aes(x = .data$Fitted, y = .data$SqrtAbsStdResiduals)) +
+    geom_point(color = "#EE446F", size = 2) + # Plot the points
+    geom_smooth(
+      method = "loess", color = "#15918A", formula = y ~ x,
+      linewidth = 1, linetype = "solid", se = TRUE,
+      fill = "#FDC659", alpha = 0.4
+    ) +
+    labs(
+      subtitle = feature_id,
+      title = "C. Scale-Location Plot",
+      x = "Fitted Values",
+      y = "Sqrt of Absolute Standardized Residuals"
+    ) +
+    theme_set
+
+  # Observed vs Predicted Plot:
+  # Points close to the identity line indicate that the predicted responses
+  # agree with the observed counts.
+
+  # Create a dataframe for plotting
+  dfun_df <- data.frame(Observed = model_df$y, Predicted = predicted_responses)
+
+  # Create the Observed vs Predicted plot
+  defunc_plot <- ggplot(dfun_df, aes(x = .data$Observed, y = .data$Predicted)) +
+    geom_point(color = "#EE446F", size = 2) +
+    geom_abline(
+      intercept = 0, slope = 1, linetype = "solid", color = "#15918A",
+      linewidth = 1, alpha = 1
+    ) +
+    labs(
+      subtitle = feature_id,
+      title = "D. Observed vs Predicted",
+      x = "Observed",
+      y = "Predicted"
+    ) +
+    theme_set
+
+  # Return Plots
+  return((qq_plot + stdRes_vs_fitted_plot) / (scale_loc_plot + defunc_plot))
+}
diff --git a/R/sc.p.vector.R b/R/sc.p.vector.R
@@ -17,6 +17,7 @@
 #' @param min_na Minimum values needed per gene across cells to estimate the
 #' model.
 #' @param family  Distribution of the error term.
+#' @param link Type of link function to use in the model. Default is "log".
 #' @param epsilon Model convergence tolerance.
 #' @param offset logical value specifying whether to use offset during fitting.
 #' @param log_offset A logical value specifying whether to take the logarithm of
@@ -48,7 +49,8 @@ sc.p.vector <- function(scmpObj, p_value = 0.05, mt_correction = "BH",
                         offset = TRUE,
                         parallel = FALSE,
                         log_offset = FALSE,
-                        max_it = 100) {
+                        max_it = 100,
+                        link = "log") {
   # Check the type of the 'design' parameter and set the corresponding variables
   assert_that(is(scmpObj, "ScMaSigPro"),
     msg = "Please provide object of class 'ScMaSigPro'"
@@ -64,6 +66,14 @@ sc.p.vector <- function(scmpObj, p_value = 0.05, mt_correction = "BH",
   dat <- dat[, as.character(rownames(dis))]
   G <- nrow(dat)
 
+  # Check that the requested link function is supported
+  assert_that(link %in% c("log", "identity"),
+    msg = "link function should be either 'log' or 'identity'"
+  )
+
+  # Update the family
+  family[["link"]] <- link
+
   # Add check
   # assert_that((dat@Dim[1] > 1), msg = paste(min_na, "for 'min_na' is too high. Try lowering the threshold."))
   assert_that(min_na <= ncol(dat),
@@ -109,8 +119,9 @@ sc.p.vector <- function(scmpObj, p_value = 0.05, mt_correction = "BH",
     if (log_offset) {
       offsetData <- log(offsetData)
     }
+    scmpObj@Design@offset <- offsetData
   } else {
-    offsetData <- NULL
+    offsetData <- NULL # scmpObj@Design@offset
   }
 
   if (parallel) {
@@ -257,6 +268,7 @@ sc.p.vector <- function(scmpObj, p_value = 0.05, mt_correction = "BH",
     scmpObj@Parameters@mt_correction <- mt_correction
     scmpObj@Parameters@epsilon <- epsilon
     scmpObj@Parameters@distribution <- family
+    scmpObj@Parameters@link <- link
 
     return(scmpObj)
   }