diff --git a/NAMESPACE b/NAMESPACE index ae935a0..1d4629c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,6 +16,7 @@ export(eSparse) export(m3_select_path) export(pathAssign) export(plotBinTile) +export(plotDiagnostics) export(plotIntersect) export(plotTrend) export(plotTrendCluster) @@ -149,6 +150,7 @@ importFrom(stats,cor) importFrom(stats,cutree) importFrom(stats,dist) importFrom(stats,family) +importFrom(stats,fitted) importFrom(stats,gaussian) importFrom(stats,glm) importFrom(stats,glm.control) @@ -159,8 +161,12 @@ importFrom(stats,median) importFrom(stats,na.omit) importFrom(stats,p.adjust) importFrom(stats,poisson) +importFrom(stats,predict) +importFrom(stats,residuals) +importFrom(stats,rstandard) importFrom(stats,sd) importFrom(stats,setNames) +importFrom(stringr,str_remove) importFrom(stringr,str_remove_all) importFrom(stringr,str_split) importFrom(stringr,str_split_i) diff --git a/R/ScMaSigProClass.R b/R/ScMaSigProClass.R index 7fba472..af0f28d 100644 --- a/R/ScMaSigProClass.R +++ b/R/ScMaSigProClass.R @@ -36,6 +36,7 @@ #' model. #' @slot mt_correction A character string specifying the p-value correction #' method. +#' @slot link Type of link function to use in the model. Default is "log". #' @slot epsilon Model convergence tolerance. #' @slot selection_method A character string specifying the method for stepwise #' regression. 
@@ -73,6 +74,7 @@ setClass( p_value = "numeric", min_na = "numeric", mt_correction = "character", + link = "character", epsilon = "numeric", offset = "logical", log_offset = "logical", @@ -91,7 +93,7 @@ setClass( "bin_ptime_col", "path_prefix", "root_label", "ptime_col", "bin_method", "path_col", "bin_col", "bin_size_col", "bin_mem_col", "mt_correction", "selection_method", "anno_col", "cluster_method", - "use_dim", "fill_na" + "use_dim", "fill_na", "link" ) for (slot_name in char_slots) { @@ -146,6 +148,7 @@ setClass( bin_mem_col = "scmp_bin_members", mt_correction = "BH", selection_method = "backward", + link = "log", epsilon = 1e-8, offset = TRUE, log_offset = FALSE, @@ -172,6 +175,7 @@ setClass( #' @slot assignment_matrix A matrix containing binary assignment to branching paths #' Additionally has two columns for assignment of binned Pseudotime and #' replicate. +#' @slot offset A numeric vector containing the offset values. #' #' @name MatrixDesign #' @aliases MatrixDesign-class @@ -183,12 +187,14 @@ setClass( representation( predictor_matrix = "matrix", groups.vector = "character", - assignment_matrix = "matrix" + assignment_matrix = "matrix", + offset = "numeric" ), prototype = list( predictor_matrix = matrix(NA, nrow = 0, ncol = 0), groups.vector = character(), - assignment_matrix = matrix(NA, nrow = 0, ncol = 0) + assignment_matrix = matrix(NA, nrow = 0, ncol = 0), + offset = numeric() ), validity = function(object) { if (!validObject(object@predictor_matrix)) { @@ -200,6 +206,9 @@ setClass( if (!validObject(object@assignment_matrix)) { stop("assignment_matrix slot is not a valid matrix.") } + if (!validObject(object@offset)) { + stop("offset slot is not a valid vector.") + } TRUE } )
diff --git a/R/plotDiagnostics.R b/R/plotDiagnostics.R
new file mode 100644
index 0000000..f552c85
--- /dev/null
+++ b/R/plotDiagnostics.R
@@ -0,0 +1,226 @@
+#' @title Plot Model Diagnostics
+#'
+#' @description
+#' Fits a GLM for one significant gene and plots its residual diagnostics.
+#' Can be used for model diagnostics of the optimized model, the full model
+#' or the intercept-only model.
+#'
+#' @importFrom stats residuals fitted rstandard predict
+#' @importFrom stringr str_remove
+#'
+#' @param scmpObj An object of class \code{\link{ScMaSigPro}}.
+#' @param feature_id Name of the gene to be plotted. Must be a single gene
+#' from the significant genes list.
+#' @param model Type of model to be used for plotting. Can be one of
+#' 'optimized', 'intercept' or 'full'. Default is 'optimized'.
+#'
+#' @return ggplot2 plot object.
+#'
+#' @author Priyansh Srivastava \email{spriyansh29@@gmail.com}
+#'
+#' @export
+#'
+plotDiagnostics <- function(scmpObj,
+                            feature_id,
+                            model = "optimized") {
+  # Check if the results exist
+  assert_that(!isEmpty(scmpObj@Significant@genes),
+    msg = paste("No Significant genes found, please run the workflow first")
+  )
+
+  # Check model type; it must be a single value because it drives the
+  # scalar conditions below
+  assert_that(
+    length(model) == 1 && model %in% c("optimized", "full", "intercept"),
+    msg = "'model' should be one of 'optimized', 'full' or 'intercept'"
+  )
+
+  # Only one gene can be diagnosed per call
+  assert_that(length(feature_id) == 1,
+    msg = "Please provide a single gene as 'feature_id'"
+  )
+
+  # Get all the available genes
+  avail_genes <- unique(unlist(scmpObj@Significant@genes))
+
+  # Check if the requested gene is available as a significant gene
+  assert_that(feature_id %in% avail_genes,
+    msg = paste("The requested gene is not available in the significant genes list")
+  )
+
+  # Get the counts for the genes
+  y_mat <- as.matrix(scmpObj@Dense@assays@data$bulk.counts)
+
+  # Extract gene counts as a one-column data frame named "y"
+  y_df <- y_mat[feature_id, , drop = TRUE] %>%
+    as.data.frame()
+  colnames(y_df) <- "y"
+
+  # Theme shared by all four panels
+  plot_theme <- theme_minimal(base_size = 12) +
+    theme(
+      legend.position = "bottom",
+      panel.grid.major = element_line(
+        color = "grey90", linewidth = 0.3, linetype = "dashed"
+      ),
+      panel.grid.minor = element_blank()
+    )
+
+  # Select the predictors based on the requested model
+  if (model == "optimized") {
+    # Get coefficients
+    coeff_matrix <- showCoeff(scmpObj) %>%
+      data.frame()
+
+    # Subset to the requested gene
+    coeff_matrix_sub <- coeff_matrix[feature_id, , drop = FALSE]
+
+    # Drop the first column and strip the first "beta" match from each name
+    all_terms <- colnames(coeff_matrix_sub)[-1]
+    all_terms <- str_remove(string = all_terms, pattern = "beta")
+
+    # NOTE(review): this drops one more leading term although the first
+    # column was already dropped above, and terms with a zero coefficient
+    # for this gene are kept; confirm both are intended
+    all_terms <- all_terms[-1]
+
+    # Create Model input
+    prediction_matrix <- scmpObj@Design@predictor_matrix
+    regression_matrix <- prediction_matrix[, all_terms, drop = FALSE]
+  } else {
+    # 'full' and 'intercept' start from the complete predictor matrix
+    regression_matrix <- scmpObj@Design@predictor_matrix
+  }
+
+  # Combine regression matrix with counts
+  model_df <- cbind(y_df, regression_matrix)
+
+  # Extract offsets
+  if (scmpObj@Parameters@offset) {
+    offset_data <- scmpObj@Design@offset
+  } else {
+    offset_data <- NULL
+  }
+
+  # The intercept-only model ignores the predictors in 'model_df'
+  if (model == "intercept") {
+    model_formula <- y ~ 1
+  } else {
+    model_formula <- y ~ .
+  }
+
+  # Fit Model
+  glm_s3 <- glm(model_formula,
+    data = as.data.frame(model_df),
+    family = scmpObj@Parameters@distribution,
+    epsilon = scmpObj@Parameters@epsilon,
+    offset = offset_data,
+    weights = NULL,
+    maxit = scmpObj@Parameters@max_it
+  )
+
+  # Extract standardized residuals
+  standardized_residuals <- rstandard(glm_s3)
+
+  # Extract fitted values
+  fitted_values <- fitted(glm_s3)
+
+  # Calculate predicted responses
+  predicted_responses <- predict(glm_s3, type = "response")
+
+  # Q-Q plot:
+  # This plot of the standardized residuals allows you to check if the
+  # residuals are approximately normally distributed.
+  qq_df <- data.frame(std_residuals = standardized_residuals)
+
+  # Create the Normal Q-Q plot
+  qq_plot <- ggplot(qq_df, aes(sample = .data$std_residuals)) +
+    stat_qq(color = "#EE446F", size = 2) +
+    stat_qq_line(
+      color = "#15918A", alpha = 1,
+      linewidth = 1, linetype = "solid"
+    ) +
+    labs(
+      subtitle = feature_id,
+      title = "A. Normal Q-Q Plot of Standardized Residuals",
+      x = "Theoretical Quantiles",
+      y = "Standardized Residuals"
+    ) +
+    plot_theme
+
+  # Residuals vs Fitted Plot:
+  # This plot helps to detect non-linearity, unequal error variances,
+  # and outliers. It is used to check the assumption of homoscedasticity.
+  res_vs_fit_df <- data.frame(
+    Fitted = fitted_values,
+    StdResiduals = standardized_residuals
+  )
+
+  # Create the Residuals vs Fitted plot
+  stdRes_vs_fitted_plot <- ggplot(
+    res_vs_fit_df, aes(x = .data$Fitted, y = .data$StdResiduals)
+  ) +
+    geom_point(color = "#EE446F", size = 2) + # Plot the points
+    geom_hline(yintercept = 0, linetype = "dashed", color = "#9F7BB8") + # Add a horizontal line at 0
+    labs(
+      subtitle = feature_id,
+      title = "B. Standardized Residuals vs Fitted Plot",
+      x = "Fitted Values",
+      y = "Standardized Residuals"
+    ) +
+    plot_theme +
+    geom_smooth(
+      method = "loess", color = "#15918A", alpha = 0.4, formula = y ~ x,
+      linewidth = 1, linetype = "solid", se = TRUE, fill = "#FDC659"
+    ) # Add a loess smoothed line
+
+  # Scale-Location Plot (or Spread-Location Plot):
+  # This shows if residuals are spread equally along the range of fitted
+  # values. This is a way to check the assumption of equal variance
+  # (homoscedasticity).
+  scale_loc_df <- data.frame(
+    Fitted = fitted_values,
+    SqrtAbsStdResiduals = sqrt(abs(standardized_residuals))
+  )
+
+  # Create the Scale-Location plot
+  scale_loc_plot <- ggplot(
+    scale_loc_df, aes(x = .data$Fitted, y = .data$SqrtAbsStdResiduals)
+  ) +
+    geom_point(color = "#EE446F", size = 2) + # Plot the points
+    geom_smooth(
+      method = "loess", color = "#15918A", formula = y ~ x,
+      linewidth = 1, linetype = "solid", se = TRUE,
+      fill = "#FDC659", alpha = 0.4
+    ) +
+    labs(
+      subtitle = feature_id,
+      title = "C. Scale-Location Plot",
+      x = "Fitted Values",
+      y = "Sqrt of Absolute Standardized Residuals"
+    ) +
+    plot_theme
+
+  # Observed vs Predicted plot:
+  # Points close to the identity line indicate that the predicted responses
+  # agree with the observed counts.
+  dfun_df <- data.frame(Observed = model_df$y, Predicted = predicted_responses)
+
+  # Create the Observed vs Predicted plot
+  defunc_plot <- ggplot(dfun_df, aes(x = .data$Observed, y = .data$Predicted)) +
+    geom_point(color = "#EE446F", size = 2) +
+    geom_abline(
+      intercept = 0, slope = 1, linetype = "solid", color = "#15918A",
+      linewidth = 1, alpha = 1
+    ) +
+    labs(
+      subtitle = feature_id,
+      title = "D. Observed vs Predicted",
+      x = "Observed",
+      y = "Predicted"
+    ) +
+    plot_theme
+
+  # Return Plots
+  return((qq_plot + stdRes_vs_fitted_plot) / (scale_loc_plot + defunc_plot))
+}
diff --git a/R/sc.p.vector.R b/R/sc.p.vector.R index 6951770..80627ac 100644 --- a/R/sc.p.vector.R +++ b/R/sc.p.vector.R @@ -17,6 +17,7 @@ #' @param min_na Minimum values needed per gene across cells to estimate the #' model. #' @param family Distribution of the error term. +#' @param link Type of link function to use in the model. Default is "log". #' @param epsilon Model convergence tolerance. #' @param offset logical value specifying whether to use offset during fitting. #' @param log_offset A logical value specifying whether to take the logarithm of @@ -48,7 +49,8 @@ sc.p.vector <- function(scmpObj, p_value = 0.05, mt_correction = "BH", offset = TRUE, parallel = FALSE, log_offset = FALSE, - max_it = 100) { + max_it = 100, + link = "log") { # Check the type of the 'design' parameter and set the corresponding variables assert_that(is(scmpObj, "ScMaSigPro"), msg = "Please provide object of class 'ScMaSigPro'" @@ -64,6 +66,14 @@ sc.p.vector <- function(scmpObj, p_value = 0.05, mt_correction = "BH", dat <- dat[, as.character(rownames(dis))] G <- nrow(dat) + # Check for the log function + assert_that(link %in% c("log", "identity"), + msg = "link function should be either 'log' or 'identity'" + ) + + # Update the family + family[["link"]] <- link + # Add check # assert_that((dat@Dim[1] > 1), msg = paste(min_na, "for 'min_na' is too high. 
Try lowering the threshold.")) assert_that(min_na <= ncol(dat), @@ -109,8 +119,9 @@ sc.p.vector <- function(scmpObj, p_value = 0.05, mt_correction = "BH", if (log_offset) { offsetData <- log(offsetData) } + scmpObj@Design@offset <- offsetData } else { - offsetData <- NULL + offsetData <- NULL # scmpObj@Design@offset } if (parallel) { @@ -257,6 +268,7 @@ sc.p.vector <- function(scmpObj, p_value = 0.05, mt_correction = "BH", scmpObj@Parameters@mt_correction <- mt_correction scmpObj@Parameters@epsilon <- epsilon scmpObj@Parameters@distribution <- family + scmpObj@Parameters@link <- link return(scmpObj) } diff --git a/R/sc.set.poly.R b/R/sc.set.poly.R index 1c90f96..e8e23b6 100644 --- a/R/sc.set.poly.R +++ b/R/sc.set.poly.R @@ -90,11 +90,16 @@ sc.set.poly <- function(scmpObj, group.cols = c(3:ncol(com.cell.meta)) ) + # Create offset + offset_vector <- numeric(rep(nrow(designList$edesign))) + names(offset_vector) <- rownames(designList$edesign) + # Create Object designObj <- new("MatrixDesign", predictor_matrix = as.matrix(designList$dis), groups.vector = designList$groups.vector, - assignment_matrix = as.matrix(designList$edesign) + assignment_matrix = as.matrix(designList$edesign), + offset = offset_vector ) # Update Slot diff --git a/R/sc.t.fit.R b/R/sc.t.fit.R index 99c451a..6c65c6a 100644 --- a/R/sc.t.fit.R +++ b/R/sc.t.fit.R @@ -14,6 +14,7 @@ #' regression. #' @param nvar_correction Argument for correcting significance level. See details. #' @param family Distribution of the error term. +#' @param link Type of link function to use in the model. Default is "log". #' @param epsilon Model convergence tolerance. #' @param offset A logical value specifying whether to use offset during fitting. 
#' @param log_offset A logical value specifying whether to take the logarithm of @@ -47,11 +48,20 @@ sc.t.fit <- function(scmpObj, verbose = TRUE, parallel = FALSE, log_offset = scmpObj@Parameters@log_offset, - max_it = scmpObj@Parameters@max_it) { + max_it = scmpObj@Parameters@max_it, + link = scmpObj@Parameters@link) { assert_that(is(scmpObj, "ScMaSigPro"), msg = "Please provide object of class 'ScMaSigPro'" ) + # Check for the log function + assert_that(link %in% c("log", "identity"), + msg = "link function should be either 'log' or 'identity'" + ) + + # Update the family + family[["link"]] <- link + # Transfer Data dis <- scmpObj@Design@predictor_matrix p_value <- scmpObj@Parameters@p_value @@ -116,8 +126,9 @@ sc.t.fit <- function(scmpObj, if (log_offset) { offsetData <- log(offsetData) } + scmpObj@Design@offset <- offsetData } else { - offsetData <- NULL + offsetData <- NULL # scmpObj@Design@offset } if (parallel) { @@ -509,6 +520,7 @@ sc.t.fit <- function(scmpObj, scmpObj@Parameters@epsilon <- epsilon scmpObj@Parameters@selection_method <- selection_method scmpObj@Parameters@distribution <- family + scmpObj@Parameters@link <- link return(scmpObj) } diff --git a/data/scmp.ob.RData b/data/scmp.ob.RData index 2ea2b76..b830ccb 100644 Binary files a/data/scmp.ob.RData and b/data/scmp.ob.RData differ diff --git a/man/MatrixDesign-class.Rd b/man/MatrixDesign-class.Rd index e5a8277..9978ad3 100644 --- a/man/MatrixDesign-class.Rd +++ b/man/MatrixDesign-class.Rd @@ -20,6 +20,8 @@ term of the polynomial GLM.} \item{\code{assignment_matrix}}{A matrix containing binary assignment to branching paths Additionally has two columns for assignment of binned Pseudotime and replicate.} + +\item{\code{offset}}{A numeric vector containing the offset values.} }} \keyword{classes} diff --git a/man/ParameterConfig-class.Rd b/man/ParameterConfig-class.Rd index baed751..da893a0 100644 --- a/man/ParameterConfig-class.Rd +++ b/man/ParameterConfig-class.Rd @@ -53,6 +53,8 @@ model.} 
\item{\code{mt_correction}}{A character string specifying the p-value correction method.} +\item{\code{link}}{Type of link function to use in the model. Default is "log".} + \item{\code{epsilon}}{Model convergence tolerance.} \item{\code{selection_method}}{A character string specifying the method for stepwise diff --git a/man/plotDiagnostics.Rd b/man/plotDiagnostics.Rd new file mode 100644 index 0000000..3c55073 --- /dev/null +++ b/man/plotDiagnostics.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotDiagnostics.R +\name{plotDiagnostics} +\alias{plotDiagnostics} +\title{Plot Model Diagnostics} +\usage{ +plotDiagnostics(scmpObj, feature_id, model = "optimized") +} +\arguments{ +\item{scmpObj}{An object of class \code{\link{ScMaSigPro}}.} + +\item{feature_id}{Name of the gene to be plotted.} + +\item{model}{Type of model to be used for plotting. Can be either 'optimized', +'intercept'and 'full'. Default is 'optimized'.} +} +\value{ +ggplot2 plot object. +} +\description{ +Plots model residuals for the requested gene. Can be used for model +diagnostics of optimized model or full model. +} +\author{ +Priyansh Srivastava \email{spriyansh29@gmail.com} +} diff --git a/man/sc.p.vector.Rd b/man/sc.p.vector.Rd index 000dc1e..1482286 100644 --- a/man/sc.p.vector.Rd +++ b/man/sc.p.vector.Rd @@ -15,7 +15,8 @@ sc.p.vector( offset = TRUE, parallel = FALSE, log_offset = FALSE, - max_it = 100 + max_it = 100, + link = "log" ) } \arguments{ @@ -45,6 +46,8 @@ model.} the offsets.} \item{max_it}{Maximum number of iterations to fit the model.} + +\item{link}{Type of link function to use in the model. 
Default is "log".} } \value{ An object of class \code{\link{ScMaSigPro}}, with updated `Profile` diff --git a/man/sc.t.fit.Rd b/man/sc.t.fit.Rd index 6403b2e..e080f16 100644 --- a/man/sc.t.fit.Rd +++ b/man/sc.t.fit.Rd @@ -15,7 +15,8 @@ sc.t.fit( verbose = TRUE, parallel = FALSE, log_offset = scmpObj@Parameters@log_offset, - max_it = scmpObj@Parameters@max_it + max_it = scmpObj@Parameters@max_it, + link = scmpObj@Parameters@link ) } \arguments{ @@ -43,6 +44,8 @@ regression.} the offsets.} \item{max_it}{Maximum number of iterations to fit the model.} + +\item{link}{Type of link function to use in the model. Default is "log".} } \value{ An object of class \code{\link{ScMaSigPro}}, with updated `Estimate` diff --git a/tests/testthat/test-reproduce-masigpro.R b/tests/testthat/test-reproduce-masigpro.R index 141b7a0..0174501 100644 --- a/tests/testthat/test-reproduce-masigpro.R +++ b/tests/testthat/test-reproduce-masigpro.R @@ -85,12 +85,12 @@ test_that("Reproduce Results of MaSigPro", { # Step-7: Run sc.p.vector test.scmp.1 <- sc.p.vector(test.scmp.1, - min_na = 20, verbose = FALSE, + min_na = 20, verbose = FALSE, link = "identity", offset = FALSE, parallel = FALSE, max_it = 25, epsilon = 0.00001, family = gaussian() ) test.scmp.3 <- sc.p.vector(test.scmp.3, - min_na = 20, verbose = FALSE, + min_na = 20, verbose = FALSE, link = "identity", offset = FALSE, parallel = FALSE, max_it = 25, epsilon = 0.00001, family = gaussian() ) diff --git a/vignettes/S4_Class_object.png b/vignettes/S4_Class_object.png deleted file mode 100644 index 827216f..0000000 Binary files a/vignettes/S4_Class_object.png and /dev/null differ diff --git a/vignettes/scMaSigPro-Class.Rmd b/vignettes/scMaSigPro-Class.Rmd index 4f51ebf..22b040d 100644 --- a/vignettes/scMaSigPro-Class.Rmd +++ b/vignettes/scMaSigPro-Class.Rmd @@ -78,7 +78,7 @@ scmp.ob ## Slots of the scMaSigPro object ```{r, echo=FALSE} -knitr::include_graphics("S4_Class_object.png") 
+knitr::include_graphics("https://www.metapriyansh.com/scMaSigPro/imgs/S4_Class_object.png") ``` ### `Sparse` \& `Dense` Slots @@ -292,3 +292,4 @@ to other vignettes for more in-depth analysis. ```{r, "Session Info"} sessionInfo(package = "scMaSigPro") ``` + diff --git a/vignettes/scMaSigPro-maSigPro.Rmd b/vignettes/scMaSigPro-maSigPro.Rmd index 8fac00b..f84a57d 100644 --- a/vignettes/scMaSigPro-maSigPro.Rmd +++ b/vignettes/scMaSigPro-maSigPro.Rmd @@ -231,6 +231,7 @@ gc <- capture_output(fit <- p.vector(data.abiotic, design, scmp_ob <- sc.p.vector(scmp_ob, min_na = 20, verbose = FALSE, + link = "identity", offset = FALSE, max_it = 25, epsilon = 0.00001,