diff --git a/DESCRIPTION b/DESCRIPTION index ffeba6f..5a069ae 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: scMaSigPro Type: Package Title: Application of MaSigPro Bioconductor Package for scRNA Trajectory data -Version: 0.0.3 +Version: 0.0.4 Authors@R: c( person("Ana", "Conesa", role = c("aut"), email = "ana.conesa@csic.es"), person("Maria Jose", "Nueda", role = c("aut"), email = "mj.nueda@ua.es"), @@ -14,10 +14,9 @@ Description: scMaSigPro is a polynomial regression-based approach inspired by significant expression profile differences among branching paths. License: GPL (>= 2) Imports: - assertthat, e1071, dplyr, entropy, ggplot2, igraph, magrittr, maSigPro, - MASS, MatrixGenerics, methods, parallel, parallelly, plotly, RColorConesa, - rlang, S4Vectors, scales, shiny, SingleCellExperiment, stats, stringr, - utils, mclust + assertthat, e1071, dplyr, entropy, ggplot2, igraph, magrittr, maSigPro (>= 1.74.0), + MASS, MatrixGenerics (>= 1.14.0), methods, parallel, parallelly, plotly, rlang, + S4Vectors (>= 0.40.2), scales, shiny, SingleCellExperiment (>= 1.24.0), stats, stringr, utils, mclust Depends: R (>= 4.0) Encoding: UTF-8 LazyData: true @@ -28,7 +27,6 @@ Suggests: knitr, rmarkdown, BiocStyle, - ComplexUpset, UpSetR, ggpubr Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index beb0120..ae2e77d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,6 +15,7 @@ export(eDense) export(eSparse) export(m3_select_path) export(pathAssign) +export(pb_counts) export(plotBinTile) export(plotDiagnostics) export(plotIntersect) @@ -25,6 +26,7 @@ export(queryCoeff) export(sc.cluster.trend) export(sc.filter) export(sc.p.vector) +export(sc.restruct) export(sc.set.poly) export(sc.squeeze) export(sc.t.fit) @@ -54,15 +56,12 @@ importFrom(MASS,glm.nb) importFrom(MASS,negative.binomial) importFrom(MatrixGenerics,rowMeans) importFrom(MatrixGenerics,rowSums) -importFrom(RColorConesa,colorConesa) -importFrom(RColorConesa,getConesaColors) importFrom(S4Vectors,DataFrame) importFrom(S4Vectors,coolcat) importFrom(S4Vectors,isEmpty) importFrom(SingleCellExperiment,SingleCellExperiment) importFrom(SingleCellExperiment,colData) importFrom(SingleCellExperiment,reducedDims) -importFrom(assertthat,assert_that) importFrom(dplyr,arrange) importFrom(dplyr,bind_rows) importFrom(dplyr,filter) @@ -84,7 +83,6 @@ importFrom(entropy,discretize) importFrom(igraph,get.data.frame) importFrom(maSigPro,get.siggenes) importFrom(maSigPro,make.design.matrix) -importFrom(maSigPro,position) importFrom(maSigPro,reg.coeffs) importFrom(magrittr,"%>%") importFrom(mclust,Mclust) @@ -160,10 +158,6 @@ importFrom(stats,residuals) importFrom(stats,rstandard) importFrom(stats,sd) importFrom(stats,setNames) -importFrom(stringr,str_remove) -importFrom(stringr,str_remove_all) -importFrom(stringr,str_split) -importFrom(stringr,str_split_i) importFrom(utils,View) importFrom(utils,combn) importFrom(utils,data) diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000..118eef1 --- /dev/null +++ b/NEWS.md @@ -0,0 +1,50 @@ +## scMaSigPro 0.0.4 (2024-07-30) + +* Dependency Changes + * Removed hard dependency on `RColorConesa` and `ComplexUpset`. + * Removed `assertthat`, `stringr` from namespace. + * Removed `maSigPro::position` from namespace. + +* Plotting Updates + * Updated plot functions to use a custom palette. + * Updated `plotTrendCluster` function for distinct layers. + * Enhanced `plotIntersect()` with return capabilities. + * Enhanced `plotTrend` with curves, lines, and points. 
+ * Updated ordering by frequency in `UpsetR`. + +* Function Updates + * Added `clean_string()` internal function. + * Added `sc.restruct()` function. + * Exported `pb_helpers()`. + +* Documentation Updates + * General code styling. + * Updated README with citation information. + * Single comprehensive data documentation. + * Added a new vignette. + +* Bug Fixes + * Removed `patchwork` call. + +## scMaSigPro 0.0.3 (2024-03-3) + +* Bug fixes + * `eSparse()` and `eDense()` indexes + * Removed the ComplexUpset error due to ggplot 3.5.0. + +* Remove Package Dependence + * SummarizedExperiment + * ComplexUpset + +* Package Suggestions Added + * Patchwork + * ComplexUpset + * UpSetR + +* Updated Test cases + * Functionality of `eSparse()` and `eDense()` + * Testing `sc.squeeze()` against manual pseudo-bulking + +## scMaSigPro 0.0.1 (2023-12-20) + +* Initial Github release. diff --git a/R/annotate_sce.R b/R/annotate_sce.R index 4c459d5..6962358 100644 --- a/R/annotate_sce.R +++ b/R/annotate_sce.R @@ -7,7 +7,6 @@ #' \pkg{SingleCellExperiment} package. #' #' @importFrom SingleCellExperiment colData -#' @importFrom assertthat assert_that #' #' @param sce A `SingleCellExperiment` object to be annotated. #' @param ptime_col A character string representing the column name @@ -41,7 +40,7 @@ annotate_sce <- function(sce, verbose = TRUE) { # Overwite the columns if (labels_exist) { - assert_that( + assertthat::assert_that( all(!is.null(exist_ptime_col) & !is.null(exist_path_col)), msg = paste( "Requested to set 'path_col' as", path_col, @@ -62,12 +61,12 @@ annotate_sce <- function(sce, cell.meta <- as.data.frame(colData(sce)) # Check columns - assert_that( + assertthat::assert_that( all(exist_ptime_col %in% colnames(cell.meta)), msg = paste("'", exist_ptime_col, "', doesn't exist in colData.") ) # Check columns - assert_that( + assertthat::assert_that( all(exist_path_col %in% colnames(cell.meta)), msg = paste("'", exist_path_col, "', doesn't exist in colData") ) diff --git a/R/as_scmp.R b/R/as_scmp.R index 78e50ff..22bd94d 100644 --- a/R/as_scmp.R +++ b/R/as_scmp.R @@ -6,7 +6,6 @@ #' SingleCellExperiment #' object to an instance of the scmpClass object. #' #' @importFrom SingleCellExperiment SingleCellExperiment -#' @importFrom assertthat assert_that #' #' @param object An S4 object of class `cds/CellDataSet` or `SingleCellExperiment`. #' @param from Character string specifying the class of 'object'. Use "cds" for @@ -65,13 +64,13 @@ as_scmp <- function(object, from = "cds", labels_exist = FALSE )) { # Check Conversion Type - assert_that(from %in% c("cds", "sce"), + assertthat::assert_that(from %in% c("cds", "sce"), msg = ("Currently, accepted options in the 'from' parameter are 'cds' ('cds/CellDataSet' object) and 'sce' ('SingleCellExperiment').") ) # Validate S4 - assert_that( + assertthat::assert_that( all(isS4(object) & all(is(object, "cell_data_set") | is(object, "SingleCellExperiment"))), msg = "Please provide object from one of the class 'cds/CellDataSet', or 'SingleCellExperiment/sce'." @@ -79,18 +78,18 @@ as_scmp <- function(object, from = "cds", # Check and validate additional parameters if (!is.null(additional_params)) { - assert_that(is.list(additional_params), + assertthat::assert_that(is.list(additional_params), msg = "Please provide 'additional_params' as a named list. 
See details for more information" ) # Check additional parameters if (from == "cds") { - assert_that(names(additional_params) %in% c("reduction_method", "labels_exist", "align_pseudotime_method"), + assertthat::assert_that(names(additional_params) %in% c("reduction_method", "labels_exist", "align_pseudotime_method"), msg = "Allowed additional parameters for 'cds' (cds/CellDataSet) are 'reduction_method', and 'labels_exist','align_pseudotime_method'." ) } else if (from == "sce") { - assert_that(all(names(additional_params) %in% c("exist_ptime_col", "exist_path_col", "labels_exist")), + assertthat::assert_that(all(names(additional_params) %in% c("exist_ptime_col", "exist_path_col", "labels_exist")), msg = "Allowed additional parameters for sce are 'exist_ptime_col', 'exist_path_col', and 'labels_exist'." ) diff --git a/R/create_scmp.R b/R/create_scmp.R index 708b229..649687a 100644 --- a/R/create_scmp.R +++ b/R/create_scmp.R @@ -33,23 +33,33 @@ create_scmp <- function(counts, path_col, use_as_bin = FALSE) { # Validation Checks - assert_that(ncol(counts) == nrow(cell_data), + assertthat::assert_that(ncol(counts) == nrow(cell_data), msg = paste("Number of cells in raw-counts and cell-level-metadata are different.") ) - assert_that(all(colnames(counts) == rownames(cell_data)), + assertthat::assert_that(all(colnames(counts) == rownames(cell_data)), msg = paste("Rownames of raw-counts and cell-level-metadata are different.") ) if (!is.null(bin_counts) || !is.null(bin_cell_data)) { - assert_that(nrow(bin_counts) == nrow(bin_cell_data), + assertthat::assert_that(nrow(bin_counts) == nrow(bin_cell_data), msg = paste("Number of cells in bin_counts and bin_cell_data are different.") ) - assert_that(all(rownames(bin_counts) == rownames(bin_cell_data)), + assertthat::assert_that(all(rownames(bin_counts) == rownames(bin_cell_data)), msg = paste("Rownames of bin_counts and bin_cell_data are different.") ) } + # Validate if path_col is present in cell_data + assertthat::assert_that(path_col %in% colnames(cell_data), + msg = paste("Column name '", path_col, "' not found in cell_data.") + ) + + # Validate if ptime_col is present in cell_data + assertthat::assert_that(ptime_col %in% colnames(cell_data), + msg = paste("Column name '", ptime_col, "' not found in cell_data.") + ) + # Create Single-Cell Experiment Object sparse_tmp <- SingleCellExperiment( list(counts = counts), @@ -61,7 +71,7 @@ create_scmp <- function(counts, Sparse = sparse_tmp, Dense = SingleCellExperiment(assays = list(bulk.counts = matrix(0, nrow = 0, ncol = 0))) ) - sparse_tmp <- NULL + rm(sparse_tmp) # Use as bin if (use_as_bin) { @@ -76,7 +86,7 @@ create_scmp <- function(counts, # Transfer Data scmpObj@Dense <- sparse_tmp - sparse_tmp <- NULL + rm(sparse_tmp) # Update the slots scmpObj@Parameters@bin_ptime_col <- ptime_col diff --git a/R/data_doc.R b/R/data_doc.R new file mode 100644 index 0000000..4f60ef2 --- /dev/null +++ b/R/data_doc.R @@ -0,0 +1,274 @@ +#' @title Simulated SingleCellExperiment Object +#' +#' @description +#' A small simulated SingleCellExperiment Object created using Splatter. +#' This dataset contains 200 cells and 100 genes and is simulated to have +#' a bifurcating topology of the trajectory, useful for testing and development +#' in `scMaSigPro`. The dataset is stored as an `sce` object from the class +#' `SingleCellExperiment` +#' +#' @details +#' The `splat.sce` object was created using the `splatSimulatePaths` function +#' from the Splatter package. 
The following code was used for the simulation: +#' \preformatted{ +#' # Load Required Packages +#' suppressPackageStartupMessages(library(splatter)) +#' suppressPackageStartupMessages(library(scran)) +#' suppressPackageStartupMessages(library(scuttle)) +#' suppressPackageStartupMessages(library(scater)) +#' suppressPackageStartupMessages(library(SingleCellExperiment)) +#' +#' set.seed(123) +#' +#' # Simulate +#' splat.sim <- splatSimulatePaths( +#' params = newSplatParams( +#' batchCells = 200, nGenes = 100), +#' group.prob = c(0.5, 0.5), +#' path.nSteps = c(100, 100), +#' de.prob = 0.3, de.facLoc = 0.2, +#' path.from = c(0, 0), # Bifurcation +#' verbose = FALSE) +#' +#' # Normalize +#' splat.sim <- logNormCounts(splat.sim, assay.type = "counts") +#' +#' # Reduce Dimensions +#' splat.sim <- runPCA(splat.sim, exprs_values = "logcounts", ncomponents = 2) +#' +#' # Visulize Steps and Groups +#' plotPCA(splat.sim, colour_by = "Step") +#' plotPCA(splat.sim, colour_by = "Group") +#' +#' # Create sce and transfer data +#' sce <- SingleCellExperiment(list(counts = splat.sim@@assays@@data@@listData$counts)) +#' sce@@colData <- splat.sim@@colData +#' rowData(sce) <- rowData(splat.sim) +#' reducedDims(sce) <- reducedDims(splat.sim) +#' splat.sim <- sce +#' +#' # Save +#' save(splat.sim, file = "data/splat.sim.RData") +#' +#' # Compress +#' tools::resaveRdaFiles(paths = "data/") +#' } +#' +#' This simulation creates a dataset with 100 genes and 200 cells, designed to mimic +#' a bifurcating trajectory typically observed in cellular differentiation. +#' +#' @usage +#' # Loading +#' data("splat.sim", package = "scMaSigPro") +#' +#' @format +#' An object of class `SingleCellExperiment` with 100 gene and 200 cells. +#' +#' @source +#' Simulated using the `Splatter` (1.26.0) package. +#' +#' @author Priyansh Srivastava \email{spriyansh29@@gmail.com} +#' +"splat.sim" + + +#' @title scMaSigPro Object with results +#' +#' @description +#' A small scMaSigPro object created from simulated data. Please follow Vignette +#' `scMaSigPro: Quick Start Guide` for more details. +#' +#' @usage +#' # Loading +#' data("scmp.ob", package = "scMaSigPro") +#' +#' @author Priyansh Srivastava \email{spriyansh29@@gmail.com} +#' +"scmp.ob" + +#' @title Simulated Multifurcating Trajectory SingleCellExperiment Object +#' +#' @description A simulated `CDS` object created using data from the tradeSeq article. +#' This dataset contains simulated single-cell RNA sequencing data with a +#' multifurcating trajectory, useful for testing and development in trajectory analysis +#' methods. The dataset is stored as a `new_cell_data_set` object from the `Monocle3` package. +#' +#' @details +#' +#' The `multi.lin.sce` object was created using the following steps: +#' +#' 1. The multifurcating trajectory data was downloaded from the `tradeSeq` article. +#' 2. The raw counts, cell metadata, and gene metadata were extracted and transformed into a `Monocle3` `cell_data_set` object. +#' 3. The data was preprocessed using PCA for dimensionality reduction and normalized using a log transformation. +#' 4. Both t-SNE and UMAP were used for dimensionality reduction, with t-SNE embeddings stored in the UMAP slots to enable graph learning. +#' 5. Cells were clustered and a principal graph was learned on the data. +#' 6. Pseudotime was inferred based on the learned graph. 
+#' +#' # Example code used to create `multi.lin.sce`: +#' +#' ```{r} +#' # Load Libraries +#' library(monocle3) +#' library(magrittr) +#' +#' # Download dataset of multi furcating trajectory from tradeSeq Article +#' # wget "https://github.com/statOmics/tradeSeqPaper/raw/master/simulation/sim2_dyntoy_multifurcating_4/multifurcating_4.rds" +#' +#' multi_ob <- readRDS(file = "data/multifurcating_4.rds") +#' +#' # Counts +#' raw_counts <- as.matrix(t(multi_ob[["counts"]])) +#' +#' # Cell Metadata +#' cell_metadata_data <- as.data.frame(multi_ob[["cell_info"]]) +#' rownames(cell_metadata_data) <- cell_metadata_data$cell_id +#' +#' # Gene Metadata +#' gene_metadata_data <- as.data.frame(multi_ob[["feature_info"]]) +#' rownames(gene_metadata_data) <- gene_metadata_data$feature_id +#' gene_metadata_data[["gene_short_name"]] <- gene_metadata_data$feature_id +#' +#' # Convert to Monocle3 CDS +#' cds <- new_cell_data_set( +#' expression_data = raw_counts, +#' cell_metadata = cell_metadata_data, +#' gene_metadata = gene_metadata_data +#' ) +#' +#' # Basic Steps +#' ## Normalize +#' cds <- preprocess_cds(cds, norm_method = "log", +#' method = "PCA", +#' num_dim = 20, +#' pseudo_count = 1, +#' scaling = TRUE, +#' verbose = FALSE) +#' +#' ## Reduce Dimensions +#' set.seed(123) +#' cds <- reduce_dimension(cds, reduction_method = "tSNE", verbose = FALSE, +#' preprocess_method = "PCA", +#' cores = 1) +#' cds <- reduce_dimension(cds, reduction_method = "UMAP", verbose = FALSE, +#' preprocess_method = "PCA", +#' cores = 1) +#' +#' # Overwrite UMAP Slots with tSNE as learn_graph only works on UMAP +#' reducedDims(cds)[["UMAP"]] <- reducedDims(cds)[["tSNE"]] +#' plot_cells(cds) + labs(title = "tSNE", x = "tSNE 1", y = "tSNE 2") +#' +#' ## Compute Clusters +#' cds <- cluster_cells(cds, verbose = FALSE, random_seed = 123, resolution = 0.8) +#' plot_cells(cds, color_cells_by = "cluster", cell_size = 3) + +#' labs(title = "tSNE", xlab = "tSNE 1", ylab = "tSNE 2") +#' plot_cells(cds, color_cells_by = "partition", cell_size = 3) + +#' labs(title = "tSNE", x = "tSNE 1", y = "tSNE 2") +#' +#' # Learn Graph +#' cds <- learn_graph(cds, verbose = FALSE, +#' learn_graph_control = list(minimal_branch_len = 15, +#' prune_graph=TRUE, ncenter=200)) +#' plot_cells(cds, color_cells_by = "cluster", cell_size = 3, +#' label_principal_points = TRUE) + labs(title = "tSNE", x = "tSNE 1", y = "tSNE 2") +#' +#' # Infer Pseudotime +#' cds <- order_cells(cds, root_pr_nodes = "Y_51", verbose = FALSE) +#' p <- plot_cells(cds, color_cells_by = "pseudotime", cell_size = 2,trajectory_graph_color = "red",trajectory_graph_segment_size = 2, +#' label_principal_points = TRUE) + scale_color_viridis_c() + +#' labs(title = "Simulated Multifurcating Trajectory",subtitle = "Simulation: Dyntoy | Latent Space: t-SNE", x = "tSNE-1", y = "tSNE-2", color = "Monocle3 Pseudotime")+ +#' theme(legend.position = "bottom") + geom_point(inherit.aes = TRUE, alpha = 0.5, cex = 0) +#' save(p, file = "extdata/multifurcating_trajectory.RData") +#' +#' ## Follow Steps from +#' ## https://statomics.github.io/tradeSeq/articles/Monocle.html#extracting-the-pseudotimes-and-cell-weights-for-tradeseq-1 +#' +#' # Get the closest vertice for every cell +#' y_to_cells <- principal_graph_aux(cds)$UMAP$pr_graph_cell_proj_closest_vertex %>% +#' as.data.frame() +#' y_to_cells$cells <- rownames(y_to_cells) +#' y_to_cells$Y <- y_to_cells$V1 +#' +#' # Get the root vertices +#' # It is the same node as above +#' root <- cds@principal_graph_aux$UMAP$root_pr_nodes +#' +#' # Extract Mst 
+#' mst <- principal_graph(cds)$UMAP +#' +#' # Get the other endpoints +#' endpoints <- names(which(igraph::degree(mst) == 1)) +#' endpoints <- endpoints[!endpoints %in% root] +#' +#' # For each endpoint +#' cellWeights <- lapply(endpoints, function(endpoint) { +#' # We find the path between the endpoint and the root +#' path <- igraph::shortest_paths(mst, root, endpoint)$vpath[[1]] +#' path <- as.character(path) +#' # We find the cells that map along that path +#' df <- y_to_cells[y_to_cells$Y %in% path, ] +#' df <- data.frame(weights = as.numeric(colnames(cds) %in% df$cells)) +#' colnames(df) <- endpoint +#' return(df) +#' }) %>% do.call(what = 'cbind', args = .) %>% +#' as.matrix() +#' rownames(cellWeights) <- colnames(cds) +#' colnames(cellWeights) <- paste("path",colnames(cellWeights), sep = "_") +#' +#' # Subset for 3 paths +#' cellWeights <- cellWeights[, c("path_Y_18", "path_Y_52", "path_Y_15"), drop=FALSE] +#' cellWeights <- cellWeights[rowSums(cellWeights) != 0, ] +#' +#' # Create Cell Data +#' cellData <- data.frame( +#' cell_id = rownames(cellWeights), +#' row.names = rownames(cellWeights) +#' ) +#' +#' # Create Cell Metadata +#' cellData[["group"]] <- apply(cellWeights, 1, FUN = function(x) { +#' +#' npath <- length(names(x[x == 1])) +#' +#' if(npath == 3){ +#' return("root") +#' }else if(npath == 2){ +#' return(paste(names(x[x == 1]), collapse = "|")) +#' }else{ +#' return(names(x[x == 1])) +#' } +#' }) +#' +#' # Get counts and Pseudotime +#' counts <- as.matrix(cds@assays@data@listData$counts) +#' counts <- counts[, rownames(cellData), drop=FALSE] +#' +#' # Get Pseudotime +#' pseudotime_vector <- pseudotime(cds) +#' cellData[["Monocle3_Pseudotime"]] <- pseudotime_vector[rownames(cellData)] +#' +#' # Create SingleCellExperiment Object +#' multi.lin.sce <- SingleCellExperiment(assays = list(counts = counts), +#' colData = cellData) +#' +#' # Add dimensionality reduction +#' redDim <- reducedDims(cds)[["tSNE"]] +#' redDim <- redDim[rownames(cellData), , drop=FALSE] +#' reducedDims(multi.lin.sce)[["TSNE"]] <- redDim +#' +#' # Save Object +#' save(multi.lin.sce, file = "data/multi.lin.sce.RData") +#' tools::resaveRdaFiles(paths = "data/") +#' tools::resaveRdaFiles(paths = "extdata/") +#' ``` +#' This dataset includes expression data, cell metadata, and gene metadata, and it is structured to facilitate the application of various trajectory analysis methods. 
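As with `splat.sim` above, the bundled object can be loaded directly from the package data; the short sketch below only assumes the object is shipped in `data/` as saved in the example code, mirroring the loading call shown for `splat.sim`:

```r
# Load the packaged multifurcating example dataset described above
data("multi.lin.sce", package = "scMaSigPro")
multi.lin.sce
```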
+#' +#' @name multi.lin.sce +#' @docType data +#' @keywords datasets +#' +#' @references +#' TradeSeq package: https://github.com/statOmics/tradeSeq +#' +#' @author Priyansh Srivastava \email{spriyansh29@@gmail.com} +#' +"multi.lin.sce" diff --git a/R/m3_select_path.R b/R/m3_select_path.R index 375ad19..fccaa9a 100644 --- a/R/m3_select_path.R +++ b/R/m3_select_path.R @@ -44,7 +44,7 @@ m3_select_path <- function(cds, )) { if (use_shiny == FALSE) { # Check whether the lower dimensions are calculated - assert_that( + assertthat::assert_that( all(names(m3_pp) %in% c( "root_pp", "path1_pp", "path2_pp", "path1_name", "path2_name" @@ -68,16 +68,16 @@ m3_select_path <- function(cds, count <- "count" # Validate is supplied opject is a valid - assert_that(is(cds, "cell_data_set"), + assertthat::assert_that(is(cds, "cell_data_set"), msg = "Please supply a valid monocle3 cdsect" ) # Check whether the lower dimensions are calculated - assert_that(nrow(as.data.frame(reducedDims(cds)[[toupper(latent_space)]])) > 1, + assertthat::assert_that(nrow(as.data.frame(reducedDims(cds)[[toupper(latent_space)]])) > 1, msg = paste(latent_space, "not found, in the cds") ) # Check whether the lower dimensions are calculated - assert_that(nrow(as.data.frame(reducedDims(cds)[[toupper(latent_space)]])) == ncol(cds), + assertthat::assert_that(nrow(as.data.frame(reducedDims(cds)[[toupper(latent_space)]])) == ncol(cds), msg = paste("Dimensions of", latent_space, "do not correspond to dimensions of counts") ) @@ -89,22 +89,22 @@ m3_select_path <- function(cds, dims[["cell"]] <- rownames(dims) # Check whether the lower dimensions are calculated - assert_that(all(rownames(dims) == colnames(cds)), + assertthat::assert_that(all(rownames(dims) == colnames(cds)), msg = paste("Cell Barcodes do not among", latent_space, "and counts") ) # Check if supplied anno_col exist in the cds - assert_that(anno_col %in% names(cds@colData), + assertthat::assert_that(anno_col %in% names(cds@colData), msg = paste(anno_col, "does not exist in the cell.level metadata") ) # Extract the vertex cell relationships - assert_that(!is.null(cds@principal_graph_aux@listData[[toupper(latent_space)]]$pr_graph_cell_proj_closest_vertex), + assertthat::assert_that(!is.null(cds@principal_graph_aux@listData[[toupper(latent_space)]]$pr_graph_cell_proj_closest_vertex), msg = paste("Vertex information is missing") ) # Check pseudotime - assert_that(!is.null(cds@principal_graph_aux@listData[[toupper(latent_space)]][["pseudotime"]]), + assertthat::assert_that(!is.null(cds@principal_graph_aux@listData[[toupper(latent_space)]][["pseudotime"]]), msg = paste("No Pseudotime information found") ) @@ -127,7 +127,7 @@ m3_select_path <- function(cds, ) # Check before merge - assert_that(all(anno.df[["cell"]] == dims[["cell"]]), + assertthat::assert_that(all(anno.df[["cell"]] == dims[["cell"]]), msg = paste("Cells in lower dimensions does not match with cells for which anno_col is supplied") ) @@ -147,10 +147,10 @@ m3_select_path <- function(cds, pTime.frame <- dims <- NULL # Extract the graph and MST - assert_that(!is.null(cds@principal_graph@listData[[toupper(latent_space)]]), + assertthat::assert_that(!is.null(cds@principal_graph@listData[[toupper(latent_space)]]), msg = paste("Principal Graph not found in cds") ) - assert_that(!is.null(cds@principal_graph_aux@listData[[toupper(latent_space)]]$dp_mst), + assertthat::assert_that(!is.null(cds@principal_graph_aux@listData[[toupper(latent_space)]]$dp_mst), msg = paste("MST not found in the cds") ) diff --git 
a/R/maSigPro_term_selection.R b/R/maSigPro_term_selection.R index feebcb4..3ec8046 100755 --- a/R/maSigPro_term_selection.R +++ b/R/maSigPro_term_selection.R @@ -19,7 +19,10 @@ sc.two.ways.stepback <- function(y = y, d = d, alfa = 0.05, family = gaussian(), d <- d[, names(result)[-1]] while (max > alfa) { varout <- names(result)[result == max] - pos <- position(matrix = d, vari = varout) + # Clean String + varout <- clean_string(varout, action = "remove") + + pos <- maSigPro::position(matrix = d, vari = varout) OUT <- as.data.frame(cbind(OUT, d[, pos])) x <- ncol(OUT) colnames(OUT)[x] <- colnames(d)[pos] @@ -133,7 +136,9 @@ sc.two.ways.stepfor <- max <- max(result2[-1], na.rm = TRUE) while (max > alfa) { varout <- names(result2)[result2 == max] - pos <- position(matrix = design, vari = varout) + # Clean String + varout <- clean_string(varout, action = "remove") + pos <- maSigPro::position(matrix = design, vari = varout) d <- as.data.frame(cbind(d, design[, pos])) x <- ncol(d) colnames(d)[x] <- colnames(design)[pos] @@ -191,11 +196,15 @@ sc.stepback <- function(y = y, d = d, alfa = 0.05, family = gaussian(), epsilon } } while (max > alfa) { - varout <- names(result$coefficients[, 4])[result$coefficients[ + varout <- as.character(names(result$coefficients[, 4])[result$coefficients[ , 4 - ] == max][1] - pos <- position(matrix = d, vari = varout) + ] == max][1]) + + # Clean String + varout <- clean_string(varout, action = "remove") + + pos <- maSigPro::position(matrix = d, vari = paste(varout)) d <- d[, -pos] if (length(result$coefficients[, 4][-1]) == 2) { min <- min(result$coefficients[, 4][-1], na.rm = TRUE) diff --git a/R/pb_helpers.R b/R/pb_helpers.R index ad30aa4..a6275af 100644 --- a/R/pb_helpers.R +++ b/R/pb_helpers.R @@ -13,17 +13,15 @@ #' the "cluster.members" column of the input data frame. #' #' @param x A data frame containing the "cluster.members" column. +#' @param clus_mem_col The name of the column containing the cluster members. #' #' @return A numeric value representing the size of the bin (number of elements #' in the "cluster.members" column). #' #' @author Priyansh Srivastava \email{spriyansh29@@gmail.com} #' -#' @importFrom stringr str_split #' #' @keywords internal -# Define a function 'calc_bin_size' which takes a data frame 'x' as input - calc_bin_size <- function(x, clus_mem_col = "scmp_cluster_members") { # Use the 'str_split' function from the 'stringr' package to split the 'cluster.members' column # of the input data frame 'x' by the '|' character. @@ -31,7 +29,7 @@ calc_bin_size <- function(x, clus_mem_col = "scmp_cluster_members") { # 'c()' is used to concatenate these vectors into a single vector. # Finally, 'length' is used to get the length of this vector (i.e., the number of split strings), # which is stored in the 'size' variable. - size <- length(c(str_split(x[[clus_mem_col]], "\\|"))[[1]]) + size <- length(c(stringr::str_split(x[[clus_mem_col]], "\\|"))[[1]]) # Convert the 'size' variable to a numeric value and return it as the result of the function return(as.numeric(size)) @@ -46,6 +44,7 @@ calc_bin_size <- function(x, clus_mem_col = "scmp_cluster_members") { #' @param vec A character vector where elements may be repeated and #' might contain the value "root". #' @param path_prefix Prefix used to annoate the paths, default is "Path". +#' @param root_label The label for the root element, default is "root". #' #' @return A character vector with the same length as the input where #' unique elements, excluding "root", are renamed to "Path1", "Path2", etc. 
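The stepwise-selection hunks above now pass the selected term name through the new internal `clean_string()` helper before calling `maSigPro::position()`. The helper's definition is not part of this diff, so the sketch below is only a guess at its behaviour (stripping quoting characters from coefficient names so they match design-matrix column names); the argument name `string` and the character class are assumptions.

```r
# Hypothetical sketch of the internal clean_string() helper referenced above.
# Assumption: with action = "remove" it strips backticks/quotes that model
# coefficient names can carry, so the cleaned name matches a design-matrix column.
clean_string <- function(string, action = "remove") {
  if (action == "remove") {
    string <- gsub("[`'\"]", "", string)
  }
  string
}

# e.g. clean_string("`beta2xPath2`") returns "beta2xPath2", which
# maSigPro::position() can then locate among colnames(d).
```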
@@ -80,6 +79,9 @@ convert_to_path <- function(vec, path_prefix, root_label) { #' \item{bin_size}{A numeric column representing the bin size.} #' \item{binned_time}{A numeric column representing the binned time.} #' } +#' @param bin_size_colname The name of the column containing the bin size. +#' @param bin_col The name of the column containing the bin intervals. +#' @param verbose Logical; if TRUE, prints detailed output. #' #' @return A numeric vector containing four elements: #' \describe{ @@ -91,7 +93,6 @@ convert_to_path <- function(vec, path_prefix, root_label) { #' #' @author Priyansh Srivastava \email{spriyansh29@@gmail.com} #' -#' @importFrom stringr str_remove_all #' #' @keywords internal create_range <- function(x, bin_size_colname = "scmp_bin_size", @@ -100,7 +101,7 @@ create_range <- function(x, bin_size_colname = "scmp_bin_size", y <- as.character(x[[bin_col]]) # Remove square and round brackets from the character string - y <- y %>% str_remove_all(pattern = "\\[|\\]|\\(|\\)") + y <- y %>% stringr::str_remove_all(pattern = "\\[|\\]|\\(|\\)") # Split the character string by comma and extract the first element (lower bound of the range) y1 <- as.numeric(sapply(strsplit(y, ","), "[", 1)) @@ -254,7 +255,7 @@ extract_fitting <- function(reg, lmf, model.glm.0, dis, family, name, vars.in, a coeff <- rep(0, (length(vars.in) + 1)) if (length(novar) != 0) { for (m in 1:length(novar)) { - coeff[position(dis, novar[m]) + 1] <- NA + coeff[maSigPro::position(dis, novar[m]) + 1] <- NA } } p.valor <- t <- as.numeric(rep(NA, (length(vars.in) + 1))) @@ -314,8 +315,8 @@ extract_fitting <- function(reg, lmf, model.glm.0, dis, family, name, vars.in, a #' @param nBins Expected number of bins. #' @param bin Column name for the bin column. #' @param bin.size Column name for the bin size column. -#' @param lbond Column name for the lower bound column. -#' @param ubond Column name for the upper bound column. +#' @param lbound Column name for the lower bound column. +#' @param ubound Column name for the upper bound column. #' @keywords internal extract_interval <- function(time.vector, nBins = 1, bin, bin.size, lbound, ubound) { @@ -401,7 +402,7 @@ select_longer_vector <- function(vector1, vector2, #' @param lbound The name of the lower bound column in `bin_table`. #' @param ubound The name of the upper bound column in `bin_table`. #' @param bin The name of the bin identifier column in `bin_table`. -#' @param bin_size The name of the bin size column in `bin_table`. +#' @param bin.size The name of the bin size column in `bin_table`. #' @param method The method for handling small bins: 'merge' to merge with previous or next bin, #' 'drop' to remove small bins, or 'ignore' to leave small bins as they are. #' @param drop The threshold below which a bin is considered too small and subject to the method. @@ -500,7 +501,7 @@ optimize_bin_max <- function(bin_table, max_allowed, verbose = TRUE, #' #' @author Priyansh Srivastava \email{spriyansh29@@gmail.com} #' -#' @keywords internal +#' @export pb_counts <- function(scmpObj, bin_mem_col = scmpObj@Parameters@bin_mem_col, @@ -508,12 +509,12 @@ pb_counts <- function(scmpObj, assay_name = "counts", cluster_count_by = "sum") { # Check Object Validity - assert_that(is(scmpObj, "ScMaSigPro"), + assertthat::assert_that(is(scmpObj, "ScMaSigPro"), msg = "Please provide object of class 'scMaSigPro'." 
) # Count slot - assert_that( + assertthat::assert_that( all( assay_name %in% names(scmpObj@Sparse@assays@data@listData) ), @@ -526,10 +527,10 @@ pb_counts <- function(scmpObj, # Get Pseudobulk Profile pseudo_bulk_profile <- as.data.frame(colData(scmpObj@Dense)) - assert_that(bin_mem_col %in% colnames(pseudo_bulk_profile), + assertthat::assert_that(bin_mem_col %in% colnames(pseudo_bulk_profile), msg = paste0("'", bin_mem_col, "' does not exist in level.meta.data") ) - assert_that(bin_col %in% colnames(pseudo_bulk_profile), + assertthat::assert_that(bin_col %in% colnames(pseudo_bulk_profile), msg = paste0("'", bin_col, "' does not exist in level.meta.data") ) @@ -542,7 +543,7 @@ pb_counts <- function(scmpObj, bin <- meta.info[i, , drop = FALSE] # Split the row - cell.vector <- c(str_split(bin[1], "\\|"))[[1]] + cell.vector <- c(stringr::str_split(bin[1], "\\|"))[[1]] # Get col cells col_indices <- which(colnames(counts) %in% cell.vector) @@ -554,7 +555,7 @@ pb_counts <- function(scmpObj, pb.vector <- switch(cluster_count_by, "mean" = as.matrix(round(rowMeans(bin_matrix))), "sum" = as.matrix(rowSums(bin_matrix)), - stop("Invalid cluster_count_by value. Please choose either 'mean' or 'sum'.") + stop("Invalid 'aggregate' value. Please choose either 'mean' or 'sum'.") ) # Return diff --git a/R/plotBinTile.R b/R/plotBinTile.R index 6934ccc..0936961 100644 --- a/R/plotBinTile.R +++ b/R/plotBinTile.R @@ -24,27 +24,18 @@ plotBinTile <- function(scmpObj, bin_size_col = scmpObj@Parameters@bin_size_col, bin_ptime_col = scmpObj@Parameters@bin_ptime_col) { # Check Object Validity - assert_that(is(scmpObj, "ScMaSigPro"), + assertthat::assert_that(is(scmpObj, "ScMaSigPro"), msg = "Please provide object of class 'scMaSigPro'." ) # Check whether the compression data exist or not compression.info <- as.data.frame(colData(scmpObj@Dense)) - # Check for extended data - if (nrow(compression.info) < 1) { - compression.info <- as.data.frame(colData(scmpObj@Sparse)) - } - # Check if values are binned - assert_that(nrow(compression.info) >= 1, - msg = "Please run 'sc.squeeze()' first." + assertthat::assert_that(nrow(as.data.frame(colData(scmpObj@Dense))) >= 1, + msg = "No binning information found. Please run 'sc.squeeze()', first." ) - # get conesa colors - conesa_colors <- getConesaColors()[c(TRUE, FALSE)][c(1:length(unique(compression.info[[path_col]])))] - names(conesa_colors) <- unique(unique(compression.info[[path_col]])) - # Create plot data plt.data <- data.frame( pTime = as.factor(compression.info[[bin_ptime_col]]), diff --git a/R/plotDiagnostics.R b/R/plotDiagnostics.R index f552c85..27a6440 100644 --- a/R/plotDiagnostics.R +++ b/R/plotDiagnostics.R @@ -5,7 +5,6 @@ #' diagnostics of optimized model or full model. #' #' @importFrom stats residuals fitted rstandard predict -#' @importFrom stringr str_remove #' #' @param scmpObj An object of class \code{\link{ScMaSigPro}}. #' @param feature_id Name of the gene to be plotted. 
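Because `pb_counts()` (documented in the pb_helpers.R hunks above) is now exported rather than internal, a minimal call sketch may help. It assumes a binned `ScMaSigPro` object produced by `sc.squeeze()` run with its defaults; `scmp_ob` is a placeholder object name, and only parameters visible in the signature above are used.

```r
# Aggregate sparse counts into pseudobulk counts per bin.
# 'scmp_ob' is a placeholder for an ScMaSigPro object; sc.squeeze() is assumed
# to have been run with defaults to create the binning information.
scmp_ob <- sc.squeeze(scmp_ob)
scmp_ob <- pb_counts(
  scmp_ob,
  assay_name       = "counts",
  cluster_count_by = "sum"   # or "mean" to average counts within each bin
)
```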
@@ -22,12 +21,12 @@ plotDiagnostics <- function(scmpObj, feature_id, model = "optimized") { # Check if the results exist - assert_that(!isEmpty(scmpObj@Significant@genes), + assertthat::assert_that(!isEmpty(scmpObj@Significant@genes), msg = paste("No Significant genes found, please run the workflow first") ) # Check model type - assert_that(all(model %in% c("optimized", "full", "intercept")), + assertthat::assert_that(all(model %in% c("optimized", "full", "intercept")), msg = paste("The requested gene is not available in the significant genes list") ) @@ -35,7 +34,7 @@ plotDiagnostics <- function(scmpObj, avail_genes <- unique(unlist(scmpObj@Significant@genes)) # Check if the requested gene is availble as singificant gene - assert_that(all(feature_id %in% avail_genes), + assertthat::assert_that(all(feature_id %in% avail_genes), msg = paste("The requested gene is not available in the significant genes list") ) @@ -71,7 +70,7 @@ plotDiagnostics <- function(scmpObj, all_terms <- colnames(coeff_matrix_sub)[-1] # Remove any occurance of beta - all_terms <- str_remove(string = all_terms, pattern = "beta") + all_terms <- stringr::str_remove(string = all_terms, pattern = "beta") # Remove intercept all_terms <- all_terms[-1] diff --git a/R/plotIntersect.R b/R/plotIntersect.R index 266af66..13d6b94 100644 --- a/R/plotIntersect.R +++ b/R/plotIntersect.R @@ -2,10 +2,9 @@ #' #' @description #' Generate UpSet Plot on Intersection of Significant Genes from scMaSigPro -#' object. It is a wrapper around `ComplexUpset::upset` and `UpSetR::upset`. +#' object. It is a wrapper `UpSetR::upset`. #' #' @importFrom S4Vectors isEmpty -#' @importFrom RColorConesa colorConesa #' @importFrom utils packageVersion #' #' @param scmpObj An object of class \code{\link{ScMaSigPro}}. @@ -16,53 +15,51 @@ #' @param keep_empty_groups Whether empty sets should be kept (including sets #' which are only empty after filtering by size) #' @param show_sets_size The overall set sizes plot, e.g. from upset_set_size() -#' @param package Which package to use for the UpsetPlot. Options are 'ComplexUpset' -#' or 'UpSetR' (Default). -#' @param verbose Print detailed output in the console. (Default is TRUE) +#' @param return If set to true, it will return dataframe from the UpSetR::fromList(). +#' (Default is TRUE) #' -#' @return ggplot2 plot object for 'ComplexUpset' or upset object for 'UpSetR'. +#' @return upset object for 'UpSetR'. #' #' @author Priyansh Srivastava \email{spriyansh29@@gmail.com} #' #' @export plotIntersect <- function(scmpObj, - package = "UpSetR", - min_intersection_size = 2, - keep_empty_groups = TRUE, + min_intersection_size = 1, + keep_empty_groups = FALSE, width_ratio = 0.1, show_sets_size = FALSE, - verbose = TRUE) { + return = FALSE) { + # Debug + # scmpObj <- multi_scmp_ob + # min_intersection_size <- 2 + # keep_empty_groups <- FALSE + # width_ratio <- 0.1 + # show_sets_size <- FALSE + # verbose <- TRUE + # Check the data - assert_that( + assertthat::assert_that( is(scmpObj, "ScMaSigPro"), msg = "Please supply an object of the class 'ScMaSigPro'" ) # Check if siggenes results exist for groups - assert_that(!isEmpty(scmpObj@Significant@genes), + assertthat::assert_that(!isEmpty(scmpObj@Significant@genes), msg = "'sig.genes@Summary' slot is empty, please run 'sc.get.siggenes'" ) - # Check for possible options - assert_that(package %in% c("ComplexUpset", "UpSetR"), - msg = "Please provide a valid package name for UpSet plot. 
Options are 'ComplexUpset' or 'UpSetR'" - ) - - # Check if package is installed - if (!requireNamespace(package, quietly = TRUE)) { - stop(paste0("Package '", package, "' is not installed. Please install it first.")) - } else { - if (verbose) { - message(paste0("Using '", package, "' for UpSet plot.")) - } + if (!keep_empty_groups) { + keep_empty_groups <- NULL } gene_list <- scmpObj@Significant@genes - if (package == "UpSetR") { - # Create list object - upset_r_gene_list <- UpSetR::fromList(gene_list) + # Create list object + upset_r_gene_list <- fromListWithNames(gene_list) + if (return) { + return(as.data.frame(upset_r_gene_list)) + } else { # Create Plot p <- UpSetR::upset( upset_r_gene_list, @@ -75,77 +72,10 @@ plotIntersect <- function(scmpObj, shade.color = "purple", text.scale = 1.5, sets.x.label = "Number of Features", - sets.bar.color = "#EE446F" + sets.bar.color = "#EE446F", + keep.order = TRUE, + order.by = "freq" ) return(p) - } else { - # Check version of the ggplot2 - if (packageVersion("ggplot2") >= "3.5.0") { - warning("Please downgrade the ggplot2 to '>= 3.5.0' to use 'ComplesUpset'. We will support the latest version in future. - Visit:'https://github.com/krassowski/complex-upset/issues/195' for more details.") - } else { - # # Create a unique list of all genes - all_genes <- unique(unlist(gene_list)) - - # Initialize the data frame - gene_df <- data.frame(gene = all_genes) - - # Add columns for each pathway - for (pathway in names(gene_list)) { - gene_df[[pathway]] <- gene_df$gene %in% gene_list[[pathway]] - } - - # Binarize Variables and set factors - gene_df[, -1] <- lapply(gene_df[, -1], function(x) as.integer(x)) - - # Get conesa colours - col_pal <- colorConesa(3) - - if (show_sets_size) { - show_sets_size <- ComplexUpset::upset_set_size() - } - - # Create Upset - p <- ComplexUpset::upset( - data = gene_df, - intersect = colnames(gene_df)[-1], - width_ratio = width_ratio, - min_size = min_intersection_size, - keep_empty_groups = keep_empty_groups, - name = "Vars", - # wrap=FALSE, - set_sizes = show_sets_size, - # stripes=c('deepskyblue1'), - matrix = ( - - ComplexUpset::intersection_matrix( - geom = geom_point( - shape = "square", - size = 3.5 - ), - segment = geom_segment( - linetype = "dotted", - color = col_pal[1] - ) - ) - + scale_color_manual( - values = c("TRUE" = col_pal[1], "FALSE" = col_pal[3]), - # labels=c('TRUE'='yes', 'FALSE'='no'), - breaks = c("TRUE", "FALSE") - ) - ), - base_annotations = list( - "Intersection size" = ComplexUpset::intersection_size( - counts = TRUE, - mapping = aes(fill = "bars_color") - ) - + scale_fill_manual(values = c("bars_color" = col_pal[2]), guide = "none") - ) - ) + ggtitle("Intersection of features among paths") + - theme(legend.position = "none", legend.title = element_text(hjust = 0.5)) - - # return plot - return(p) - } } } diff --git a/R/plotTrend.R b/R/plotTrend.R index 553cfd2..6fac16d 100644 --- a/R/plotTrend.R +++ b/R/plotTrend.R @@ -4,7 +4,6 @@ #' Plot trend of the single gene across the binned pseudotime. #' #' @import ggplot2 -#' @importFrom RColorConesa getConesaColors #' #' @param scmpObj An object of class \code{\link{ScMaSigPro}}. #' @param feature_id Name of the gene to be plotted. @@ -20,6 +19,9 @@ #' \code{scMaSigPro::sc.filter()}. (Default is TRUE) #' @param summary_mode Compress the expression values per replicate (if present) #' per binned pseudotime point. Default is 'median'. Other option 'mean' +#' @param curves Whether to plot the fitted curves. 
(Default is TRUE) +#' @param lines Whether to plot the lines. (Default is FALSE) +#' @param points Whether to plot the points. (Default is TRUE) #' #' @return ggplot2 plot object. #' @@ -35,7 +37,29 @@ plotTrend <- function(scmpObj, logType = "log", pseudoCount = 1, significant = TRUE, - summary_mode = "median") { + summary_mode = "median", + curves = TRUE, + lines = FALSE, + points = TRUE) { + # Debugg + # scmpObj <- multi_scmp_ob_A + # feature_id <- gene_br_Y_18[1] + # xlab <- "Pooled Pseudotime" + # ylab <- "Pseudobulk Expression" + # plot <- "counts" + # summary_mode <- "median" + # logs <- FALSE + # logType <- "log" + # smoothness <- 1 + # includeInflu <- TRUE + # verbose <- TRUE + # pseudoCount <- 1 + # significant <- FALSE + # curves <- TRUE + # lines <- FALSE + # points <- TRUE + # parallel <- FALSE + # Invoke Variables pb.counts <- "pb.counts" pooled.time <- "pooled.time" @@ -45,13 +69,16 @@ plotTrend <- function(scmpObj, offset_vector <- scmpObj@Design@offset # Check summary_mode - assert_that(any(summary_mode %in% c("median", "mean")), + assertthat::assert_that(any(summary_mode %in% c("median", "mean")), msg = paste( paste0("'", summary_mode, "'"), "is not a valid option. Please use one of", paste(c("median", "mean"), collapse = ", ") ) ) + # Check Assertion + assertthat::assert_that(curves || lines || points, msg = "At least one of 'curves', 'lines', or 'points' must be TRUE.") + # Extract edisgn alloc.frame <- scmpObj@Design@assignment_matrix %>% as.data.frame() @@ -59,12 +86,12 @@ plotTrend <- function(scmpObj, bulk.counts <- scmpObj@Dense@assays@data@listData$bulk.counts # Check - assert_that(all(feature_id %in% rownames(bulk.counts)), - msg = "Feature Id doesn't exist please select another one" + assertthat::assert_that(all(feature_id %in% rownames(bulk.counts)), + msg = paste0("'", feature_id, "' doesn't exist please select another one.") ) if (significant) { - assert_that(any(feature_id %in% unique(unlist(scmpObj@Significant@genes))), + assertthat::assert_that(any(feature_id %in% unique(unlist(scmpObj@Significant@genes))), msg = "Feature Id didn't pass the R2 threshold, please re-run sc.filter, with lower a value or set 'significant' to 'FALSE'" ) } @@ -106,7 +133,7 @@ plotTrend <- function(scmpObj, # stop() for (i in path.names) { # Extract Coeff - a <- reg.coeffs( + a <- maSigPro::reg.coeffs( coefficients = betas, groups.vector = groups.vector, group = i @@ -140,8 +167,8 @@ plotTrend <- function(scmpObj, xlim[2] <- max(points.df[[pooled.time]]) - conesa_colors <- getConesaColors()[c(TRUE, FALSE)][c(1:length(unique(points.df[[path]])))] - names(conesa_colors) <- unique(points.df[[path]]) + scmp_pal <- scmp_colors(n = length(path.names)) + names(scmp_pal) <- unique(points.df[[path]]) # Extract sol data.sol <- showSol(scmpObj, view = FALSE, return = TRUE) @@ -153,20 +180,24 @@ plotTrend <- function(scmpObj, } # if log is requestion - if (logs) { - if (logType == "log2") { - points.df$pb.counts <- log2(points.df$pb.counts + pseudoCount) - ylab <- paste0("log2(", ylab, ")") - } else if (logType == "log") { - points.df$pb.counts <- log(points.df$pb.counts + pseudoCount) - ylab <- paste0("log(", ylab, ")") - } else if (logType == "log10") { - points.df$pb.counts <- log10(points.df$pb.counts + pseudoCount) - ylab <- paste0("log10(", ylab, ")") - } else { - stop("'logType' should be one of 'log2', 'log10', 'log'") + suppressWarnings( + expr = { + if (logs) { + if (logType == "log2") { + points.df$pb.counts <- log2(points.df$pb.counts + pseudoCount) + ylab <- paste0("log2(", 
ylab, ")") + } else if (logType == "log") { + points.df$pb.counts <- log(points.df$pb.counts + pseudoCount) + ylab <- paste0("log(", ylab, ")") + } else if (logType == "log10") { + points.df$pb.counts <- log10(points.df$pb.counts + pseudoCount) + ylab <- paste0("log10(", ylab, ")") + } else { + stop("'logType' should be one of 'log2', 'log10', 'log'") + } + } } - } + ) # Generate line.df line.df <- points.df @@ -174,12 +205,12 @@ plotTrend <- function(scmpObj, # Apply Summary Operation if (summary_mode == "mean") { line.df <- line.df %>% - group_by(pooled.time, path) %>% - summarize(pb.counts = mean(pb.counts), .groups = "drop") + dplyr::group_by(pooled.time, path) %>% + dplyr::summarize(pb.counts = mean(pb.counts), .groups = "drop") } else if (summary_mode == "median") { line.df <- line.df %>% - group_by(pooled.time, path) %>% - summarize(pb.counts = median(pb.counts), .groups = "drop") + dplyr::group_by(pooled.time, path) %>% + dplyr::summarize(pb.counts = median(pb.counts), .groups = "drop") } if (sum(offset_vector) != 0) { @@ -187,17 +218,34 @@ plotTrend <- function(scmpObj, } # Plot + layer_names <- c(NULL) p <- ggplot() + - geom_point(data = points.df, aes(x = pooled.time, y = pb.counts, color = path), fill = "#102C57", alpha = 0.5, size = 2, stroke = 1, shape = 21) + - geom_line(data = line.df, aes(x = pooled.time, y = pb.counts, color = path), linetype = "solid", linewidth = 1, alpha = 0.7) + - geom_line(data = curve.df, aes(x = x, y = y, color = path), linetype = "dashed", linewidth = 1, alpha = 0.7) + ggtitle( paste("Feature Id:", feature_id), subtitle = paste("R2:", round(data.sol[, 2], 3), "| p-Value:", round(data.sol[, 1], 3)) ) + xlab(xlab) + - ylab(ylab) + - theme_classic(base_size = 12) + + ylab(ylab) + names(p$layers) <- layer_names + + if (points) { + p <- p + geom_point(data = points.df, aes(x = pooled.time, y = pb.counts, color = path), fill = "#102C57", alpha = 0.4, size = 1.5, stroke = 1, shape = 21) + layer_names <- c(layer_names, "points") + names(p$layers) <- layer_names + } + if (lines) { + p <- p + geom_line(data = line.df, aes(x = pooled.time, y = pb.counts, color = path), linetype = "dashed", linewidth = 0.6, alpha = 0.7) + layer_names <- c(layer_names, "lines") + names(p$layers) <- layer_names + } + if (curves) { + p <- p + geom_line(data = curve.df, aes(x = x, y = y, color = path), linetype = "solid", linewidth = 0.7, alpha = 0.8) + layer_names <- c(layer_names, "curves") + names(p$layers) <- layer_names + } + + + p <- p + theme_classic(base_size = 12) + theme( legend.position = "bottom", panel.grid.major = element_line(color = "grey90", linewidth = 0.3, linetype = "dashed"), @@ -206,7 +254,6 @@ plotTrend <- function(scmpObj, scale_x_continuous(breaks = seq(min(xlim), max(xlim), by = round(log10(length(points.df[[pooled.time]]))))) + labs(color = "Paths") + # coord_cartesian(xlim = xlim, ylim = ylim) + - scale_color_manual(values = conesa_colors) - # + scale_color_manual(values = scmp_pal) return(p) } diff --git a/R/plotTrendCluster.R b/R/plotTrendCluster.R index 5df001b..c7fa596 100644 --- a/R/plotTrendCluster.R +++ b/R/plotTrendCluster.R @@ -5,9 +5,7 @@ #' #' @import ggplot2 #' @importFrom stats complete.cases cutree hclust -#' @importFrom RColorConesa getConesaColors #' @importFrom mclust Mclust -#' @importFrom stringr str_split_i #' @importFrom stats as.dist cor kmeans #' #' @param scmpObj An object of class \code{\link{ScMaSigPro}}. 
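With the new layer toggles added to `plotTrend()` (and the `return` switch added to `plotIntersect()` earlier in this diff), typical calls might look like the sketch below. `scmp.ob` is the bundled results object documented above; the feature id is a placeholder and must exist in the object's pseudobulk counts.

```r
data("scmp.ob", package = "scMaSigPro")

# Trend of one feature: fitted curve plus raw pseudobulk points, no connecting lines.
# "Gene9" is a placeholder id; any feature present in the pseudobulk counts works
# when significant = FALSE.
plotTrend(scmp.ob,
          feature_id  = "Gene9",
          curves      = TRUE,
          lines       = FALSE,
          points      = TRUE,
          significant = FALSE)

# UpSet plot of significant genes, or the underlying membership table instead.
plotIntersect(scmp.ob, min_intersection_size = 1)
membership_df <- plotIntersect(scmp.ob, return = TRUE)
```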
@@ -29,22 +27,50 @@ #' @param summary_mode Compress the expression values per replicate (if present) #' per binned pseudotime point. Default is 'median'. Other option 'mean' #' @param pseudoCount Add a pseudo-count before taking the log. (Default is 1) +#' @param curves Whether to plot the fitted curves. (Default is TRUE) +#' @param lines Whether to plot the lines. (Default is FALSE) +#' @param points Whether to plot the points. (Default is TRUE) +#' @param loess_span The fraction of the data used when estimating each y-value, +#' when plotting curves. (Default is 0.75) #' #' @return ggplot2 plot object. #' @author Priyansh Srivastava \email{spriyansh29@@gmail.com} #' @export plotTrendCluster <- function(scmpObj, xlab = "Pooled Pseudotime", - ylab = "Pseudobulk Expression", + ylab = "log(Pseudobulk Expression)", plot = "counts", summary_mode = "median", - logs = FALSE, logType = "log", + logs = TRUE, logType = "log", smoothness = 1, includeInflu = TRUE, verbose = TRUE, pseudoCount = 1, significant = FALSE, - parallel = FALSE) { + curves = TRUE, + lines = FALSE, + points = TRUE, + parallel = FALSE, + loess_span = 0.8) { + # # # Debugg + # scmpObj <- multi_scmp_ob_A + # xlab <- "Pooled Pseudotime" + # ylab <- "Pseudobulk Expression" + # plot <- "counts" + # summary_mode <- "median" + # logs <- TRUE + # logType <- "log" + # smoothness <- 1 + # includeInflu <- TRUE + # verbose <- TRUE + # pseudoCount <- 1 + # significant <- FALSE + # c <- curves <- TRUE + # l <- lines <- TRUE + # p <- points <- TRUE + # parallel <- FALSE + + # Global vars scmp_clusters <- "scmp_clusters" feature_id <- "feature_id" @@ -52,11 +78,14 @@ plotTrendCluster <- function(scmpObj, offset_vector <- scmpObj@Design@offset # Check - assert_that(!isEmpty(scmpObj@Significant@clusters), + assertthat::assert_that(!isEmpty(scmpObj@Significant@clusters), msg = "Please run 'sc.cluster.trend', before plotting cluster trends" ) - assert_that(any(plot %in% c("coeff", "counts")), + # Check Assertion + assertthat::assert_that(curves || lines || points, msg = "At least one of 'curves', 'lines', or 'points' must be TRUE.") + + assertthat::assert_that(any(plot %in% c("coeff", "counts")), msg = paste( paste0("'", plot, "'"), "is not a valid option. 
Please use one of", paste(c("coeff", "counts"), collapse = ", ") @@ -93,7 +122,10 @@ plotTrendCluster <- function(scmpObj, log = logs, log_type = logType, pCount = pseudoCount, sig = significant, - summary = summary_mode) { + summary = summary_mode, + c = curves, + l = lines, + p = points) { # Run per genes plt <- tryCatch( { @@ -106,6 +138,9 @@ plotTrendCluster <- function(scmpObj, pseudoCount = pCount, significant = sig, summary_mode = summary, + curves = c, + lines = l, + points = p ) return(plt) }, @@ -130,16 +165,27 @@ plotTrendCluster <- function(scmpObj, plt.list <- Filter(Negate(is.null), all.plt.list) # Start Traversing - data.list <- lapply(plt.list, function(gene_i.plot) { - # Extract layers - point.data <- gene_i.plot$layers[[1]][["data"]] - line.data <- gene_i.plot$layers[[2]][["data"]] - curve.data <- gene_i.plot$layers[[3]][["data"]] + data.list <- lapply(plt.list, function(gene_i.plot, + c = curves, + l = lines, + p = points) { + point.data <- NULL + line.data <- NULL + curve.data <- NULL - # Set columns - colnames(point.data) <- c("x_axis", "y_axis", "group") - colnames(line.data) <- c("x_axis", "group", "y_axis") - colnames(curve.data) <- c("x_axis", "y_axis", "group") + # Extract layers + if (c) { + curve.data <- gene_i.plot$layers[["curves"]][["data"]] + colnames(curve.data) <- c("x_axis", "y_axis", "group") + } + if (l) { + line.data <- gene_i.plot$layers[["lines"]][["data"]] + colnames(line.data) <- c("x_axis", "y_axis", "group") + } + if (p) { + point.data <- gene_i.plot$layers[["points"]][["data"]] + colnames(point.data) <- c("x_axis", "y_axis", "group") + } return(list( points = point.data, @@ -168,44 +214,72 @@ plotTrendCluster <- function(scmpObj, # Df list collapsed.df <- lapply(cluster.data.list, function(clus_i.list, - summary = summary_mode) { - # Extract Sub.list - line.list <- lapply(clus_i.list, function(line) { - return(line[["line"]]) - }) - point.list <- lapply(clus_i.list, function(line) { - return(line[["points"]]) - }) - curve.list <- lapply(clus_i.list, function(line) { - return(line[["curve"]]) - }) - - # Collapse to Dataframe - line.df <- do.call("rbind", line.list) - point.df <- do.call("rbind", point.list) - curve.df <- do.call("rbind", curve.list) + summary = summary_mode, + c = curves, + l = lines, + p = points) { + point.df <- NULL + line.df <- NULL + curve.df <- NULL + # Define a summarization function + + if (l) { + # Extract Sub.list + line.list <- lapply(clus_i.list, function(line) { + return(line[["line"]]) + }) + line.df <- do.call("rbind", line.list) + } + + if (p) { + point.list <- lapply(clus_i.list, function(point) { + return(point[["points"]]) + }) + point.df <- do.call("rbind", point.list) + } + + if (c) { + curve.list <- lapply(clus_i.list, function(curve) { + return(curve[["curve"]]) + }) + curve.df <- do.call("rbind", curve.list) + } + + # Grouping and summarizing with .data pronoun if (summary == "mean") { - line.df <- line.df %>% - group_by(.data$x_axis, .data$group) %>% - summarize(y_axis = mean(.data$y_axis, na.rm = TRUE), .groups = "drop") - point.df <- point.df %>% - group_by(.data$x_axis, .data$group) %>% - summarize(y_axis = mean(.data$y_axis, na.rm = TRUE), .groups = "drop") - curve.df <- curve.df %>% - group_by(.data$x_axis, .data$group) %>% - summarize(y_axis = mean(.data$y_axis, na.rm = TRUE), .groups = "drop") + if (!is.null(line.df)) { + line.df <- line.df %>% + dplyr::group_by(.data$x_axis, .data$group) %>% + dplyr::summarize(y_axis = mean(.data$y_axis, na.rm = TRUE), .groups = "drop") + } + if 
(!is.null(point.df)) { + point.df <- point.df %>% + dplyr::group_by(.data$x_axis, .data$group) %>% + dplyr::summarize(y_axis = mean(.data$y_axis, na.rm = TRUE), .groups = "drop") + } + if (!is.null(curve.df)) { + curve.df <- curve.df %>% + dplyr::group_by(.data$x_axis, .data$group) %>% + dplyr::summarize(y_axis = mean(.data$y_axis, na.rm = TRUE), .groups = "drop") + } } else if (summary == "median") { - line.df <- line.df %>% - group_by(.data$x_axis, .data$group) %>% - summarize(y_axis = median(.data$y_axis, na.rm = TRUE), .groups = "drop") - point.df <- point.df %>% - group_by(.data$x_axis, .data$group) %>% - summarize(y_axis = median(.data$y_axis, na.rm = TRUE), .groups = "drop") - curve.df <- curve.df %>% - group_by(.data$x_axis, .data$group) %>% - summarize(y_axis = median(.data$y_axis, na.rm = TRUE), .groups = "drop") + if (!is.null(line.df)) { + line.df <- line.df %>% + dplyr::group_by(.data$x_axis, .data$group) %>% + dplyr::summarize(y_axis = median(.data$y_axis, na.rm = TRUE), .groups = "drop") + } + if (!is.null(point.df)) { + point.df <- point.df %>% + dplyr::group_by(.data$x_axis, .data$group) %>% + dplyr::summarize(y_axis = median(.data$y_axis, na.rm = TRUE), .groups = "drop") + } + if (!is.null(curve.df)) { + curve.df <- curve.df %>% + dplyr::group_by(.data$x_axis, .data$group) %>% + dplyr::summarize(y_axis = median(.data$y_axis, na.rm = TRUE), .groups = "drop") + } } return(list( @@ -225,49 +299,59 @@ plotTrendCluster <- function(scmpObj, cluster_name <- paste0("cluster_", i) # Extract 'line', 'curve', and 'points' data frames and add the 'cluster' column - line_df <- collapsed.df[[i]]$line - line_df$cluster <- cluster_name - lines_list[[i]] <- line_df - - curve_df <- collapsed.df[[i]]$curve - curve_df$cluster <- cluster_name - curves_list[[i]] <- curve_df + if (lines) { + line_df <- collapsed.df[[i]]$line + line_df$cluster <- cluster_name + lines_list[[i]] <- line_df + } - points_df <- collapsed.df[[i]]$points - points_df$cluster <- cluster_name - points_list[[i]] <- points_df + if (curves) { + curve_df <- collapsed.df[[i]]$curve + curve_df$cluster <- cluster_name + curves_list[[i]] <- curve_df + } + if (points) { + points_df <- collapsed.df[[i]]$points + points_df$cluster <- cluster_name + points_list[[i]] <- points_df + } } # Combine the data frames of the same type - lines_combined <- do.call(rbind, lines_list) %>% as.data.frame() - curves_combined <- do.call(rbind, curves_list) %>% as.data.frame() - points_combined <- do.call(rbind, points_list) %>% as.data.frame() + if (lines) { + lines_combined <- do.call(rbind, lines_list) %>% as.data.frame() + lines_combined <- merge(freq.table, lines_combined, by = "cluster") + lines_combined[["cluster"]] <- paste0( + paste("Cluster", stringr::str_split_i(string = lines_combined[["cluster"]], pattern = "_", i = 2), sep = ": "), + " (", lines_combined[["num"]], " Features)" + ) + } + if (curves) { + curves_combined <- do.call(rbind, curves_list) %>% as.data.frame() + curves_combined <- merge(freq.table, curves_combined, by = "cluster") + curves_combined[["cluster"]] <- paste0( + paste("Cluster", stringr::str_split_i(string = curves_combined[["cluster"]], pattern = "_", i = 2), sep = ": "), + " (", curves_combined[["num"]], " Features)" + ) + } + if (points) { + points_combined <- do.call(rbind, points_list) %>% as.data.frame() + points_combined <- merge(freq.table, points_combined, by = "cluster") + # Fix cluster info + points_combined[["cluster"]] <- paste0( + paste("Cluster", stringr::str_split_i(string = 
points_combined[["cluster"]], pattern = "_", i = 2), sep = ": "), + " (", points_combined[["num"]], " Features)" + ) + } if (verbose) { message("Calculating 'loess', hang tight this can take a while depending on the smoothness...") } - # Add numbers - points_combined <- merge(freq.table, points_combined, by = "cluster") - lines_combined <- merge(freq.table, lines_combined, by = "cluster") - curves_combined <- merge(freq.table, curves_combined, by = "cluster") - - # Fix cluster info - points_combined[["cluster"]] <- paste0( - paste("Cluster", str_split_i(string = points_combined[["cluster"]], pattern = "_", i = 2), sep = ": "), - " (", points_combined[["num"]], " Features)" - ) - lines_combined[["cluster"]] <- paste0( - paste("Cluster", str_split_i(string = lines_combined[["cluster"]], pattern = "_", i = 2), sep = ": "), - " (", lines_combined[["num"]], " Features)" - ) - curves_combined[["cluster"]] <- paste0( - paste("Cluster", str_split_i(string = curves_combined[["cluster"]], pattern = "_", i = 2), sep = ": "), - " (", curves_combined[["num"]], " Features)" - ) - if (sum(offset_vector) != 0) { - lines_combined <- points_combined + if (lines) { + lines_combined <- points_combined + } } @@ -276,33 +360,44 @@ plotTrendCluster <- function(scmpObj, # View(lines_combined) # Initiate plotting - p <- ggplot() + - geom_point( + p <- ggplot() + + if (points) { + p <- p + geom_point( data = points_combined, aes(x = .data$x_axis, y = .data$y_axis, color = .data$group), fill = "#102C57", alpha = 0.5, size = 0.5, stroke = 0.5, shape = 21 - ) + - geom_path( + ) + } + if (lines) { + p <- p + geom_path( data = lines_combined, aes( x = .data$x_axis, y = .data$y_axis, color = .data$group, group = .data$group, - ), linetype = "solid", linewidth = 0.5 - ) + - geom_smooth( + ), linetype = "dashed", linewidth = 0.5 + ) + } + + if (curves) { + p <- p + geom_smooth( data = curves_combined, se = FALSE, - formula = y ~ x, span = 0.7, - method = "loess", + # formula = y ~ s(x , k = scmpObj@Parameters@poly_degree, bs = "cs"), + # method = "gam", + formula = y ~ x, + method = "loess", span = loess_span, aes( x = .data$x_axis, y = .data$y_axis, color = .data$group, group = .data$group, - ), linetype = "dashed", linewidth = 0.5 - ) + - facet_wrap(~ .data$cluster, scales = "free_y") + # Create a panel for each cluster_id - scale_color_manual(values = colorConesa(length(unique(lines_combined$group)))) + # Custom colors for paths + ), linetype = "solid", linewidth = 0.5 + ) + } + + p <- p + facet_wrap(~ .data$cluster, scales = "free_y") + # Create a panel for each cluster_id + scale_color_manual(values = scmp_colors(length(unique(cDense(scmpObj)[[scmpObj@Parameters@path_col]])))) + # Custom colors for paths theme_classic(base_size = 10) + theme( strip.background = element_blank(), strip.text.x = element_text(size = 10, angle = 0), - legend.position = "bottom", legend.title.align = 0.5, + legend.position = "bottom", legend.title = element_text(hjust = 0.5), panel.grid.major = element_line(color = "grey90", linewidth = 0.3, linetype = "dashed"), panel.grid.minor = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1) # Rotate x-axis text if necessary diff --git a/R/queryCoeff.R b/R/queryCoeff.R index 0e9afe7..3e70572 100644 --- a/R/queryCoeff.R +++ b/R/queryCoeff.R @@ -31,17 +31,17 @@ queryCoeff <- function(scmpObj, strictly = FALSE, verbose = TRUE) { # Check Validity of the object - assert_that(is(scmpObj, "ScMaSigPro"), + assertthat::assert_that(is(scmpObj, "ScMaSigPro"), msg = "Please provide object of 
class 'ScMaSigPro'" ) # Check for group_vector - assert_that(!isEmpty(scmpObj@Design@groups.vector), + assertthat::assert_that(!isEmpty(scmpObj@Design@groups.vector), msg = "'scmpObj@Design@groups.vector' is empty" ) # Check for requested change if (!is.null(change)) { - assert_that( + assertthat::assert_that( all( change %in% c("increasing", "decreasing") ), @@ -50,7 +50,7 @@ queryCoeff <- function(scmpObj, } # Check query - assert_that( + assertthat::assert_that( all( query %in% c("pseudotime", "pseudotime_path", "path", "path_pseudotime") ), @@ -82,7 +82,7 @@ queryCoeff <- function(scmpObj, compare_groups_vector <- unique(scmpObj@Design@groups.vector) # Generate group name vector - avail_groups_vector <- unique(unlist(str_split(compare_groups_vector, "vs"))) + avail_groups_vector <- unique(unlist(stringr::str_split(compare_groups_vector, "vs"))) # Verbose if (verbose) { diff --git a/R/sc.cluster.trend.R b/R/sc.cluster.trend.R index 90fcb61..0571da8 100644 --- a/R/sc.cluster.trend.R +++ b/R/sc.cluster.trend.R @@ -6,7 +6,6 @@ #' #' @import ggplot2 #' @importFrom stats complete.cases cutree hclust -#' @importFrom RColorConesa getConesaColors #' @importFrom mclust Mclust #' @importFrom stats as.dist cor kmeans #' @@ -59,31 +58,31 @@ sc.cluster.trend <- function(scmpObj, scmp_clusters <- "scmp_clusters" # Check if the gene set exists - assert_that(any(geneSet %in% c(names(scmpObj@Significant@genes), "intersect", "union")), + assertthat::assert_that(any(geneSet %in% c(names(scmpObj@Significant@genes), "intersect", "union")), msg = paste( paste0("'", geneSet, "'"), "does not exist. Please use one of", paste(c(names(scmpObj@Significant@genes), "intersect", "union"), collapse = ", ") ) ) - assert_that(any(cluster_by %in% c("coeff", "counts")), + assertthat::assert_that(any(cluster_by %in% c("coeff", "counts")), msg = paste( paste0("'", cluster_by, "'"), "is not a valid option. Please use one of", paste(c("coeff", "counts"), collapse = ", ") ) ) - assert_that(any(use_dim %in% c("row", "col")), + assertthat::assert_that(any(use_dim %in% c("row", "col")), msg = paste( paste0("'", use_dim, "'"), "is not a valid option. Please use one of", paste(c("row", "col"), collapse = ", ") ) ) - assert_that(any(fill_na %in% c("mean", "median", "zero")), + assertthat::assert_that(any(fill_na %in% c("mean", "median", "zero")), msg = paste( paste0("'", fill_na, "'"), "is not a valid option. Please use one of", paste(c("mean", "median", "zero"), collapse = ", ") ) ) - assert_that(any(cluster_method %in% c("hclust", "kmeans", "Mclust")), + assertthat::assert_that(any(cluster_method %in% c("hclust", "kmeans", "Mclust")), msg = paste( paste0("'", cluster_method, "'"), "is not a valid method. 
Please use one of", paste(c("hclust", "kmeans", "Mclust"), collapse = ", ") diff --git a/R/sc.filter.R b/R/sc.filter.R index 1a160bb..b2d5276 100644 --- a/R/sc.filter.R +++ b/R/sc.filter.R @@ -56,11 +56,11 @@ sc.filter <- function(scmpObj, term_p_value = 0.05, includeInflu = TRUE) { # Check Validity of the object - assert_that(is(scmpObj, "ScMaSigPro"), + assertthat::assert_that(is(scmpObj, "ScMaSigPro"), msg = "Please provide object of class 'ScMaSigPro'" ) - assert_that( + assertthat::assert_that( all( vars %in% c("all", "each", "groups") ), diff --git a/R/sc.p.vector.R b/R/sc.p.vector.R index f86c923..34855ad 100644 --- a/R/sc.p.vector.R +++ b/R/sc.p.vector.R @@ -55,7 +55,7 @@ sc.p.vector <- function(scmpObj, p_value = 0.05, mt_correction = "BH", max_it = 100, link = "log") { # Check the type of the 'design' parameter and set the corresponding variables - assert_that(is(scmpObj, "ScMaSigPro"), + assertthat::assert_that(is(scmpObj, "ScMaSigPro"), msg = "Please provide object of class 'ScMaSigPro'" ) @@ -70,7 +70,7 @@ sc.p.vector <- function(scmpObj, p_value = 0.05, mt_correction = "BH", G <- nrow(dat) # Check for the log function - assert_that(link %in% c("log", "identity"), + assertthat::assert_that(link %in% c("log", "identity"), msg = "link function should be either 'log' or 'identity'" ) @@ -78,8 +78,8 @@ sc.p.vector <- function(scmpObj, p_value = 0.05, mt_correction = "BH", family[["link"]] <- link # Add check - # assert_that((dat@Dim[1] > 1), msg = paste(min_na, "for 'min_na' is too high. Try lowering the threshold.")) - assert_that(min_na <= ncol(dat), + # assertthat::assert_that((dat@Dim[1] > 1), msg = paste(min_na, "for 'min_na' is too high. Try lowering the threshold.")) + assertthat::assert_that(min_na <= ncol(dat), msg = paste( min_na, "for 'min_na' is too high. Try lowering the threshold." @@ -129,7 +129,7 @@ sc.p.vector <- function(scmpObj, p_value = 0.05, mt_correction = "BH", } else { n_cores <- as.integer(n_cores) # Check Required Cores - assert_that(n_cores <= availableCores(), + assertthat::assert_that(n_cores <= availableCores(), msg = paste("Number of cores requested is invalid. This session has access to", as.integer(availableCores()), "cores only.") ) numCores <- n_cores diff --git a/R/sc.restruct.R b/R/sc.restruct.R new file mode 100644 index 0000000..9200a5e --- /dev/null +++ b/R/sc.restruct.R @@ -0,0 +1,207 @@ +#' @title Restructure the binned data. +#' +#' @description +#' `sc.restruct()` Add Description +#' +#' @param scmpObj An object of class \code{\link{ScMaSigPro}}. +#' path assignment in 'Sparse' or 'Dense' data. +#' @param end_node_list A list of end nodes in of the branch. +#' @param root_node A character string specifying the root node. +#' @param link_node_list A list of links between two nodes. +#' @param assay_name Name of the Assay in sparse data from which the counts are +#' used. (Default = "counts"). +#' @param link_sep A character string to separate the link nodes. (Default = "_links_") +#' @param aggregate A character string specifying the method to aggregate counts +#' within each cluster. Available options are 'mean' or 'sum'. (Default = "sum"). +#' @param verbose Print detailed output in the console. (Default is TRUE) +#' +#' @return An object of class \code{\link{ScMaSigPro}}, with updated `Dense` +#' slot. 
+#' +#' @author Priyansh Srivastava \email{spriyansh29@@gmail.com} +#' +#' @seealso \code{\link{estBinSize}}, \code{\link{discretize}}, +#' \code{\link{create_range}} +#' +#' @export +sc.restruct <- function(scmpObj, + end_node_list, + root_node, + link_node_list, + assay_name = "counts", + aggregate = "sum", + link_sep = "_links_", + verbose = TRUE) { + # scmpObj <- multi_scmp_ob + # root_node <- "root" + # end_node_list <- list("path_Y_15", "path_Y_18", "path_Y_52") + # link_node_list <- list("path_Y_18|path_Y_15") + # assay_name <- "counts" + # aggregate <- "sum" + # verbose = TRUE + + # Check Object Validity + assertthat::assert_that(is(scmpObj, "ScMaSigPro"), + msg = "Please provide object of class 'scMaSigPro'." + ) + + # Check if values are binned + assertthat::assert_that(nrow(as.data.frame(colData(scmpObj@Dense))) >= 1, + msg = "No binning information found. Please run 'sc.squeeze()', first." + ) + + # Count slot + assertthat::assert_that( + all( + assay_name %in% names(scmpObj@Sparse@assays@data@listData) + ), + msg = paste0("'", assay_name, "' ", "doesn't exit in scmpObj.") + ) + + # Extract bin information + bin_info <- cDense(scmpObj) + + # Set list names with values + names(link_node_list) <- unlist(link_node_list) + names(end_node_list) <- unlist(end_node_list) + + # Invoke Empty dfs + bin_link_tmp <- data.frame() + bin_root_tmp <- data.frame() + + # Add Links to paths + for (i in link_node_list) { + ## remove + # i <- link_node_list[[1]] + + # Extract link group + link_value <- link_node_list[[i]] + names(link_value) <- link_value + + # Split + link_vec <- unlist(stringr::str_split(link_value, link_sep)) + + # Traverse for each of the link + for (j in link_vec) { + ## remove + # j <- link_vec[1] + + # Extract end_bin_info + link_end_bin_info <- bin_info[bin_info[[scmpObj@Parameters@path_col]] == j, ] + + # Extract link_bin_info + link_bin_info <- bin_info[bin_info[[scmpObj@Parameters@path_col]] == i, ] + + # Update group of link_bin_info + link_bin_info[[scmpObj@Parameters@path_col]] <- j + + # Update rowname and bin_col + new_label <- paste(link_bin_info[[scmpObj@Parameters@path_col]], "bin", link_bin_info[[scmpObj@Parameters@bin_ptime_col]], sep = "_") + rownames(link_bin_info) <- new_label + link_bin_info[[scmpObj@Parameters@bin_col]] <- new_label + + # Calculate offset + link_offset <- max(link_bin_info[[scmpObj@Parameters@bin_ptime_col]]) + + # Add new column for scmp_restruct + link_bin_info[["scmp_restruct"]] <- i + + # Add offset to end_bin_info + link_end_bin_info[[scmpObj@Parameters@bin_ptime_col]] <- link_end_bin_info[[scmpObj@Parameters@bin_ptime_col]] + link_offset + + # Update label and bin_col + new_label <- paste(link_end_bin_info[[scmpObj@Parameters@path_col]], "bin", link_end_bin_info[[scmpObj@Parameters@bin_ptime_col]], sep = "_") + link_end_bin_info[[scmpObj@Parameters@bin_col]] <- new_label + rownames(link_end_bin_info) <- new_label + + # Add restructure column + link_end_bin_info[["scmp_restruct"]] <- j + + # Combine + tmp <- rbind(link_bin_info, link_end_bin_info) + + # Rbind + bin_link_tmp <- rbind(bin_link_tmp, tmp) + + + if (verbose) { + message("Linking bins from '", i, "' to path '", j, "'") + } + } + } + + # Create end + names(end_node_list) <- unlist(end_node_list) + + # Extract Root bins + root_node_info <- bin_info[bin_info[[scmpObj@Parameters@path_col]] %in% root_node, ] + root_node_info[["scmp_restruct"]] <- rep("root", nrow(root_node_info)) + root_offset <- max(root_node_info[[scmpObj@Parameters@bin_ptime_col]]) + + # Get rows + binfo_tmp 
<- bin_info[!(bin_info[[scmpObj@Parameters@path_col]] %in% bin_link_tmp$group), ] + binfo_tmp[["scmp_restruct"]] <- "NA" + bin_link_tmp <- rbind(bin_link_tmp, binfo_tmp) + + # Run end-point wise operation + for (i in end_node_list) { + ## remove + # leaf <- end_node_list[[1]] + + # Extract end_bin_info for leaf + leaf <- end_node_list[[i]] + + # Extract leaf bin info + leaf_bin_info <- bin_link_tmp[bin_link_tmp[[scmpObj@Parameters@path_col]] == leaf, ] + + # Add Offset + leaf_bin_info[[scmpObj@Parameters@bin_ptime_col]] <- leaf_bin_info[[scmpObj@Parameters@bin_ptime_col]] + root_offset + + # Calculate new label + new_label <- paste(leaf_bin_info[[scmpObj@Parameters@path_col]], "bin", leaf_bin_info[[scmpObj@Parameters@bin_ptime_col]], sep = "_") + + # Update + rownames(leaf_bin_info) <- new_label + leaf_bin_info[[scmpObj@Parameters@bin_col]] <- new_label + leaf_bin_info[["scmp_restruct"]] <- leaf + + # Update roor bin info + root_node_info_tmp <- root_node_info + root_node_info_tmp[[scmpObj@Parameters@path_col]] <- leaf + new_label <- paste(root_node_info_tmp[[scmpObj@Parameters@path_col]], "bin", root_node_info_tmp[[scmpObj@Parameters@bin_ptime_col]], sep = "_") + + # Update + rownames(root_node_info_tmp) <- new_label + root_node_info_tmp[[scmpObj@Parameters@bin_col]] <- new_label + + # Add + tmp <- rbind(root_node_info_tmp, leaf_bin_info) + bin_root_tmp <- rbind(bin_root_tmp, tmp) + + if (verbose) { + message("Linking bins root bins to '", i) + } + } + + new_bin_info <- bin_root_tmp + + compressed.sparse <- SingleCellExperiment::SingleCellExperiment(assays = list( + bulk.counts = as( + matrix(NA, nrow = 0, ncol = nrow(new_bin_info)), + "dgCMatrix" + ) + )) + + compressed.sparse@colData <- S4Vectors::DataFrame(new_bin_info) + scmpObj@Dense <- compressed.sparse + + # Get Counts + scmpObj <- pb_counts( + scmpObj = scmpObj, + bin_mem_col = scmpObj@Parameters@bin_mem_col, + bin_col = scmpObj@Parameters@bin_col, + assay_name = assay_name, + cluster_count_by = aggregate + ) + return(scmpObj) +} diff --git a/R/sc.set.poly.R b/R/sc.set.poly.R index e8e23b6..e81b3c4 100644 --- a/R/sc.set.poly.R +++ b/R/sc.set.poly.R @@ -31,7 +31,7 @@ sc.set.poly <- function(scmpObj, bin_ptime_col = scmpObj@Parameters@bin_ptime_col, path_col = scmpObj@Parameters@path_col) { # Check Object Validity - assert_that(is(scmpObj, "ScMaSigPro"), + assertthat::assert_that(is(scmpObj, "ScMaSigPro"), msg = "Please provide object of class 'scMaSigPro'" ) @@ -39,11 +39,11 @@ sc.set.poly <- function(scmpObj, comp.cell.metadata <- as.data.frame(scmpObj@Dense@colData) # pseudotime_colname - assert_that((bin_ptime_col %in% colnames(comp.cell.metadata)), + assertthat::assert_that((bin_ptime_col %in% colnames(comp.cell.metadata)), msg = paste0("'", bin_ptime_col, "' ", "doesn't exit in cell.metadata.") ) # path_col - assert_that((path_col %in% colnames(comp.cell.metadata)), + assertthat::assert_that((path_col %in% colnames(comp.cell.metadata)), msg = paste0("'", path_col, "' ", "doesn't exit in cell.metadata.") ) diff --git a/R/sc.squeeze.R b/R/sc.squeeze.R index 8dbbf7d..6cca646 100644 --- a/R/sc.squeeze.R +++ b/R/sc.squeeze.R @@ -5,7 +5,6 @@ #' of equal size using entropy-based binning method. It automatically calculates #' the optimal number of bins using one of the supported methods. 
#' -#' @importFrom assertthat assert_that #' @importFrom parallel mclapply detectCores #' @importFrom entropy discretize #' @importFrom dplyr left_join join_by mutate select bind_rows group_by_at @@ -85,7 +84,7 @@ sc.squeeze <- function(scmpObj, cell <- "cell" # Check Object Validity - assert_that(is(scmpObj, "ScMaSigPro"), + assertthat::assert_that(is(scmpObj, "ScMaSigPro"), msg = "Please provide object of class 'scMaSigPro'." ) @@ -103,7 +102,7 @@ sc.squeeze <- function(scmpObj, ) %in% cols_to_drop, drop = FALSE] # Count slot - assert_that( + assertthat::assert_that( all( assay_name %in% names(scmpObj@Sparse@assays@data@listData) ), @@ -111,22 +110,22 @@ sc.squeeze <- function(scmpObj, ) # Checks - assert_that(ptime_col %in% colnames(raw_cell_metadata), + assertthat::assert_that(ptime_col %in% colnames(raw_cell_metadata), msg = paste0( "'", ptime_col, "' does not exist in cell.level.metadata Please review the 'ptime_col' parameter." ) ) - assert_that(path_col %in% colnames(raw_cell_metadata), + assertthat::assert_that(path_col %in% colnames(raw_cell_metadata), msg = paste0( "'", path_col, "' does not exist in cell.level.metadata. Please review the 'path_col' parameter." ) ) - assert_that(drop_fac >= 0.3, + assertthat::assert_that(drop_fac >= 0.3, msg = "Invalid value for 'drop_fac'. It should be between 0.3 and 1." ) - assert_that( + assertthat::assert_that( all( bin_method %in% c( "Freedman.Diaconis", "Sqrt", "Sturges", "Rice", "Doane", "Scott.Normal" @@ -135,12 +134,12 @@ sc.squeeze <- function(scmpObj, msg = "Available binning methods are 'Freedman.Diaconis', 'Sqrt', 'Sturges', 'Rice', 'Doane', and 'Scott.Normal'" ) if (!is.null(additional_params)) { - assert_that(is.list(additional_params), + assertthat::assert_that(is.list(additional_params), msg = "Please provide 'additional_params' as a named list. See details for more information" ) - assert_that(names(additional_params) %in% c("use_unique_time_points"), + assertthat::assert_that(names(additional_params) %in% c("use_unique_time_points"), msg = "Allowed additional parameters are 'use_unique_time_points'." ) } @@ -152,7 +151,7 @@ sc.squeeze <- function(scmpObj, avail.paths <- as.vector(unique(raw_cell_metadata[[path_col]])) # Check for path - assert_that(length(avail.paths) >= 2, + assertthat::assert_that(length(avail.paths) >= 2, msg = "Invalid number of paths detected. Please make sure that dataset has at least two paths" ) diff --git a/R/sc.t.fit.R b/R/sc.t.fit.R index dde8377..37b91b1 100644 --- a/R/sc.t.fit.R +++ b/R/sc.t.fit.R @@ -5,7 +5,7 @@ #' from the full polynomial model. This function is succeeded by #' \code{scMaSigPro::sc.p.vector()}. #' -#' @importFrom maSigPro position reg.coeffs +#' @importFrom maSigPro reg.coeffs #' @importFrom stats influence.measures #' #' @param scmpObj An object of class \code{\link{ScMaSigPro}}. 
@@ -53,12 +53,27 @@ sc.t.fit <- function(scmpObj, log_offset = scmpObj@Parameters@log_offset, max_it = scmpObj@Parameters@max_it, link = scmpObj@Parameters@link) { - assert_that(is(scmpObj, "ScMaSigPro"), + # Debugging + # scmpObj <- multi_scmp_ob + # selection_method <- "backward" + # p_value <- scmpObj@Parameters@p_value + # nvar_correction <- FALSE + # family <- scmpObj@Parameters@distribution + # epsilon <- scmpObj@Parameters@epsilon + # offset <- scmpObj@Parameters@offset + # verbose <- TRUE + # parallel <- FALSE + # n_cores <- availableCores() - 2 + # log_offset <- scmpObj@Parameters@log_offset + # max_it <- scmpObj@Parameters@max_it + # link <- scmpObj@Parameters@link + + assertthat::assert_that(is(scmpObj, "ScMaSigPro"), msg = "Please provide object of class 'ScMaSigPro'" ) # Check for the log function - assert_that(link %in% c("log", "identity"), + assertthat::assert_that(link %in% c("log", "identity"), msg = "link function should be either 'log' or 'identity'" ) @@ -142,7 +157,7 @@ sc.t.fit <- function(scmpObj, } else { n_cores <- as.integer(n_cores) # Check Required Cores - assert_that(n_cores <= availableCores(), + assertthat::assert_that(n_cores <= availableCores(), msg = paste("Number of cores requested is invalid. This session has access to", as.integer(availableCores()), "cores only.") ) numCores <- n_cores @@ -204,6 +219,7 @@ sc.t.fit <- function(scmpObj, weights = weights_lapply, maxit = max_it_lapply ) + if (parallel == FALSE) { if (verbose_lapply) { if (verbose_lapply) { @@ -227,6 +243,7 @@ sc.t.fit <- function(scmpObj, }, mc.cores = numCores, mc.set.seed = 2023 ) + # }) } else if (selection_method == "forward") { result_list <- parallel::mclapply(names(y_input), diff --git a/R/scmp.ob.R b/R/scmp.ob.R deleted file mode 100644 index cf662a2..0000000 --- a/R/scmp.ob.R +++ /dev/null @@ -1,13 +0,0 @@ -#' @title scMaSigPro Object with results -#' -#' @description -#' A small scMaSigPro object created from simulated data. Please follow Vignette -#' `scMaSigPro: Quick Start Guide` for more details. 
-#' -#' @usage -#' # Loading -#' data("scmp.ob", package = "scMaSigPro") -#' -#' @author Priyansh Srivastava \email{spriyansh29@@gmail.com} -#' -"scmp.ob" diff --git a/R/show_functions.R b/R/show_functions.R index 969dd98..f5f0e85 100644 --- a/R/show_functions.R +++ b/R/show_functions.R @@ -31,12 +31,12 @@ #' @export showCoeff <- function(scmpObj, view = FALSE, return = TRUE, includeInflu = TRUE) { # Check Object Validity - assert_that(is(scmpObj, "ScMaSigPro"), + assertthat::assert_that(is(scmpObj, "ScMaSigPro"), msg = "Please provide object of class 'scMaSigPro'" ) # Check if the sol exist - assert_that(!all(dim(scmpObj@Estimate@coefficient_matrix) == c(0, 0)), + assertthat::assert_that(!all(dim(scmpObj@Estimate@coefficient_matrix) == c(0, 0)), msg = "'coefficient_matrix' is not computed yet" ) @@ -77,12 +77,12 @@ showCoeff <- function(scmpObj, view = FALSE, return = TRUE, includeInflu = TRUE) #' @export showInflu <- function(scmpObj, view = FALSE, return = TRUE) { # Check Object Validity - assert_that(is(scmpObj, "ScMaSigPro"), + assertthat::assert_that(is(scmpObj, "ScMaSigPro"), msg = "Please provide object of class 'scMaSigPro'" ) # Check if the sol exist - assert_that(!all(dim(scmpObj@Estimate@influential) == c(0, 0)), + assertthat::assert_that(!all(dim(scmpObj@Estimate@influential) == c(0, 0)), msg = "tscore is not computed yet" ) @@ -119,12 +119,12 @@ showInflu <- function(scmpObj, view = FALSE, return = TRUE) { #' @export showTS <- function(scmpObj, view = FALSE, return = TRUE, includeInflu = TRUE) { # Check Object Validity - assert_that(is(scmpObj, "ScMaSigPro"), + assertthat::assert_that(is(scmpObj, "ScMaSigPro"), msg = "Please provide object of class 'scMaSigPro'" ) # Check if the sol exist - assert_that(!all(dim(scmpObj@Estimate@t_score_matrix) == c(0, 0)), + assertthat::assert_that(!all(dim(scmpObj@Estimate@t_score_matrix) == c(0, 0)), msg = "tscore is not computed yet" ) @@ -166,12 +166,12 @@ showTS <- function(scmpObj, view = FALSE, return = TRUE, includeInflu = TRUE) { #' @export showSol <- function(scmpObj, view = FALSE, return = TRUE, includeInflu = TRUE) { # Check Object Validity - assert_that(is(scmpObj, "ScMaSigPro"), + assertthat::assert_that(is(scmpObj, "ScMaSigPro"), msg = "Please provide object of class 'scMaSigPro'" ) # Check if the sol exist - assert_that(!all(dim(scmpObj@Estimate@significance_matrix) == c(0, 0)), + assertthat::assert_that(!all(dim(scmpObj@Estimate@significance_matrix) == c(0, 0)), msg = "'significance_matrix' is not computed yet" ) @@ -216,12 +216,12 @@ showSol <- function(scmpObj, view = FALSE, return = TRUE, includeInflu = TRUE) { showSigProf <- function(scmpObj, view = FALSE, return = TRUE, includeInflu = FALSE) { # Check Object Validity - assert_that(is(scmpObj, "ScMaSigPro"), + assertthat::assert_that(is(scmpObj, "ScMaSigPro"), msg = "Please provide object of class 'scMaSigPro'" ) # Check if the sol exist - assert_that(!all(dim(scmpObj@Estimate@significance_matrix) == c(0, 0)), + assertthat::assert_that(!all(dim(scmpObj@Estimate@significance_matrix) == c(0, 0)), msg = "Sol is not computed yet" ) @@ -265,12 +265,12 @@ showSigProf <- function(scmpObj, view = FALSE, return = TRUE, #' @export showPoly <- function(scmpObj) { # Check Object Validity - assert_that(is(scmpObj, "ScMaSigPro"), + assertthat::assert_that(is(scmpObj, "ScMaSigPro"), msg = "Please provide object of class 'scMaSigPro'" ) # Check if the sol exist - assert_that( + assertthat::assert_that( all(!is.na(colnames(scmpObj@Design@predictor_matrix)) | length( 
colnames(scmpObj@Design@predictor_matrix) > 1 )), @@ -318,7 +318,7 @@ showPoly <- function(scmpObj) { #' @export showParams <- function(scmpObj, view = FALSE, return = TRUE) { # Check Object Validity - assert_that(is(scmpObj, "ScMaSigPro"), + assertthat::assert_that(is(scmpObj, "ScMaSigPro"), msg = "Please provide object of class 'scMaSigPro'" ) @@ -383,12 +383,12 @@ showParams <- function(scmpObj, view = FALSE, return = TRUE) { #' @export showGroupCoeff <- function(scmpObj, view = FALSE, return = TRUE, includeInflu = TRUE) { # Check Object Validity - assert_that(is(scmpObj, "ScMaSigPro"), + assertthat::assert_that(is(scmpObj, "ScMaSigPro"), msg = "Please provide object of class 'scMaSigPro'" ) # Check if the sol exist - assert_that(!all(dim(scmpObj@Estimate@path_coefficient_matrix) == c(0, 0)), + assertthat::assert_that(!all(dim(scmpObj@Estimate@path_coefficient_matrix) == c(0, 0)), msg = "path_coefficient_matrix is not computed yet" ) @@ -431,10 +431,10 @@ showGroupCoeff <- function(scmpObj, view = FALSE, return = TRUE, includeInflu = cat(paste0("nCells: ", ncol(object@Sparse), "\n")) cat(paste0("nFeatures: ", nrow(object@Sparse), "\n")) cat("Pseudotime Range:", paste(round( - range(colData(object@Sparse)[[object@Parameters@ptime_col]]), 3 + range(SingleCellExperiment::colData(object@Sparse)[[object@Parameters@ptime_col]]), 3 ))) cat(paste("\nBranching Paths:", paste( - unique(colData(object@Sparse)[[object@Parameters@path_col]]), + unique(SingleCellExperiment::colData(object@Sparse)[[object@Parameters@path_col]]), collapse = ", " ))) diff --git a/R/splat.sim.R b/R/splat.sim.R deleted file mode 100644 index b01ac1a..0000000 --- a/R/splat.sim.R +++ /dev/null @@ -1,72 +0,0 @@ -#' @title Simulated SingleCellExperiment Object -#' -#' @description -#' A small simulated SingleCellExperiment Object created using Splatter. -#' This dataset contains 200 cells and 100 genes and is simulated to have -#' a bifurcating topology of the trajectory, useful for testing and development -#' in `scMaSigPro`. The dataset is stored as an `sce` object from the class -#' `SingleCellExperiment` -#' -#' @details -#' The `splat.sce` object was created using the `splatSimulatePaths` function -#' from the Splatter package. 
The following code was used for the simulation: -#' \preformatted{ -#' # Load Required Packages -#' suppressPackageStartupMessages(library(splatter)) -#' suppressPackageStartupMessages(library(scran)) -#' suppressPackageStartupMessages(library(scuttle)) -#' suppressPackageStartupMessages(library(scater)) -#' suppressPackageStartupMessages(library(SingleCellExperiment)) -#' -#' set.seed(123) -#' -#' # Simulate -#' splat.sim <- splatSimulatePaths( -#' params = newSplatParams( -#' batchCells = 200, nGenes = 100), -#' group.prob = c(0.5, 0.5), -#' path.nSteps = c(100, 100), -#' de.prob = 0.3, de.facLoc = 0.2, -#' path.from = c(0, 0), # Bifurcation -#' verbose = FALSE) -#' -#' # Normalize -#' splat.sim <- logNormCounts(splat.sim, assay.type = "counts") -#' -#' # Reduce Dimensions -#' splat.sim <- runPCA(splat.sim, exprs_values = "logcounts", ncomponents = 2) -#' -#' # Visulize Steps and Groups -#' plotPCA(splat.sim, colour_by = "Step") -#' plotPCA(splat.sim, colour_by = "Group") -#' -#' # Create sce and transfer data -#' sce <- SingleCellExperiment(list(counts = splat.sim@@assays@@data@@listData$counts)) -#' sce@@colData <- splat.sim@@colData -#' rowData(sce) <- rowData(splat.sim) -#' reducedDims(sce) <- reducedDims(splat.sim) -#' splat.sim <- sce -#' -#' # Save -#' save(splat.sim, file = "data/splat.sim.RData") -#' -#' # Compress -#' tools::resaveRdaFiles(paths = "data/") -#' } -#' -#' This simulation creates a dataset with 100 genes and 200 cells, designed to mimic -#' a bifurcating trajectory typically observed in cellular differentiation. -#' -#' @usage -#' # Loading -#' data("splat.sim", package = "scMaSigPro") -#' -#' @format -#' An object of class `SingleCellExperiment` with 100 gene and 200 cells. -#' -#' @source -#' Simulated using the `Splatter` (1.26.0) package. -#' -#' @author Priyansh Srivastava \email{spriyansh29@@gmail.com} -#' -"splat.sim" diff --git a/R/utils.R b/R/utils.R index 00f31eb..131bafb 100644 --- a/R/utils.R +++ b/R/utils.R @@ -30,3 +30,87 @@ get_os <- function() { } as.vector(tolower(os)) } + +#' Clean or Check Backticks in a String +#' +#' This function either checks for the presence of backticks in a string or removes them, based on the specified action. +#' +#' @param input_string A character string to be processed. +#' @param action A character string specifying the action to perform. Either "check" to check for backticks or "remove" to remove backticks. Defaults to "check". +#' +#' @return If action is "check", returns a logical value indicating the presence of backticks. If action is "remove", returns the input string with all backticks removed. +#' +#' @keywords internal +clean_string <- function(input_string, action = c("check", "remove")) { + action <- match.arg(action) + + if (action == "check") { + return(stringr::str_detect(input_string, "`")) + } else if (action == "remove") { + # Remove Backticks + processed_string <- stringr::str_replace_all(input_string, "`", "") + return(processed_string) + } +} + + +#' Convert List of Named Vectors to Data Frame with Row Names for UpSetR +#' +#' This function converts a list of named vectors to a data frame compatible with UpSetR, +#' retaining the row names corresponding to the unique elements (genes) present in the input list. +#' +#' @param input A list of named vectors to be converted to a data frame compatible with UpSetR. +#' +#' @return A data frame with binary values indicating the presence (1) or absence (0) of each element +#' in the sets, and row names corresponding to the unique elements. 
+#' +#' @keywords internal +fromListWithNames <- function(input) { + # Get unique elements (genes) + elements <- unique(unlist(input)) + + # Create binary matrix + data <- unlist(lapply(input, function(x) { + x <- as.vector(match(elements, x)) + })) + data[is.na(data)] <- as.integer(0) + data[data != 0] <- as.integer(1) + data <- data.frame(matrix(data, ncol = length(input), byrow = FALSE)) + + # Filter out rows with no data + data <- data[which(rowSums(data) != 0), ] + + # Set column names + names(data) <- names(input) + + # Set row names + rownames(data) <- elements[which(rowSums(data) != 0)] + + return(data) +} + +#' Get Color Palette +#' +#' This function returns a specified number of contrasting colors from a predefined palette. +#' If the requested number of colors exceeds the length of the predefined palette, +#' additional unique colors from the default R color set are included. +#' +#' @param n An integer specifying the number of colors to return. +#' +#' @return A character vector of hex color codes. +#' +#' @keywords internal +scmp_colors <- function(n) { + # Define Palette + ten_pal <- c("#d95f02", "#4daf4a", "#377eb8", "#E69F00", "#f781bf", "#56B4E9", "#a65628", "#009E73", "#7570b3") + extra_pal <- unique(grDevices::colors(distinct = TRUE)) + extra_pal <- extra_pal[grep("^[a-zA-Z]+$", extra_pal)] + extra_pal <- extra_pal[c(2:length(extra_pal))] + extra_pal <- extra_pal[!extra_pal %in% ten_pal] + + if (n <= 9) { + return(ten_pal[1:n]) + } else { + return(c(ten_pal, extra_pal[1:(n - 9)])) + } +} diff --git a/README.md b/README.md index 1fef302..d58aa8b 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,18 @@ `scMaSigPro` is an R package designed for analyzing single-cell RNA-seq data over pseudotime. Building on the [maSigPro](https://www.bioconductor.org/packages/release/bioc/html/maSigPro.html) package, it identifies genes with significant expression changes across branching paths in a pseudotime-ordered dataset. This guide provides a step-by-step workflow for ScMaSigPro, making it accessible for users. 
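+
+## Workflow at a Glance
+
+The calls below only sketch the main steps of a typical analysis; the count matrix,
+cell-level metadata, column names and gene name are placeholders, and the vignettes
+walk through complete, runnable examples:
+
+```
+library(scMaSigPro)
+
+# Build the object from a count matrix and cell-level metadata
+scmp <- create_scmp(counts,
+  cell_data = cell_metadata,
+  ptime_col = "Pseudotime",
+  path_col = "Path"
+)
+
+scmp <- sc.squeeze(scmp) # bin cells along pseudotime per branch
+scmp <- sc.set.poly(scmp, poly_degree = 2) # define the polynomial model
+scmp <- sc.p.vector(scmp) # detect non-flat profiles
+scmp <- sc.t.fit(scmp) # refine the model per branch
+scmp <- sc.filter(scmp, rsq = 0.7, vars = "groups") # keep well-modelled genes
+
+plotIntersect(scmp) # overlap of significant genes across branches
+plotTrend(scmp, "GeneX") # fitted trend of a single gene
+```
+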
## Installation
+
+### Bioconductor and Dependencies
+```
+# Install Dependencies
+if (!requireNamespace("BiocManager", quietly = TRUE)) {
+  install.packages("BiocManager")
+}
+BiocManager::install(version = "3.14")
+
+BiocManager::install(c('SingleCellExperiment', 'maSigPro', 'MatrixGenerics', 'S4Vectors'))
+```
+
+### scMaSigPro latest version
 To install `scMaSigPro` from GitHub, use the following R code:
 
 ```
@@ -114,4 +126,6 @@ This project has received funding from the European Union’s Framework Programm
 ## Citation
 If you use `scMaSigPro` in your research, please cite:
 
+Priyansh Srivastava, Marta Benegas Coll, Stefan Götz, María José Nueda, Ana Conesa, "scMaSigPro: differential expression analysis along single-cell trajectories", Bioinformatics, Volume 40, Issue 7, July 2024, btae443, [https://doi.org/10.1093/bioinformatics/btae443](https://doi.org/10.1093/bioinformatics/btae443)
+
 ---
diff --git a/_pkgdown.yml b/_pkgdown.yml
deleted file mode 100644
index 505307a..0000000
--- a/_pkgdown.yml
+++ /dev/null
@@ -1,5 +0,0 @@
-url: https://github.com/BioBam/scMaSigPro
-logo: man/figures/logo.png
-template:
-  bootstrap: 5
-
diff --git a/data/multi.lin.sce.RData b/data/multi.lin.sce.RData
new file mode 100644
index 0000000..9e99bc6
Binary files /dev/null and b/data/multi.lin.sce.RData differ
diff --git a/man/calc_bin_size.Rd b/man/calc_bin_size.Rd
index cb6c09c..1955803 100644
--- a/man/calc_bin_size.Rd
+++ b/man/calc_bin_size.Rd
@@ -8,6 +8,8 @@
calc_bin_size(x, clus_mem_col = "scmp_cluster_members")
}
\arguments{
\item{x}{A data frame containing the "cluster.members" column.}
+
+\item{clus_mem_col}{The name of the column containing the cluster members.}
}
\value{
A numeric value representing the size of the bin (number of elements
diff --git a/man/clean_string.Rd b/man/clean_string.Rd
new file mode 100644
index 0000000..d71b672
--- /dev/null
+++ b/man/clean_string.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{clean_string}
+\alias{clean_string}
+\title{Clean or Check Backticks in a String}
+\usage{
+clean_string(input_string, action = c("check", "remove"))
+}
+\arguments{
+\item{input_string}{A character string to be processed.}
+
+\item{action}{A character string specifying the action to perform. Either "check" to check for backticks or "remove" to remove backticks. Defaults to "check".}
+}
+\value{
+If action is "check", returns a logical value indicating the presence of backticks. If action is "remove", returns the input string with all backticks removed.
+}
+\description{
+This function either checks for the presence of backticks in a string or removes them, based on the specified action.
+} +\keyword{internal} diff --git a/man/convert_to_path.Rd b/man/convert_to_path.Rd index 6e26040..0abc113 100644 --- a/man/convert_to_path.Rd +++ b/man/convert_to_path.Rd @@ -11,6 +11,8 @@ convert_to_path(vec, path_prefix, root_label) might contain the value "root".} \item{path_prefix}{Prefix used to annoate the paths, default is "Path".} + +\item{root_label}{The label for the root element, default is "root".} } \value{ A character vector with the same length as the input where diff --git a/man/create_range.Rd b/man/create_range.Rd index 378c636..a166061 100644 --- a/man/create_range.Rd +++ b/man/create_range.Rd @@ -18,6 +18,12 @@ create_range( \item{bin_size}{A numeric column representing the bin size.} \item{binned_time}{A numeric column representing the binned time.} }} + +\item{bin_size_colname}{The name of the column containing the bin size.} + +\item{bin_col}{The name of the column containing the bin intervals.} + +\item{verbose}{Logical; if TRUE, prints detailed output.} } \value{ A numeric vector containing four elements: diff --git a/man/extract_interval.Rd b/man/extract_interval.Rd index ba95b7b..657a0fb 100644 --- a/man/extract_interval.Rd +++ b/man/extract_interval.Rd @@ -15,9 +15,9 @@ extract_interval(time.vector, nBins = 1, bin, bin.size, lbound, ubound) \item{bin.size}{Column name for the bin size column.} -\item{lbond}{Column name for the lower bound column.} +\item{lbound}{Column name for the lower bound column.} -\item{ubond}{Column name for the upper bound column.} +\item{ubound}{Column name for the upper bound column.} } \description{ Extract Intervals diff --git a/man/fromListWithNames.Rd b/man/fromListWithNames.Rd new file mode 100644 index 0000000..a6f17b2 --- /dev/null +++ b/man/fromListWithNames.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{fromListWithNames} +\alias{fromListWithNames} +\title{Convert List of Named Vectors to Data Frame with Row Names for UpSetR} +\usage{ +fromListWithNames(input) +} +\arguments{ +\item{input}{A list of named vectors to be converted to a data frame compatible with UpSetR.} +} +\value{ +A data frame with binary values indicating the presence (1) or absence (0) of each element +in the sets, and row names corresponding to the unique elements. +} +\description{ +This function converts a list of named vectors to a data frame compatible with UpSetR, +retaining the row names corresponding to the unique elements (genes) present in the input list. +} +\keyword{internal} diff --git a/man/multi.lin.sce.Rd b/man/multi.lin.sce.Rd new file mode 100644 index 0000000..580cb44 --- /dev/null +++ b/man/multi.lin.sce.Rd @@ -0,0 +1,193 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_doc.R +\docType{data} +\name{multi.lin.sce} +\alias{multi.lin.sce} +\title{Simulated Multifurcating Trajectory SingleCellExperiment Object} +\format{ +An object of class \code{SingleCellExperiment} with 501 rows and 1977 columns. +} +\usage{ +multi.lin.sce +} +\description{ +A simulated `CDS` object created using data from the tradeSeq article. +This dataset contains simulated single-cell RNA sequencing data with a +multifurcating trajectory, useful for testing and development in trajectory analysis + methods. The dataset is stored as a `new_cell_data_set` object from the `Monocle3` package. +} +\details{ +The `multi.lin.sce` object was created using the following steps: + +1. The multifurcating trajectory data was downloaded from the `tradeSeq` article. +2. 
The raw counts, cell metadata, and gene metadata were extracted and transformed into a `Monocle3` `cell_data_set` object. +3. The data was preprocessed using PCA for dimensionality reduction and normalized using a log transformation. +4. Both t-SNE and UMAP were used for dimensionality reduction, with t-SNE embeddings stored in the UMAP slots to enable graph learning. +5. Cells were clustered and a principal graph was learned on the data. +6. Pseudotime was inferred based on the learned graph. + +# Example code used to create `multi.lin.sce`: + +```{r} +# Load Libraries +library(monocle3) +library(magrittr) + +# Download dataset of multi furcating trajectory from tradeSeq Article +# wget "https://github.com/statOmics/tradeSeqPaper/raw/master/simulation/sim2_dyntoy_multifurcating_4/multifurcating_4.rds" + +multi_ob <- readRDS(file = "data/multifurcating_4.rds") + +# Counts +raw_counts <- as.matrix(t(multi_ob[["counts"]])) + +# Cell Metadata +cell_metadata_data <- as.data.frame(multi_ob[["cell_info"]]) +rownames(cell_metadata_data) <- cell_metadata_data$cell_id + +# Gene Metadata +gene_metadata_data <- as.data.frame(multi_ob[["feature_info"]]) +rownames(gene_metadata_data) <- gene_metadata_data$feature_id +gene_metadata_data[["gene_short_name"]] <- gene_metadata_data$feature_id + +# Convert to Monocle3 CDS +cds <- new_cell_data_set( + expression_data = raw_counts, + cell_metadata = cell_metadata_data, + gene_metadata = gene_metadata_data +) + +# Basic Steps +## Normalize +cds <- preprocess_cds(cds, norm_method = "log", + method = "PCA", + num_dim = 20, + pseudo_count = 1, + scaling = TRUE, + verbose = FALSE) + +## Reduce Dimensions +set.seed(123) +cds <- reduce_dimension(cds, reduction_method = "tSNE", verbose = FALSE, + preprocess_method = "PCA", + cores = 1) +cds <- reduce_dimension(cds, reduction_method = "UMAP", verbose = FALSE, + preprocess_method = "PCA", + cores = 1) + +# Overwrite UMAP Slots with tSNE as learn_graph only works on UMAP +reducedDims(cds)[["UMAP"]] <- reducedDims(cds)[["tSNE"]] +plot_cells(cds) + labs(title = "tSNE", x = "tSNE 1", y = "tSNE 2") + +## Compute Clusters +cds <- cluster_cells(cds, verbose = FALSE, random_seed = 123, resolution = 0.8) +plot_cells(cds, color_cells_by = "cluster", cell_size = 3) + + labs(title = "tSNE", xlab = "tSNE 1", ylab = "tSNE 2") +plot_cells(cds, color_cells_by = "partition", cell_size = 3) + + labs(title = "tSNE", x = "tSNE 1", y = "tSNE 2") + +# Learn Graph +cds <- learn_graph(cds, verbose = FALSE, + learn_graph_control = list(minimal_branch_len = 15, + prune_graph=TRUE, ncenter=200)) +plot_cells(cds, color_cells_by = "cluster", cell_size = 3, + label_principal_points = TRUE) + labs(title = "tSNE", x = "tSNE 1", y = "tSNE 2") + +# Infer Pseudotime +cds <- order_cells(cds, root_pr_nodes = "Y_51", verbose = FALSE) +p <- plot_cells(cds, color_cells_by = "pseudotime", cell_size = 2,trajectory_graph_color = "red",trajectory_graph_segment_size = 2, + label_principal_points = TRUE) + scale_color_viridis_c() + + labs(title = "Simulated Multifurcating Trajectory",subtitle = "Simulation: Dyntoy | Latent Space: t-SNE", x = "tSNE-1", y = "tSNE-2", color = "Monocle3 Pseudotime")+ + theme(legend.position = "bottom") + geom_point(inherit.aes = TRUE, alpha = 0.5, cex = 0) +save(p, file = "extdata/multifurcating_trajectory.RData") + +## Follow Steps from +## https://statomics.github.io/tradeSeq/articles/Monocle.html#extracting-the-pseudotimes-and-cell-weights-for-tradeseq-1 + +# Get the closest vertice for every cell +y_to_cells <- 
principal_graph_aux(cds)$UMAP$pr_graph_cell_proj_closest_vertex %>% + as.data.frame() +y_to_cells$cells <- rownames(y_to_cells) +y_to_cells$Y <- y_to_cells$V1 + +# Get the root vertices +# It is the same node as above +root <- cds@principal_graph_aux$UMAP$root_pr_nodes + +# Extract Mst +mst <- principal_graph(cds)$UMAP + +# Get the other endpoints +endpoints <- names(which(igraph::degree(mst) == 1)) +endpoints <- endpoints[!endpoints %in% root] + +# For each endpoint +cellWeights <- lapply(endpoints, function(endpoint) { + # We find the path between the endpoint and the root + path <- igraph::shortest_paths(mst, root, endpoint)$vpath[[1]] + path <- as.character(path) + # We find the cells that map along that path + df <- y_to_cells[y_to_cells$Y %in% path, ] + df <- data.frame(weights = as.numeric(colnames(cds) %in% df$cells)) + colnames(df) <- endpoint + return(df) +}) %>% do.call(what = 'cbind', args = .) %>% + as.matrix() +rownames(cellWeights) <- colnames(cds) +colnames(cellWeights) <- paste("path",colnames(cellWeights), sep = "_") + +# Subset for 3 paths +cellWeights <- cellWeights[, c("path_Y_18", "path_Y_52", "path_Y_15"), drop=FALSE] +cellWeights <- cellWeights[rowSums(cellWeights) != 0, ] + +# Create Cell Data +cellData <- data.frame( + cell_id = rownames(cellWeights), + row.names = rownames(cellWeights) +) + +# Create Cell Metadata +cellData[["group"]] <- apply(cellWeights, 1, FUN = function(x) { + + npath <- length(names(x[x == 1])) + + if(npath == 3){ + return("root") + }else if(npath == 2){ + return(paste(names(x[x == 1]), collapse = "|")) + }else{ + return(names(x[x == 1])) + } +}) + +# Get counts and Pseudotime +counts <- as.matrix(cds@assays@data@listData$counts) +counts <- counts[, rownames(cellData), drop=FALSE] + +# Get Pseudotime +pseudotime_vector <- pseudotime(cds) +cellData[["Monocle3_Pseudotime"]] <- pseudotime_vector[rownames(cellData)] + +# Create SingleCellExperiment Object +multi.lin.sce <- SingleCellExperiment(assays = list(counts = counts), + colData = cellData) + +# Add dimensionality reduction +redDim <- reducedDims(cds)[["tSNE"]] +redDim <- redDim[rownames(cellData), , drop=FALSE] +reducedDims(multi.lin.sce)[["TSNE"]] <- redDim + +# Save Object +save(multi.lin.sce, file = "data/multi.lin.sce.RData") +tools::resaveRdaFiles(paths = "data/") +tools::resaveRdaFiles(paths = "extdata/") +``` +This dataset includes expression data, cell metadata, and gene metadata, and it is structured to facilitate the application of various trajectory analysis methods. +} +\references{ +TradeSeq package: https://github.com/statOmics/tradeSeq +} +\author{ +Priyansh Srivastava \email{spriyansh29@gmail.com} +} +\keyword{datasets} diff --git a/man/optimize_bin_max.Rd b/man/optimize_bin_max.Rd index 6799304..6a30026 100644 --- a/man/optimize_bin_max.Rd +++ b/man/optimize_bin_max.Rd @@ -32,12 +32,12 @@ optimize_bin_max( \item{bin}{The name of the bin identifier column in `bin_table`.} +\item{bin.size}{The name of the bin size column in `bin_table`.} + \item{method}{The method for handling small bins: 'merge' to merge with previous or next bin, 'drop' to remove small bins, or 'ignore' to leave small bins as they are.} \item{drop}{The threshold below which a bin is considered too small and subject to the method.} - -\item{bin_size}{The name of the bin size column in `bin_table`.} } \value{ A dataframe with adjusted bins. 
The structure of the dataframe will be the same as `bin_table` diff --git a/man/pb_counts.Rd b/man/pb_counts.Rd index 3e414b9..02076c8 100644 --- a/man/pb_counts.Rd +++ b/man/pb_counts.Rd @@ -49,4 +49,3 @@ each row is a gene and each column is a bin. \author{ Priyansh Srivastava \email{spriyansh29@gmail.com} } -\keyword{internal} diff --git a/man/plotIntersect.Rd b/man/plotIntersect.Rd index db0e09f..4281b59 100644 --- a/man/plotIntersect.Rd +++ b/man/plotIntersect.Rd @@ -6,20 +6,16 @@ \usage{ plotIntersect( scmpObj, - package = "UpSetR", - min_intersection_size = 2, - keep_empty_groups = TRUE, + min_intersection_size = 1, + keep_empty_groups = FALSE, width_ratio = 0.1, show_sets_size = FALSE, - verbose = TRUE + return = FALSE ) } \arguments{ \item{scmpObj}{An object of class \code{\link{ScMaSigPro}}.} -\item{package}{Which package to use for the UpsetPlot. Options are 'ComplexUpset' -or 'UpSetR' (Default).} - \item{min_intersection_size}{Minimal number of observations in an intersection for it to be included.} @@ -31,14 +27,15 @@ width.} \item{show_sets_size}{The overall set sizes plot, e.g. from upset_set_size()} -\item{verbose}{Print detailed output in the console. (Default is TRUE)} +\item{return}{If set to true, it will return dataframe from the UpSetR::fromList(). +(Default is TRUE)} } \value{ -ggplot2 plot object for 'ComplexUpset' or upset object for 'UpSetR'. +upset object for 'UpSetR'. } \description{ Generate UpSet Plot on Intersection of Significant Genes from scMaSigPro -object. It is a wrapper around `ComplexUpset::upset` and `UpSetR::upset`. +object. It is a wrapper `UpSetR::upset`. } \author{ Priyansh Srivastava \email{spriyansh29@gmail.com} diff --git a/man/plotTrend.Rd b/man/plotTrend.Rd index c951618..2f08948 100644 --- a/man/plotTrend.Rd +++ b/man/plotTrend.Rd @@ -14,7 +14,10 @@ plotTrend( logType = "log", pseudoCount = 1, significant = TRUE, - summary_mode = "median" + summary_mode = "median", + curves = TRUE, + lines = FALSE, + points = TRUE ) } \arguments{ @@ -41,6 +44,12 @@ higher values will result in more linear trends. (Default is 0.01)} \item{summary_mode}{Compress the expression values per replicate (if present) per binned pseudotime point. Default is 'median'. Other option 'mean'} + +\item{curves}{Whether to plot the fitted curves. (Default is TRUE)} + +\item{lines}{Whether to plot the lines. (Default is FALSE)} + +\item{points}{Whether to plot the points. (Default is TRUE)} } \value{ ggplot2 plot object. diff --git a/man/plotTrendCluster.Rd b/man/plotTrendCluster.Rd index 8f6fa46..00e68ad 100644 --- a/man/plotTrendCluster.Rd +++ b/man/plotTrendCluster.Rd @@ -7,17 +7,21 @@ plotTrendCluster( scmpObj, xlab = "Pooled Pseudotime", - ylab = "Pseudobulk Expression", + ylab = "log(Pseudobulk Expression)", plot = "counts", summary_mode = "median", - logs = FALSE, + logs = TRUE, logType = "log", smoothness = 1, includeInflu = TRUE, verbose = TRUE, pseudoCount = 1, significant = FALSE, - parallel = FALSE + curves = TRUE, + lines = FALSE, + points = TRUE, + parallel = FALSE, + loess_span = 0.8 ) } \arguments{ @@ -50,8 +54,17 @@ higher values will result in more linear trends. (Default is 0.01)} \item{significant}{Include gene only if the models are significant based on \code{scMaSigPro::sc.filter()}. (Default is TRUE)} +\item{curves}{Whether to plot the fitted curves. (Default is TRUE)} + +\item{lines}{Whether to plot the lines. (Default is FALSE)} + +\item{points}{Whether to plot the points. (Default is TRUE)} + \item{parallel}{Use forking process to run parallelly. 
(Default is FALSE) (Currently, Windows is not supported)}
+
+\item{loess_span}{The fraction of the data used when estimating each y-value,
+when plotting curves. (Default is 0.8)}
}
\value{
ggplot2 plot object.
}
diff --git a/man/sc.restruct.Rd b/man/sc.restruct.Rd
new file mode 100644
index 0000000..e19d7b1
--- /dev/null
+++ b/man/sc.restruct.Rd
@@ -0,0 +1,51 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/sc.restruct.R
+\name{sc.restruct}
+\alias{sc.restruct}
+\title{Restructure the binned data.}
+\usage{
+sc.restruct(
+  scmpObj,
+  end_node_list,
+  root_node,
+  link_node_list,
+  assay_name = "counts",
+  aggregate = "sum",
+  link_sep = "_links_",
+  verbose = TRUE
+)
+}
+\arguments{
+\item{scmpObj}{An object of class \code{\link{ScMaSigPro}}.}
+
+\item{end_node_list}{A list of end nodes of the branches.}
+
+\item{root_node}{A character string specifying the root node.}
+
+\item{link_node_list}{A list of links between two nodes.}
+
+\item{assay_name}{Name of the Assay in sparse data from which the counts are
+used. (Default = "counts").}
+
+\item{aggregate}{A character string specifying the method to aggregate counts
+within each cluster. Available options are 'mean' or 'sum'. (Default = "sum").}
+
+\item{link_sep}{A character string to separate the link nodes. (Default = "_links_")}
+
+\item{verbose}{Print detailed output in the console. (Default is TRUE)}
+}
+\value{
+An object of class \code{\link{ScMaSigPro}}, with updated `Dense`
+slot.
+}
+\description{
+`sc.restruct()` restructures the binned ('Dense') data of trajectories in which
+branches share common root (and linking) cells: the shared bins are assigned to
+each downstream branch, the binned pseudotime is offset accordingly, and the
+pseudobulk counts are recomputed.
+}
+\seealso{
+\code{\link{estBinSize}}, \code{\link{discretize}},
+\code{\link{create_range}}
+}
+\author{
+Priyansh Srivastava \email{spriyansh29@gmail.com}
+}
diff --git a/man/scmp.ob.Rd b/man/scmp.ob.Rd
index 49adca0..9953326 100644
--- a/man/scmp.ob.Rd
+++ b/man/scmp.ob.Rd
@@ -1,5 +1,5 @@
% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/scmp.ob.R
+% Please edit documentation in R/data_doc.R
\docType{data}
\name{scmp.ob}
\alias{scmp.ob}
diff --git a/man/scmp_colors.Rd b/man/scmp_colors.Rd
new file mode 100644
index 0000000..3b0c926
--- /dev/null
+++ b/man/scmp_colors.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{scmp_colors}
+\alias{scmp_colors}
+\title{Get Color Palette}
+\usage{
+scmp_colors(n)
+}
+\arguments{
+\item{n}{An integer specifying the number of colors to return.}
+}
+\value{
+A character vector of hex color codes.
+}
+\description{
+This function returns a specified number of contrasting colors from a predefined palette.
+If the requested number of colors exceeds the length of the predefined palette,
+additional unique colors from the default R color set are included.
+} +\keyword{internal} diff --git a/man/splat.sim.Rd b/man/splat.sim.Rd index e84d76f..c856520 100644 --- a/man/splat.sim.Rd +++ b/man/splat.sim.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/splat.sim.R +% Please edit documentation in R/data_doc.R \docType{data} \name{splat.sim} \alias{splat.sim} diff --git a/scMaSigPro.Rproj b/scMaSigPro.Rproj index 16b0744..0921585 100644 --- a/scMaSigPro.Rproj +++ b/scMaSigPro.Rproj @@ -16,3 +16,5 @@ BuildType: Package PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source PackageRoxygenize: rd,collate,namespace,vignette + +SpellingDictionary: en_GB diff --git a/vignettes/Basic-Workflow.Rmd b/vignettes/Basic-Workflow.Rmd index a145929..20499f3 100644 --- a/vignettes/Basic-Workflow.Rmd +++ b/vignettes/Basic-Workflow.Rmd @@ -1,5 +1,5 @@ --- -title: "scMaSigPro: Quick Start Guide" +title: "Quick Start Guide" subtitle: "Basic Steps" author: "Priyansh Srivastava" package: "scMaSigPro" @@ -12,7 +12,7 @@ abstract: | small simulated dataset. output: BiocStyle::html_document vignette: > - %\VignetteIndexEntry{scMaSigPro Basic Workflow} + %\VignetteIndexEntry{Quick Start Guide} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -32,9 +32,23 @@ to pinpoint genes exhibiting significant expression profile differences among branching paths and pseudotime. ## Installation + Currently, `scMaSigPro` is available on GitHub and can be installed as follows: -```{r, echo=TRUE, eval=FALSE} +### Bioconductor and Dependencies +```{r, eval=FALSE, echo=TRUE} +# Install Dependencies +if (!requireNamespace("BiocManager", quietly = TRUE)) { + install.packages("BiocManager") +} +BiocManager::install(version = "3.14") + +BiocManager::install(c("SingleCellExperiment", "maSigPro", "MatrixGenerics", "S4Vectors")) +``` + +### scMaSigPro latest version +To install `scMaSigPro` from GitHub, use the following R code: +```{r, eval=FALSE, echo=TRUE} # Install devtools if not already installed if (!requireNamespace("devtools", quietly = TRUE)) { install.packages("devtools") @@ -47,7 +61,7 @@ devtools::install_github("BioBam/scMaSigPro", build_manual = TRUE, upgrade = "never", force = TRUE, - quiet = TRUE, + quiet = TRUE ) ``` @@ -319,7 +333,7 @@ By setting the vars parameter to "groups", the function will add genes with $R^2$ >= 0.7 to the object. To explore the number of genes per group, we will make an upset plot: ```{r, "uspet",eval=TRUE, echo=TRUE, fig.width=8, fig.height=6} -plotIntersect(scmp_ob, package = "UpSetR") +plotIntersect(scmp_ob) ``` Here, we observe that 23 genes belong to both Path2vsPath1 and Path1, indicating @@ -329,10 +343,10 @@ uniquely associated with Path2vsPath1. This implies that Path2 has 10 genes that are significantly differentially expressed over time, using Path1 genes as a reference. 
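+
+The membership table behind this plot can also be retrieved directly, for example
+to query intersections programmatically; the sketch below uses the `return = TRUE`
+option available from `scMaSigPro` version `0.0.4`:
+
+```{r, "intersect-table", eval=FALSE, echo=TRUE}
+# Binary gene-by-group membership table of the significant genes
+shared_genes <- plotIntersect(scmp_ob, return = TRUE)
+head(shared_genes)
+```
+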
Let's explore a few of these genes: ```{r, "trend",eval=TRUE, echo=TRUE, fig.width=7, fig.height=7} -FigureA <- plotTrend(scmp_ob, "Gene9", logs = TRUE, logType = "log") -FigureB <- plotTrend(scmp_ob, "Gene95", logs = TRUE, logType = "log") -FigureC <- plotTrend(scmp_ob, "Gene10", logs = TRUE, logType = "log") -FigureD <- plotTrend(scmp_ob, "Gene92", logs = TRUE, logType = "log") +FigureA <- plotTrend(scmp_ob, "Gene9", logs = TRUE, logType = "log", lines = TRUE) +FigureB <- plotTrend(scmp_ob, "Gene95", logs = TRUE, logType = "log", lines = TRUE) +FigureC <- plotTrend(scmp_ob, "Gene10", logs = TRUE, logType = "log", lines = TRUE) +FigureD <- plotTrend(scmp_ob, "Gene92", logs = TRUE, logType = "log", lines = TRUE) ggpubr::ggarrange(FigureA, FigureB, FigureC, FigureD, ncol = 2, nrow = 2, labels = c("A", "B", "C", "D") @@ -375,7 +389,8 @@ plotTrendCluster( scmpObj = scmp_ob, plot = "coeff", logs = TRUE, - verbose = FALSE + verbose = FALSE, + lines = TRUE ) ``` diff --git a/vignettes/Common-Root-Cells.Rmd b/vignettes/Common-Root-Cells.Rmd new file mode 100644 index 0000000..ff800f1 --- /dev/null +++ b/vignettes/Common-Root-Cells.Rmd @@ -0,0 +1,504 @@ +--- +title: "Common Root Cells" +subtitle: "Multi-branch dataset with common root cells" +author: "Priyansh Srivastava" +package: "scMaSigPro" +output: BiocStyle::html_document +vignette: > + %\VignetteIndexEntry{Common Root Cells} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, "setup", include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +knitr::opts_chunk$set(crop = NULL) +``` + +# Introduction +`scMaSigPro` is designed to handle datasets with at least two branches and +requires that cells be assigned exclusively to these branches. This vignette +demonstrates how to evaluate datasets with common root cells using the `scMaSigPro` +approach. We will utilize an object from Monocle3's +[`newCellDataSet`](https://rdrr.io/bioc/monocle/man/newCellDataSet.html) class +in this analysis. + +## Installation + +Currently, `scMaSigPro` is available on GitHub and can be installed as follows: + +### Bioconductor and Dependencies +```{r, eval=FALSE, echo=TRUE} +# Install Dependencies +if (!requireNamespace("BiocManager", quietly = TRUE)) { + install.packages("BiocManager") +} +BiocManager::install(version = "3.14") + +BiocManager::install(c("SingleCellExperiment", "maSigPro", "MatrixGenerics", "S4Vectors")) +``` + +### scMaSigPro latest version +To install `scMaSigPro` from GitHub, use the following R code: +```{r, eval=FALSE, echo=TRUE} +# Install devtools if not already installed +if (!requireNamespace("devtools", quietly = TRUE)) { + install.packages("devtools") +} + +# Install scMaSigPro +devtools::install_github("BioBam/scMaSigPro", + ref = "main", + build_vignettes = FALSE, + build_manual = TRUE, + upgrade = "never", + force = TRUE, + quiet = TRUE +) +``` + +## Setup +We will start by loading the necessary libraries. +```{r, echo=TRUE, eval=FALSE, "libs-show"} +## For plotting +library(scMaSigPro) +library(ggplot2) +## Install and load 'monocle3-1.3.4' +library(monocle3) +``` + +```{r, echo=FALSE, eval=TRUE, "libs-show-2"} +library(scMaSigPro) +library(ggplot2) +``` + +## Simulated Data +For this vignette, we will use a simulated dataset containing 1,977 cells and 501 genes. The dataset was generated using the Dyntoy package and is sourced from the [`tradeSeq` repository](https://github.com/statOmics/tradeSeqPaper/raw/master/simulation/sim2_dyntoy_multifurcating_4/multifurcating_4.rds). 
This dataset has been simulated to have three branches and is analyzed with Monocle3 (version 1.3.4).
+
+```{r, echo=FALSE, "load png of CDS", fig.align='center', fig.width=10, fig.height=8, fig.alt='Monocle3_Multi_Branch_tSNE'}
+knitr::include_graphics("https://www.metapriyansh.com/scMaSigPro/imgs/Monocle3_Multi_Branch_tSNE.png")
+```
+
+In Figure A, we can see that the cells share a common root and then diverge
+into multiple branches: they first split into two branches and then into three.
+Figure B illustrates the elements of the trajectory used as input to
+`scMaSigPro` in the common-root-cells setup.
+
+## Hard Assignment of Cells
+An important prerequisite for using `scMaSigPro` is the hard assignment of cells
+to branches. This means that each cell must be exclusively assigned to one branch.
+There are broadly two ways to assign cells:
+
+1. Hard assignment (0 and 1), where each cell belongs to only one branch.
+2. Soft assignment (0 to 1), where a cell can belong to multiple branches simultaneously.
+
+In Monocle3, each cell is associated with only one branch, implying a hard assignment,
+which is suitable for the current version of `scMaSigPro`. On the other hand, `tradeSeq` can
+handle both hard and soft assignments. In Slingshot, for instance, cells can be
+part of multiple branches, which is a form of soft assignment. One key difference
+is that Slingshot assigns different pseudotimes to each branch, whereas Monocle3
+uses a universal pseudotime for all branches.
+
+## Extracting Assignments
+
+### Load Object
+```{r, echo=TRUE, eval=TRUE, "loading m3 cds temp"}
+## Read CDS from the server
+cds <- readRDS(url("https://www.metapriyansh.com/scMaSigPro/rds_objects/multifurcating_4_cds.RDS"))
+```
+
+```{r, echo=FALSE, eval=FALSE, "loading m3 cds"}
+## Read CDS from the server
+cds <- readRDS("multifurcating_4_cds.RDS")
+```
+
+### Extracting Branch Assignments
+We will follow the steps from the [`tradeSeq` Vignette](https://statomics.github.io/tradeSeq/articles/Monocle.html#extracting-the-pseudotimes-and-cell-weights-for-tradeseq-1)
+to extract the assignment of cells to branches. However, for the sake of simplicity,
+we will tweak these steps to avoid using the magrittr and monocle3 packages.
+Instead, we will use base R functions and directly access the S4 slots.
+^[We extract the vertices from the UMAP slot even though the trajectory was learned on tSNE: our CDS object stores the tSNE coordinates in the UMAP slot because `monocle3::learn_graph()` does not work with tSNE coordinates.
You can read more about this issue on [monocle3-issues/242](https://github.com/cole-trapnell-lab/monocle3/issues/242)] + +```{r, echo=TRUE, eval=TRUE, "extracting assignment Root Nodes", message=FALSE} +# Get the closest vertices for every cell +y_to_cells <- as.data.frame(cds@principal_graph_aux$UMAP$pr_graph_cell_proj_closest_vertex) +y_to_cells$cells <- rownames(y_to_cells) +y_to_cells$Y <- y_to_cells$V1 + +## Root Nodes +root <- cds@principal_graph_aux$UMAP$root_pr_nodes + +## Extract MST (PQ Graph) +mst <- cds@principal_graph$UMAP + +## All end-points +endpoints <- names(which(igraph::degree(mst) == 1)) + +## Root is also an endpoint so we remove it +endpoints <- endpoints[!endpoints %in% root] + +## Extract +cellAssignments_list <- lapply(endpoints, function(endpoint) { + # We find the path between the endpoint and the root + path <- igraph::shortest_paths(mst, root, endpoint)$vpath[[1]] + path <- as.character(path) + # We find the cells that map along that path + df <- y_to_cells[y_to_cells$Y %in% path, ] + df <- data.frame(weights = as.numeric(colnames(cds@assays@data@listData$counts) %in% df$cells)) + colnames(df) <- endpoint + return(df) +}) + +## Format +cellAssignments <- do.call(what = "cbind", cellAssignments_list) +cellAssignments <- as.matrix(cellAssignments) + +# Update columns +rownames(cellAssignments) <- colnames(cds@assays@data@listData$counts) +head(cellAssignments[c(20:30), ]) +``` + +## `scMaSigPro` object +For this vignette, we will only consider three paths, Y_18, Y_52, and Y_15, together +with the root (refer to Figure A). We will remove the other paths and any cells +that are not assigned to any of these three paths. +```{r, "Subsetting", eval=TRUE} +# Subset for 3 paths +cellAssignments <- cellAssignments[, c("Y_18", "Y_52", "Y_15"), drop = FALSE] + +# Remove any cells that are not assigned to any path +cellAssignments <- cellAssignments[rowSums(cellAssignments) != 0, ] + +# Create Cell Data +cellData <- data.frame( + cell_id = rownames(cellAssignments), + row.names = rownames(cellAssignments) +) + +# Extract counts +counts <- cds@assays@data@listData$counts + +# Subset counts +counts <- counts[, rownames(cellAssignments), drop = FALSE] +``` + +### Creating Cell Metadata +Another important step is to label the elements (refer to Figure B). If a cell +belongs to the root, then it is part of all the paths. If a cell is part of the +link branch, then it is part of both the "Y_15" and "Y_18" branches. +```{r, "Creating Cell Metadata", eval=TRUE} +# Create Cell Metadata +cellData[["group"]] <- apply(cellAssignments, 1, FUN = function(x) { + npath <- length(names(x[x == 1])) + if (npath == 3) { + return("root") + } else if (npath == 2) { + return(paste(names(x[x == 1]), collapse = "_links_")) + } else { + return(names(x[x == 1])) + } +}) +table(cellData[["group"]]) + +# Extract from CDS +ptimes <- cds@principal_graph_aux$UMAP$pseudotime + +# Remove cells which are not assigned to any path +ptimes <- ptimes[rownames(cellAssignments)] + +# Assign to cellData +cellData[["m3_pseudotime"]] <- ptimes +``` + +### Creating Object +```{r, "Creating Object"} +# Create Object +multi_scmp_ob <- create_scmp( + counts = counts, + cell_data = cellData, + ptime_col = "m3_pseudotime", + path_col = "group", + use_as_bin = FALSE +) +multi_scmp_ob +``` + +# Each Branch as a Group (Approach-1) + +The first approach to using `scMaSigPro` to analyze branches with common cells +is to consider the common cells as a separate group.
For interpretation, we +will evaluate whether a particular gene's expression in the common root cells is +similar to its expression in the downstream branches. + +## Perform Binning +```{r, eval=T, "Each Branch as a group", fig.align='center', fig.width=8, fig.height=5, fig.alt='Each Branch as a group'} +## Pseudotime based binning +multi_scmp_ob_A <- sc.squeeze(multi_scmp_ob) + +## Plot bin information +plotBinTile(multi_scmp_ob_A) +``` + +In the above figure, we can see that `scMaSigPro` considers each group as a +separate branch. Running `scMaSigPro` with this configuration will lead to a +different interpretation of the data. This will help us evaluate whether the +expression changes across branches compared to the root. If you want to consider +the common root cells and branches together, you can follow the next section. + +Additionally, we can see that the common root cells and the resulting branches +have the same binned pseudotime values. This is because the binning +is performed independently over each branch. + +## Running Workflow +```{r, eval=T, "Each branch as a group workflow", fig.align='center', fig.width=8, fig.height=5, fig.alt='Upset Plot'} +# Polynomial Degree 3 +multi_scmp_ob_A <- sc.set.poly(multi_scmp_ob_A, poly_degree = 3) + +# Detect non-flat profiles +multi_scmp_ob_A <- sc.p.vector( + multi_scmp_ob_A, + verbose = FALSE +) + +# Model refinement +multi_scmp_ob_A <- sc.t.fit( + multi_scmp_ob_A, + verbose = FALSE +) + +# Apply filter +multi_scmp_ob_A <- sc.filter( + scmpObj = multi_scmp_ob_A, + rsq = 0.55, + vars = "groups", + intercept = "dummy", + includeInflu = TRUE +) + +# Plot upset +plotIntersect(multi_scmp_ob_A) +``` + +In the upset plot above, we can see that there are 294 genes which change among +branches and also change with pseudotime. We will explore them later. First, let's +look at the genes that have similar expression in the "root", "Y_18_links_Y_15vsroot" (link) and "Y_15" (branch). + +`scMaSigPro` treats the "root" as the reference for all branches. Let's extract +the intersection information from the `scMaSigPro` object. Starting from `scMaSigPro` +version `0.0.4`, the `plotIntersect()` function with `return = TRUE` can be used to +extract the intersection information from the object as a dataframe. + +## Significant Genes +```{r, "Extract Intersection", eval=T} +shared_genes <- plotIntersect(multi_scmp_ob_A, return = TRUE) +head(shared_genes) +``` + +```{r, "Sample Gene", fig.align='center', fig.width=8, fig.height=5, fig.alt='G42'} +## Similar expression in root, "Y_18_links_Y_15vsroot" and "Y_15" +gene_br_Y_15 <- rownames(shared_genes[shared_genes$Y_52vsroot == 1 & + shared_genes$Y_18vsroot == 1 & + shared_genes$Y_15vsroot == 0 & + shared_genes$root == 1 & + shared_genes$Y_18_links_Y_15vsroot == 0, ]) # "G42" +plotTrend(multi_scmp_ob_A, feature_id = "G42", points = FALSE, lines = TRUE) +``` + +In the above plot, we can see that the gene G42 has a similar expression in +"root", "Y_18_links_Y_15vsroot", and "Y_15". This is a good example of how +`scMaSigPro` can be used to identify genes that show similar patterns in the common root +cells and a given branch, and how different their expression is in the other branches. +The main idea is to set the root as a reference and then compare the branches to the root. + +Next, we can explore the 294 genes that we have in the intersection. As it is hard +to see them all at once, we will first perform clustering and then look at them +in groups.
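+Before clustering, the genes that belong to this full intersection can also be pulled
+out directly from the membership table returned above. The snippet below is a minimal
+sketch; it assumes, as in the gene selection above, that the columns of `shared_genes`
+are 0/1 membership indicators and that gene names are stored as row names.
+```{r, "Full intersection sketch", eval=FALSE}
+## Genes present in every set of the upset plot (the full intersection)
+full_intersection <- rownames(shared_genes)[rowSums(shared_genes) == ncol(shared_genes)]
+length(full_intersection)
+```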
+ +```{r, "root, Y_18_links_Y_15vsroot and Y_18", eval=T, fig.align='center', fig.width=8, fig.height=5, fig.alt='Clustering'} +## Perform Clustering +multi_scmp_ob_A <- scMaSigPro::sc.cluster.trend(multi_scmp_ob_A) + +# Plot Clusters +plotTrendCluster(multi_scmp_ob_A, verbose = FALSE, loess_span = 0.8) +``` + +# Ordering Common Cells (Approach-2) + +Another approach is to reorder the pseudotime bins, meaning the bins representing +the common root cells are ordered first, followed by the bins of the branches. +This helps compare the gene expression of the common root cells together with the +downstream branches. + +## Reorder Bins Manually + +### Extract Binned Data +```{r} +binned_data <- cDense(multi_scmp_ob_A) +head(binned_data[, -2]) +``` + +### Create Bin Data for Root + Y_52 +The main idea is to update the values in the binned pseudotime column using an +offset. Based on our dataset, we observe that Y_52 is a branch that directly +originates from the root. Therefore, the bins of the Y_52 branch should be placed +right after the root bins. This means that the offset for the Y_52 branch will be +the maximum value of the binned pseudotime of the root branch. + +```{r} +## Create a new df +Y_52_binned_data <- binned_data[binned_data$group %in% c("root", "Y_52"), , drop = FALSE] + +## Calculate Offset (Max Value of the root) +Y_52_offset <- max(Y_52_binned_data[Y_52_binned_data$group == "root", "scmp_binned_pseudotime"]) + +## Add offset +Y_52_binned_data[Y_52_binned_data$group == "Y_52", "scmp_binned_pseudotime"] <- Y_52_binned_data[Y_52_binned_data$group == "Y_52", "scmp_binned_pseudotime"] + Y_52_offset + +## Update bins and rownames +Y_52_binned_data$group2 <- "Y_52" +rownames(Y_52_binned_data) <- paste(Y_52_binned_data$group2, Y_52_binned_data$scmp_binned_pseudotime, sep = "_bin_") + +head(Y_52_binned_data[, -2]) +``` + +### Create Bin Data for Root + Link + Y_15|Y_18 +For the other branches, we apply two offsets: the link bins are shifted by the root +offset, and the Y_15/Y_18 bins are shifted by both the root and the link offsets.
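+
+As a minimal numeric sketch of these two offsets (the values below are purely
+illustrative and are not computed from the object):
+```{r, "Offset sketch", eval=FALSE}
+## Illustrative values only (not taken from this dataset)
+root_bins <- 1:5 # root bins stay as 1..5
+link_bins <- 1:3 + max(root_bins) # link bins become 6..8
+branch_bins <- 1:4 + max(root_bins) + max(1:3) # branch bins become 9..12
+```
+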
+```{r, fig.align='center', fig.width=8, fig.height=5, fig.alt='Manual Bin'} +## Create a new df +Y_18_binned_data <- binned_data[binned_data$group %in% c("root", "Y_18_links_Y_15", "Y_18"), , drop = FALSE] +Y_15_binned_data <- binned_data[binned_data$group %in% c("root", "Y_18_links_Y_15", "Y_15"), , drop = FALSE] + +## Calculate Offset (Max Value of the root and Link) +Y_18_root_offset <- max(Y_18_binned_data[Y_18_binned_data$group == "root", "scmp_binned_pseudotime"]) +Y_18_link_offset <- max(Y_18_binned_data[Y_18_binned_data$group == "Y_18_links_Y_15", "scmp_binned_pseudotime"]) + +## Add link offset to branch Y_18 +Y_18_binned_data[Y_18_binned_data$group == "Y_18", "scmp_binned_pseudotime"] <- Y_18_binned_data[Y_18_binned_data$group == "Y_18", "scmp_binned_pseudotime"] + Y_18_link_offset +Y_15_binned_data[Y_15_binned_data$group == "Y_15", "scmp_binned_pseudotime"] <- Y_15_binned_data[Y_15_binned_data$group == "Y_15", "scmp_binned_pseudotime"] + Y_18_link_offset + +## Add root offset to (link + branch) +Y_18_binned_data[Y_18_binned_data$group != "root", "scmp_binned_pseudotime"] <- Y_18_binned_data[Y_18_binned_data$group != "root", "scmp_binned_pseudotime"] + Y_18_root_offset +Y_15_binned_data[Y_15_binned_data$group != "root", "scmp_binned_pseudotime"] <- Y_15_binned_data[Y_15_binned_data$group != "root", "scmp_binned_pseudotime"] + Y_18_root_offset + +## Update bins and rownames +Y_18_binned_data$group2 <- "Y_18" +rownames(Y_18_binned_data) <- paste(Y_18_binned_data$group2, Y_18_binned_data$scmp_binned_pseudotime, sep = "_bin_") +Y_15_binned_data$group2 <- "Y_15" +rownames(Y_15_binned_data) <- paste(Y_15_binned_data$group2, Y_15_binned_data$scmp_binned_pseudotime, sep = "_bin_") + +# Create plot +ggplot() + + geom_tile( + data = Y_18_binned_data, + aes(x = scmp_binned_pseudotime, y = group), fill = "blue", alpha = 0.2 + ) + + geom_tile( + data = Y_18_binned_data, + aes(x = scmp_binned_pseudotime, y = group2), fill = "red", alpha = 0.2 + ) + + theme_minimal() +``` + +As we can see above, the blue tiles represent the bins split by their original +groups, while the red tiles represent the new combined "Y_18" path. The overlap +at the end of the "Y_18" row shows that the original bins of branch Y_18 have +been shifted to the end of the new path. + +## Pseudobulk +```{r, fig.align='center', fig.width=8, fig.height=5, fig.alt='Manual Bins'} +## Combine all metadata +new_binned_data <- rbind(Y_52_binned_data, Y_18_binned_data, Y_15_binned_data) + +## Create a new object +multi_scmp_ob_A_manual <- multi_scmp_ob_A + +## Update dense Metadata slot +cDense(multi_scmp_ob_A_manual) <- new_binned_data + +## Set the group used for binning +multi_scmp_ob_A_manual@Parameters@path_col <- "group2" + +## Perform Aggregation +multi_scmp_ob_A_manual <- pb_counts( + scmpObj = multi_scmp_ob_A_manual, + assay_name = "counts" +) + +## Plot +plotBinTile(multi_scmp_ob_A_manual) +``` + +From `scMaSigPro` version `0.0.4` onwards, the wrapper function `sc.restruct()` is +available to assist in ordering the bins. Its usage is shown below; it works on +fairly simple datasets like the one demonstrated in this vignette.
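+
+Before calling the wrapper, it can help to check which path labels are present in the
+binned metadata of the original object, since these are the values passed to the
+`root_node`, `link_node_list`, and `end_node_list` arguments in the next section.
+This quick check simply reuses the `cDense()` accessor shown earlier:
+```{r, "Check path labels", eval=FALSE}
+## Path labels available in the binned metadata of the original object
+table(cDense(multi_scmp_ob_A)$group)
+```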
+ +## Reorder Bins with `sc.restruct()` +```{r, "Reconstruct", fig.align='center', fig.width=8, fig.height=5, fig.alt='Reconstructed Path Bins'} +## Showcase sc.restruct wrapper +multi_scmp_ob_B <- sc.restruct(multi_scmp_ob_A, + end_node_list = list("Y_15", "Y_18", "Y_52"), + root_node = "root", link_node_list = list("Y_18_links_Y_15"), + verbose = FALSE, link_sep = "_links_", assay_name = "counts", aggregate = "sum" +) +# Show the new path +plotBinTile(multi_scmp_ob_B) +``` + +As we see in the plot above, the `sc.restruct()` wrapper function performs exactly +the same transformations internally. For future updates (CRAN version), we will offer +a Shiny-based solution that can effectively handle more complicated datasets. +However, as of `scMaSigPro` version `0.0.4`, users are required to either use +`sc.restruct()` or order the bins manually. + +## Running Workflow +```{r, eval=T, fig.align='center', fig.width=8, fig.height=5} +# Polynomial Degree 3 +multi_scmp_ob_B <- sc.set.poly(multi_scmp_ob_B, poly_degree = 3) + +# Detect non-flat profiles +multi_scmp_ob_B <- sc.p.vector( + multi_scmp_ob_B, + verbose = FALSE +) + +# Model refinement +multi_scmp_ob_B <- sc.t.fit( + multi_scmp_ob_B, + verbose = FALSE +) + +# Apply filter +multi_scmp_ob_B <- sc.filter( + scmpObj = multi_scmp_ob_B, + rsq = 0.5, + vars = "groups", + intercept = "dummy", + includeInflu = TRUE +) + +# Plot upset +plotIntersect(multi_scmp_ob_B) +``` + +Now, in the above scenario, Y_15 is treated as the reference, and we will look +at the genes that are expressed differently compared to this branch. This approach +helps us understand how genes change across the branches while including the +common root cells in each of the downstream branches. + +## Significant Genes +```{r, fig.align='center', fig.width=8, fig.height=5, fig.alt='Clusters of ordered data'} +## Perform Clustering +multi_scmp_ob_B <- sc.cluster.trend(multi_scmp_ob_B, + k = 4, cluster_method = "kmeans" +) + +## Plot Clusters +plotTrendCluster(multi_scmp_ob_B, verbose = FALSE) +``` + +--- + +### Session Info +```{r, "Session Info"} +sessionInfo(package = "scMaSigPro") +``` diff --git a/vignettes/scMaSigPro-Class.Rmd b/vignettes/scMaSigPro-Class.Rmd index 40ce57a..cb2f77d 100644 --- a/vignettes/scMaSigPro-Class.Rmd +++ b/vignettes/scMaSigPro-Class.Rmd @@ -1,5 +1,5 @@ --- -title: "scMaSigPro: scMaSigPro Class" +title: "scMaSigProClass" subtitle: "S4 Object, Generics and Queries" author: "Priyansh Srivastava" package: "scMaSigPro" @@ -11,7 +11,7 @@ abstract: | the S4 Object of the `scMaSigPro-Class` and generic methods. output: BiocStyle::html_document vignette: > - %\VignetteIndexEntry{scMaSigPro Class} + %\VignetteIndexEntry{scMaSigProClass} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -20,7 +20,6 @@ vignette: > knitr::opts_chunk$set(echo = TRUE) knitr::opts_chunk$set(crop = NULL) library(scMaSigPro) -library(patchwork) ``` ## Introduction @@ -32,9 +31,23 @@ to pinpoint genes exhibiting significant expression profile differences among branching paths and pseudotime.
## Installation + Currently, `scMaSigPro` is available on GitHub and can be installed as follows: -```{r, echo=TRUE, eval=FALSE} +### Bioconductor and Dependencies +```{r, eval=FALSE, echo=TRUE} +# Install Dependencies +if (!requireNamespace("BiocManager", quietly = TRUE)) { + install.packages("BiocManager") +} +BiocManager::install(version = "3.14") + +BiocManager::install(c("SingleCellExperiment", "maSigPro", "MatrixGenerics", "S4Vectors")) +``` + +### scMaSigPro latest version +To install `scMaSigPro` from GitHub, use the following R code: +```{r, eval=FALSE, echo=TRUE} # Install devtools if not already installed if (!requireNamespace("devtools", quietly = TRUE)) { install.packages("devtools") @@ -47,7 +60,7 @@ devtools::install_github("BioBam/scMaSigPro", build_manual = TRUE, upgrade = "never", force = TRUE, - quiet = TRUE, + quiet = TRUE ) ``` diff --git a/vignettes/scMaSigPro-maSigPro.Rmd b/vignettes/scMaSigPro-maSigPro.Rmd index 3929ca0..7c50faf 100644 --- a/vignettes/scMaSigPro-maSigPro.Rmd +++ b/vignettes/scMaSigPro-maSigPro.Rmd @@ -1,5 +1,5 @@ --- -title: "scMaSigPro: Bridging maSigPro Analysis" +title: "Bridging maSigPro Analysis" subtitle: "Guide to applying maSigPro workflow with scMaSigPro" author: "Priyansh Srivastava" package: "scMaSigPro" @@ -11,7 +11,7 @@ abstract: | the workflow of the original [`maSigPro`](https://www.bioconductor.org/packages/release/bioc/html/maSigPro.html) Bioconductor package of using `scMaSigPro`. output: BiocStyle::html_document vignette: > - %\VignetteIndexEntry{MaSigPro workflow} + %\VignetteIndexEntry{Bridging maSigPro Analysis} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -51,11 +51,24 @@ in modeling gene expression variability, a key factor in identifying significant changes in gene expression, thereby marking a significant step forward in the application of `maSigPro` to the more complex RNA-Seq data. -## Setup and Installation -### Installing `scMaSigPro` -`scMaSigPro` is currently available on GitHub. Follow these steps to install it: +## Installation -```{r, "scMaSigPro-install", echo=TRUE, eval=FALSE} +Currently, `scMaSigPro` is available on GitHub and can be installed as follows: + +### Bioconductor and Dependencies +```{r, eval=FALSE, echo=TRUE} +# Install Dependencies +if (!requireNamespace("BiocManager", quietly = TRUE)) { + install.packages("BiocManager") +} +BiocManager::install(version = "3.14") + +BiocManager::install(c("SingleCellExperiment", "maSigPro", "MatrixGenerics", "S4Vectors")) +``` + +### scMaSigPro latest version +To install `scMaSigPro` from GitHub, use the following R code: +```{r, eval=FALSE, echo=TRUE} # Install devtools if not already installed if (!requireNamespace("devtools", quietly = TRUE)) { install.packages("devtools") @@ -68,7 +81,7 @@ devtools::install_github("BioBam/scMaSigPro", build_manual = TRUE, upgrade = "never", force = TRUE, - quiet = TRUE, + quiet = TRUE ) ``` @@ -338,7 +351,7 @@ Having confirmed that the numerical values are identical for both methods, our next step is to assess whether the actual trends are also consistent. ### Visualizing Intersection `maSigPro::suma2Venn()` -```{r, "upset and venn"} +```{r, "upset and venn", warning=FALSE} # Venn Diagram of maSigPro suma2Venn(sigs$summary[, c(1:4)]) @@ -357,7 +370,7 @@ expression from the same experimental group is represented in the same color, an lines are drawn to join the averages of each time-group to visualize the trend of each group over time. For `scMaSigPro`, this functionality is achieved using `plotTrend()`. 
-```{r, "STMDE66"} +```{r, "STMDE66",warning=FALSE} # Extracting gene "STMDE66" from data STMDE66 <- data.abiotic[rownames(data.abiotic) == "STMDE66", ] @@ -371,12 +384,13 @@ PlotGroups(STMDE66, plotTrend(scmp_ob, "STMDE66", logs = FALSE, pseudoCount = 0, smoothness = 0.01, significant = FALSE, - summary_mode = "mean" + summary_mode = "mean", + curves = TRUE, lines = TRUE, points = TRUE ) ``` ### Visualizing Cluster trend with `maSigPro::PlotProfiles()` -```{r, "Cluster Plots", fig.width=7} +```{r, "Cluster Plots", fig.width=7, warning=FALSE} # Plot clustered Trend gc <- capture_output( res <- see.genes(sigs$sig.genes$ColdvsControl,