diff --git a/DESCRIPTION b/DESCRIPTION index 3b860c5..b13f612 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -39,7 +39,7 @@ Imports: WGCNA, limma, mvtnorm, - phyloseq , + phyloseq, utils biocViews: GeneExpression, RNASeq, SingleCell, Sequencing, DNASeq RoxygenNote: 6.1.1 diff --git a/NAMESPACE b/NAMESPACE index 8d825f4..ed352fa 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,7 +2,7 @@ export(SPsimSeq) export(configExperiment) -export(dSPsimSeq) +export(evaluateDensities) import(methods) importFrom(Hmisc,cut2) importFrom(SingleCellExperiment,SingleCellExperiment) diff --git a/R/SPsimSeq.R b/R/SPsimSeq.R index fb1fb11..e34e71f 100644 --- a/R/SPsimSeq.R +++ b/R/SPsimSeq.R @@ -83,7 +83,7 @@ #' data("zhang.data.sub") #' #' # filter genes with sufficient expression (important step to avoid bugs) -#' zhang.counts <- zhang.data.sub$counts[rowSums(zhang.data.sub$counts > 0)>=5, ] +#' zhang.counts <- zhang.data.sub$counts #' MYCN.status <- zhang.data.sub$MYCN.status #' #' # We simulate only a single data (n.sim = 1) with the following property @@ -100,13 +100,11 @@ #' #' # simulate data #' set.seed(6452) -#' zhang.counts2 <- zhang.counts[sample(nrow(zhang.counts), 2000), ] #' sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts2, #' group = MYCN.status, n.genes = 2000, batch.config = 1, #' group.config = c(0.5, 0.5), tot.samples = 20, #' pDE = 0.1, lfc.thrld = 0.5, result.format = "list") #' -#' #' head(sim.data.bulk$counts[[1]][, seq_len(5)]) # count data #' head(sim.data.bulk$colData) # sample info #' head(sim.data.bulk$rowData) # gene info @@ -131,14 +129,13 @@ #' data("scNGP.data") #' #' # filter genes with sufficient expression (important step to avoid bugs) -#' scNGP.data2 <- scNGP.data[rowSums(counts(scNGP.data) > 0)>=5, ] -#' treatment <- ifelse(scNGP.data2$characteristics..treatment=="nutlin",2,1) +#' treatment <- ifelse(scNGP.data$characteristics..treatment=="nutlin",2,1) #' #' set.seed(654321) -#' scNGP.data2 <- scNGP.data2[sample(nrow(scNGP.data2), 2000), ] +#' scNGP.data <- scNGP.data[sample(nrow(scNGP.data), 2000), ] #' #' # simulate data (we simulate here only a single data, n.sim = 1) -#' sim.data.sc <- SPsimSeq(n.sim = 1, s.data = scNGP.data2, group = treatment, +#' sim.data.sc <- SPsimSeq(n.sim = 1, s.data = scNGP.data, group = treatment, #' n.genes = 2000, batch.config = 1, #' group.config = c(0.5, 0.5), tot.samples = 100, #' pDE = 0.1, lfc.thrld = 0.5, model.zero.prob = TRUE, diff --git a/R/configExperiment.R b/R/configExperiment.R index 114bb9d..3feb295 100644 --- a/R/configExperiment.R +++ b/R/configExperiment.R @@ -12,7 +12,6 @@ #' and the experiment configurartion #' @export #' @examples -#' \donttest{ #' batch = sample(LETTERS[1:3], 20, replace = TRUE) #' group = sample(1:3, 20, replace = TRUE) #' #---- a design with a total of 10 samples/cells from 1 batch and 1 group @@ -34,7 +33,6 @@ #' # respectively. #' configExperiment(batch.config = c(5/30, 10/30, 15/30), group.config = c(0.5, 0.5), #' tot.samples = 30, batch = batch, group = group) -#' } configExperiment <- function(batch.config, group.config, tot.samples, batch, group){ #Sort, such that largest groups and largest batches match with simulation batch.config = sort(batch.config, decreasing = TRUE) diff --git a/R/dSPsimSeq.R b/R/dSPsimSeq.R deleted file mode 100644 index 4ad6179..0000000 --- a/R/dSPsimSeq.R +++ /dev/null @@ -1,45 +0,0 @@ -#' Evaluate the densities in the estimated SPsimSeq object -#' @param SPobj The SPsimSeq object, with details retained -#' @param newData A character vector of gene names -#' @return a list of estimated densities, breaks and midpoints, one for every -#' gene in newData -#' @export -#' @examples -#' data("zhang.data.sub") -#' # filter genes with sufficient expression (important step to avoid bugs) -#' zhang.counts <- zhang.data.sub$counts[rowSums(zhang.data.sub$counts > 0)>=5, ] -#' MYCN.status <- zhang.data.sub$MYCN.status -#' # simulate data -#' zhang.counts2 <- zhang.counts[sample(nrow(zhang.counts), 2000), ] -#' sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts2, -#' group = MYCN.status, n.genes = 2000, batch.config = 1, -#' group.config = c(0.5, 0.5), tot.samples = 20, -#' pDE = 0.1, lfc.thrld = 0.5, result.format = "list", -#' return.details = TRUE) -#' outDens = dSPsimSeq(sim.data.bulk) -#' select.genes <- sample(names(outDens), 4) -#' select.sample = sample( -#' seq_along(sim.data.bulk$detailed.results$exprmt.design$sub.groups), 1) -#' par(mfrow=c(2, 2)) -#' for(i in select.genes){ -#' plot(outDens[[i]][[select.sample]]$mids, outDens[[i]][[select.sample]]$gy, type = "l", -#' xlab = "Outcome", ylab = "Density", main = paste("Gene", i)) -#' } -dSPsimSeq <- function(SPobj, newData = names(SPobj$detailed.results$densList)){ - if(!"detailed.results" %in% names(SPobj)){ - stop("Estimated densities needed, try running SPsimSeq with - return.details = TRUE") - } - if(!is.character(newData)){ - stop("Provide a character vector of genes") - } - dets = SPobj$detailed.results #details - names(newData) = newData - #Construct the appropriate densities - lapply(newData, function(gene){ - constructDens(returnDens = TRUE, - DE.ind.ii = gene %in% dets$cand.DE.genes$nonnull.genes, - exprmt.design = dets$exprmt.design, - densList.ii = dets$densList[[gene]]) - }) -} \ No newline at end of file diff --git a/README.Rmd b/README.Rmd index 62b0e18..5de47d6 100644 --- a/README.Rmd +++ b/README.Rmd @@ -21,7 +21,7 @@ BioConductor installation ```{r biocinstall, eval = FALSE} library(BiocManager) -BiocManager::install("SPsimSeq", update = FALSE) +BiocManager::install("SPsimSeq") ``` ```{r loadspsimseqpackage} diff --git a/README.md b/README.md index f383ece..b40781d 100644 --- a/README.md +++ b/README.md @@ -68,8 +68,6 @@ sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts2, ## Estimating featurewise correlations ... - ## - ## Selecting candidate DE genes ... ## Note: The number of DE genes detected in the source data is 79 and the number of DE genes required to be included in the simulated data is 200. Therefore, candidiate DE genes are sampled with replacement. diff --git a/data/scNGP.data.rda.REMOVED.git-id b/data/scNGP.data.rda.REMOVED.git-id index b751cd2..0cb6220 100644 --- a/data/scNGP.data.rda.REMOVED.git-id +++ b/data/scNGP.data.rda.REMOVED.git-id @@ -1 +1 @@ -a4a3ab7f514bfc91ae71bd58dfb04b4529885753 \ No newline at end of file +4bd374a6e2e00cc4b65d0d8f85ce43ad41f2fdce \ No newline at end of file diff --git a/data/zhang.data.sub.rda b/data/zhang.data.sub.rda new file mode 100644 index 0000000..e2d3049 Binary files /dev/null and b/data/zhang.data.sub.rda differ diff --git a/data/zhang.data.sub.rda.REMOVED.git-id b/data/zhang.data.sub.rda.REMOVED.git-id deleted file mode 100644 index 007ef12..0000000 --- a/data/zhang.data.sub.rda.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -4112deec1acaf57235ca5cc301a1934f8fcf0408 \ No newline at end of file diff --git a/man/SPsimSeq.Rd b/man/SPsimSeq.Rd index e151a83..3a78249 100644 --- a/man/SPsimSeq.Rd +++ b/man/SPsimSeq.Rd @@ -121,7 +121,7 @@ estimate the probability distributions. data("zhang.data.sub") # filter genes with sufficient expression (important step to avoid bugs) -zhang.counts <- zhang.data.sub$counts[rowSums(zhang.data.sub$counts > 0)>=5, ] +zhang.counts <- zhang.data.sub$counts MYCN.status <- zhang.data.sub$MYCN.status # We simulate only a single data (n.sim = 1) with the following property @@ -138,13 +138,11 @@ MYCN.status <- zhang.data.sub$MYCN.status # simulate data set.seed(6452) -zhang.counts2 <- zhang.counts[sample(nrow(zhang.counts), 2000), ] sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts2, group = MYCN.status, n.genes = 2000, batch.config = 1, group.config = c(0.5, 0.5), tot.samples = 20, pDE = 0.1, lfc.thrld = 0.5, result.format = "list") - head(sim.data.bulk$counts[[1]][, seq_len(5)]) # count data head(sim.data.bulk$colData) # sample info head(sim.data.bulk$rowData) # gene info @@ -169,14 +167,13 @@ library(SingleCellExperiment) data("scNGP.data") # filter genes with sufficient expression (important step to avoid bugs) -scNGP.data2 <- scNGP.data[rowSums(counts(scNGP.data) > 0)>=5, ] -treatment <- ifelse(scNGP.data2$characteristics..treatment=="nutlin",2,1) +treatment <- ifelse(scNGP.data$characteristics..treatment=="nutlin",2,1) set.seed(654321) -scNGP.data2 <- scNGP.data2[sample(nrow(scNGP.data2), 2000), ] +scNGP.data <- scNGP.data[sample(nrow(scNGP.data), 2000), ] # simulate data (we simulate here only a single data, n.sim = 1) -sim.data.sc <- SPsimSeq(n.sim = 1, s.data = scNGP.data2, group = treatment, +sim.data.sc <- SPsimSeq(n.sim = 1, s.data = scNGP.data, group = treatment, n.genes = 2000, batch.config = 1, group.config = c(0.5, 0.5), tot.samples = 100, pDE = 0.1, lfc.thrld = 0.5, model.zero.prob = TRUE, diff --git a/man/configExperiment.Rd b/man/configExperiment.Rd index ddea043..cbbccf4 100644 --- a/man/configExperiment.Rd +++ b/man/configExperiment.Rd @@ -26,7 +26,6 @@ and the experiment configurartion Configure experiment } \examples{ -\donttest{ batch = sample(LETTERS[1:3], 20, replace = TRUE) group = sample(1:3, 20, replace = TRUE) #---- a design with a total of 10 samples/cells from 1 batch and 1 group @@ -49,4 +48,3 @@ batch = batch, group = group) configExperiment(batch.config = c(5/30, 10/30, 15/30), group.config = c(0.5, 0.5), tot.samples = 30, batch = batch, group = group) } -} diff --git a/man/dSPsimSeq.Rd b/man/dSPsimSeq.Rd deleted file mode 100644 index 2815556..0000000 --- a/man/dSPsimSeq.Rd +++ /dev/null @@ -1,42 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dSPsimSeq.R -\name{dSPsimSeq} -\alias{dSPsimSeq} -\title{Evaluate the densities in the estimated SPsimSeq object} -\usage{ -dSPsimSeq(SPobj, newData = names(SPobj$detailed.results$densList)) -} -\arguments{ -\item{SPobj}{The SPsimSeq object, with details retained} - -\item{newData}{A character vector of gene names} -} -\value{ -a list of estimated densities, breaks and midpoints, one for every -gene in newData -} -\description{ -Evaluate the densities in the estimated SPsimSeq object -} -\examples{ -data("zhang.data.sub") -# filter genes with sufficient expression (important step to avoid bugs) -zhang.counts <- zhang.data.sub$counts[rowSums(zhang.data.sub$counts > 0)>=5, ] -MYCN.status <- zhang.data.sub$MYCN.status -# simulate data -zhang.counts2 <- zhang.counts[sample(nrow(zhang.counts), 2000), ] -sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts2, - group = MYCN.status, n.genes = 2000, batch.config = 1, - group.config = c(0.5, 0.5), tot.samples = 20, - pDE = 0.1, lfc.thrld = 0.5, result.format = "list", - return.details = TRUE) -outDens = dSPsimSeq(sim.data.bulk) -select.genes <- sample(names(outDens), 4) -select.sample = sample( -seq_along(sim.data.bulk$detailed.results$exprmt.design$sub.groups), 1) -par(mfrow=c(2, 2)) -for(i in select.genes){ - plot(outDens[[i]][[select.sample]]$mids, outDens[[i]][[select.sample]]$gy, type = "l", - xlab = "Outcome", ylab = "Density", main = paste("Gene", i)) - } -} diff --git a/vignettes/SPsimSeq.Rmd b/vignettes/SPsimSeq.Rmd index a627adc..df723bb 100644 --- a/vignettes/SPsimSeq.Rmd +++ b/vignettes/SPsimSeq.Rmd @@ -1,4 +1,7 @@ --- +title: "SPsimSeq: semi-parametric simulation for bulk and single cell RNA-seq data" +author: "Alemu Takele Assefa and Stijn Hawinkel" +date: "`r Sys.Date()`" output: rmarkdown::html_vignette: toc: true @@ -51,7 +54,6 @@ library(SPsimSeq) MYCN.status <- zhang.data.sub$MYCN.status set.seed(6452) - zhang.counts2 <- zhang.counts[sample(nrow(zhang.counts), 3000), ] # We simulate only a single data (n.sim = 1) with the following property # - 3000 genes ( n.genes = 3000) # - 172 samples (tot.samples = 172) -- equal to the source data @@ -65,7 +67,7 @@ library(SPsimSeq) # estimation (model.zero.prob = FALSE) # simulate data - sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts2, + sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts, group = MYCN.status, n.genes = 3000, batch.config = 1, group.config = c(0.5, 0.5), tot.samples = 172, pDE = 0.1, lfc.thrld = 0.5, result.format = "list") @@ -181,14 +183,11 @@ points(quantile(cor.vec.Y0, seq(0, 1, 0.001)), quantile(cor.vec.Y1, seq(0, 1, 0. data("scNGP.data") # filter genes with sufficient expression (important step to avoid bugs) - scNGP.data2 <- scNGP.data[rowSums(counts(scNGP.data) > 0)>=5, ] - treatment <- ifelse(scNGP.data2$characteristics..treatment=="nutlin",2,1) + treatment <- ifelse(scNGP.data$characteristics..treatment=="nutlin",2,1) set.seed(654321) - scNGP.data2 <- scNGP.data2[sample(nrow(scNGP.data2), 5000), ] - # simulate data (we simulate here only a single data, n.sim = 1) - sim.data.sc <- SPsimSeq(n.sim = 1, s.data = scNGP.data2, + sim.data.sc <- SPsimSeq(n.sim = 1, s.data = scNGP.data, group = treatment, n.genes = 5000, batch.config = 1, group.config = c(0.5, 0.5), tot.samples = 100, pDE = 0.1, lfc.thrld = 0.5, model.zero.prob = TRUE, @@ -205,7 +204,7 @@ points(quantile(cor.vec.Y0, seq(0, 1, 0.001)), quantile(cor.vec.Y1, seq(0, 1, 0. # and fraction of zero counts per gene # normalize counts for comparison -Y0.log.cpm <- log2(edgeR::cpm(counts(scNGP.data2))+1) +Y0.log.cpm <- log2(edgeR::cpm(counts(scNGP.data))+1) Y1.log.cpm <- log2(edgeR::cpm(counts(sim.data.sc1))+1) Y0.log.cpm <- Y0.log.cpm[rowMeans(Y0.log.cpm>0)>=0.1, ] Y1.log.cpm <- Y1.log.cpm[rowMeans(Y1.log.cpm>0)>=0.1, ] @@ -216,7 +215,7 @@ rowCVs <- function(X){apply(X, 1, function(x) sd(x, na.rm=TRUE)/mean(x, na.rm=TR rowZeroFrac <- function(X){apply(X, 1, function(x) mean(x==0, na.rm=TRUE))} par(mfrow=c(1, 3)) -boxplot(list(real.data=colSums(counts(scNGP.data2)), +boxplot(list(real.data=colSums(counts(scNGP.data)), simulated.data=colData(sim.data.sc1)$sim.Lib.Size), main="library size") boxplot(list(real.data=rowMeans(Y0.log.cpm), @@ -324,4 +323,4 @@ sessionInfo() # References -References missing? +\printbibliography