Reducing data size

Former-commit-id: 95c169c Former-commit-id: ac586a15c9ab547b03c782d2fec6fb73615e4421 Former-commit-id: f02bb2f3ed0917a5cb97e8eb1f650064a3f729cc Former-commit-id: a0aa2f8 Former-commit-id: cbbaf371263a0cd187be254b1140dbd0a2e32522 Former-commit-id: 714267e
CenterForStatistics-UGent · Mar 27, 2020 · 864f8d0 · 864f8d0
1 parent ff684a9
commit 864f8d0
Show file tree

Hide file tree

Showing 14 changed files with 21 additions and 122 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -39,7 +39,7 @@ Imports:
          WGCNA, 
          limma, 
          mvtnorm,
-         phyloseq ,
+         phyloseq,
          utils
 biocViews: GeneExpression, RNASeq, SingleCell, Sequencing, DNASeq
 RoxygenNote: 6.1.1

diff --git a/NAMESPACE b/NAMESPACE
@@ -2,7 +2,7 @@
 
 export(SPsimSeq)
 export(configExperiment)
-export(dSPsimSeq)
+export(evaluateDensities)
 import(methods)
 importFrom(Hmisc,cut2)
 importFrom(SingleCellExperiment,SingleCellExperiment)

diff --git a/R/SPsimSeq.R b/R/SPsimSeq.R
@@ -83,7 +83,7 @@
 #' data("zhang.data.sub")
 #'
 #' # filter genes with sufficient expression (important step to avoid bugs)
-#' zhang.counts <- zhang.data.sub$counts[rowSums(zhang.data.sub$counts > 0)>=5, ]
+#' zhang.counts <- zhang.data.sub$counts
 #' MYCN.status  <- zhang.data.sub$MYCN.status
 #'
 #' # We simulate only a single dataset (n.sim = 1) with the following properties
@@ -100,13 +100,11 @@
 #'
 #' # simulate data
 #' set.seed(6452)
-#' zhang.counts2 <- zhang.counts[sample(nrow(zhang.counts), 2000), ]
 #' sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts2,
 #'                           group = MYCN.status, n.genes = 2000, batch.config = 1,
 #'                           group.config = c(0.5, 0.5), tot.samples = 20,
 #'                           pDE = 0.1, lfc.thrld = 0.5, result.format = "list")
 #'
-#'
 #' head(sim.data.bulk$counts[[1]][, seq_len(5)])  # count data
 #' head(sim.data.bulk$colData)        # sample info
 #' head(sim.data.bulk$rowData)        # gene info
@@ -131,14 +129,13 @@
 #' data("scNGP.data")
 #'
 #' # filter genes with sufficient expression (important step to avoid bugs)
-#' scNGP.data2 <- scNGP.data[rowSums(counts(scNGP.data) > 0)>=5, ]
-#' treatment <- ifelse(scNGP.data2$characteristics..treatment=="nutlin",2,1)
+#' treatment <- ifelse(scNGP.data$characteristics..treatment=="nutlin",2,1)
 #'
 #' set.seed(654321)
-#' scNGP.data2 <- scNGP.data2[sample(nrow(scNGP.data2), 2000), ]
+#' scNGP.data <- scNGP.data[sample(nrow(scNGP.data), 2000), ]
 #'
 #' # simulate data (we simulate here only a single dataset, n.sim = 1)
-#' sim.data.sc <- SPsimSeq(n.sim = 1, s.data = scNGP.data2, group = treatment,
+#' sim.data.sc <- SPsimSeq(n.sim = 1, s.data = scNGP.data, group = treatment,
 #'  n.genes = 2000, batch.config = 1,
 #'                       group.config = c(0.5, 0.5), tot.samples = 100,
 #'                      pDE = 0.1, lfc.thrld = 0.5, model.zero.prob = TRUE,

diff --git a/R/configExperiment.R b/R/configExperiment.R
@@ -12,7 +12,6 @@
 #' and the experiment configuration 
 #' @export
 #' @examples 
-#' \donttest{
 #' batch = sample(LETTERS[1:3], 20, replace = TRUE)
 #' group = sample(1:3, 20, replace = TRUE)
 #' #---- a design with a total of 10 samples/cells from 1 batch and 1 group
@@ -34,7 +33,6 @@
 #' # respectively.
 #' configExperiment(batch.config = c(5/30, 10/30, 15/30), group.config = c(0.5, 0.5),
 #'  tot.samples = 30, batch = batch, group = group)
-#' }
 configExperiment <- function(batch.config, group.config, tot.samples, batch, group){
   #Sort, such that largest groups and largest batches match with simulation
   batch.config = sort(batch.config, decreasing = TRUE)

diff --git a/R/dSPsimSeq.R b/R/dSPsimSeq.R
diff --git a/README.Rmd b/README.Rmd
@@ -21,7 +21,7 @@ BioConductor installation
 
 ```{r biocinstall, eval = FALSE}
 library(BiocManager)
-BiocManager::install("SPsimSeq", update = FALSE)
+BiocManager::install("SPsimSeq")
 ```
 
 ```{r loadspsimseqpackage}

diff --git a/README.md b/README.md
@@ -68,8 +68,6 @@ sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts2,
 
     ## Estimating featurewise correlations ...
 
-    ## 
-
     ## Selecting candidate DE genes ...
 
     ## Note: The number of DE genes detected in the source data is 79 and the number of DE genes required to be included in the simulated data is 200. Therefore, candidiate DE genes are sampled with replacement.

diff --git a/data/scNGP.data.rda.REMOVED.git-id b/data/scNGP.data.rda.REMOVED.git-id
@@ -1 +1 @@
-a4a3ab7f514bfc91ae71bd58dfb04b4529885753
+4bd374a6e2e00cc4b65d0d8f85ce43ad41f2fdce
diff --git a/data/zhang.data.sub.rda b/data/zhang.data.sub.rda
diff --git a/data/zhang.data.sub.rda.REMOVED.git-id b/data/zhang.data.sub.rda.REMOVED.git-id
diff --git a/man/SPsimSeq.Rd b/man/SPsimSeq.Rd
diff --git a/man/configExperiment.Rd b/man/configExperiment.Rd
diff --git a/man/dSPsimSeq.Rd b/man/dSPsimSeq.Rd
diff --git a/vignettes/SPsimSeq.Rmd b/vignettes/SPsimSeq.Rmd
@@ -1,4 +1,7 @@
 ---
+title: "SPsimSeq: semi-parametric simulation for bulk and single cell RNA-seq data"
+author: "Alemu Takele Assefa and Stijn Hawinkel"
+date: "`r Sys.Date()`"
 output: 
   rmarkdown::html_vignette:
     toc: true
@@ -51,7 +54,6 @@ library(SPsimSeq)
  MYCN.status  <- zhang.data.sub$MYCN.status 
  
  set.seed(6452)
- zhang.counts2 <- zhang.counts[sample(nrow(zhang.counts), 3000), ]
  # We simulate only a single dataset (n.sim = 1) with the following properties
  # - 3000 genes ( n.genes = 3000) 
  # - 172 samples (tot.samples = 172) -- equal to the source data
@@ -65,7 +67,7 @@ library(SPsimSeq)
  #    estimation (model.zero.prob = FALSE)
  
  # simulate data
- sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts2,
+ sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts,
                           group = MYCN.status, n.genes = 3000, batch.config = 1,
                           group.config = c(0.5, 0.5), tot.samples = 172, 
                           pDE = 0.1, lfc.thrld = 0.5, result.format = "list")
@@ -181,14 +183,11 @@ points(quantile(cor.vec.Y0, seq(0, 1, 0.001)), quantile(cor.vec.Y1, seq(0, 1, 0.
   data("scNGP.data")
  
  # filter genes with sufficient expression (important step to avoid bugs) 
- scNGP.data2 <- scNGP.data[rowSums(counts(scNGP.data) > 0)>=5, ]  
- treatment <- ifelse(scNGP.data2$characteristics..treatment=="nutlin",2,1)
+ treatment <- ifelse(scNGP.data$characteristics..treatment=="nutlin",2,1)
  
  set.seed(654321)
- scNGP.data2 <- scNGP.data2[sample(nrow(scNGP.data2), 5000), ]
- 
  # simulate data (we simulate here only a single dataset, n.sim = 1)
- sim.data.sc <- SPsimSeq(n.sim = 1, s.data = scNGP.data2,
+ sim.data.sc <- SPsimSeq(n.sim = 1, s.data = scNGP.data,
                          group = treatment, n.genes = 5000, batch.config = 1,
                          group.config = c(0.5, 0.5), tot.samples = 100, 
                          pDE = 0.1, lfc.thrld = 0.5, model.zero.prob = TRUE,
@@ -205,7 +204,7 @@ points(quantile(cor.vec.Y0, seq(0, 1, 0.001)), quantile(cor.vec.Y1, seq(0, 1, 0.
  # and fraction of zero counts per gene
  
 # normalize counts for comparison  
-Y0.log.cpm  <- log2(edgeR::cpm(counts(scNGP.data2))+1)
+Y0.log.cpm  <- log2(edgeR::cpm(counts(scNGP.data))+1)
 Y1.log.cpm  <- log2(edgeR::cpm(counts(sim.data.sc1))+1)
 Y0.log.cpm  <- Y0.log.cpm[rowMeans(Y0.log.cpm>0)>=0.1, ]
 Y1.log.cpm  <- Y1.log.cpm[rowMeans(Y1.log.cpm>0)>=0.1, ]
@@ -216,7 +215,7 @@ rowCVs <- function(X){apply(X, 1, function(x) sd(x, na.rm=TRUE)/mean(x, na.rm=TR
 rowZeroFrac <- function(X){apply(X, 1, function(x) mean(x==0, na.rm=TRUE))}
  
 par(mfrow=c(1, 3))
-boxplot(list(real.data=colSums(counts(scNGP.data2)), 
+boxplot(list(real.data=colSums(counts(scNGP.data)), 
              simulated.data=colData(sim.data.sc1)$sim.Lib.Size), 
         main="library size") 
 boxplot(list(real.data=rowMeans(Y0.log.cpm), 
@@ -324,4 +323,4 @@ sessionInfo()
 
 # References
 
-References missing?
+\printbibliography
-Original file line number
+Diff line change
@@ Expand Up / @@ -68,8 +68,6 @@ sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts2, @@
         ## Estimating featurewise correlations ...
-        ##
         ## Selecting candidate DE genes ...
         ## Note: The number of DE genes detected in the source data is 79 and the number of DE genes required to be included in the simulated data is 200. Therefore, candidiate DE genes are sampled with replacement.
@@ Expand Down @@
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		a4a3ab7f514bfc91ae71bd58dfb04b4529885753
		4bd374a6e2e00cc4b65d0d8f85ce43ad41f2fdce