Reducing data size

CenterForStatistics-UGent · Mar 27, 2020 · 95c169c · 95c169c
1 parent 14eb5cf
commit 95c169c
Show file tree

Hide file tree

Showing 13 changed files with 20 additions and 120 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -39,7 +39,7 @@ Imports:
          WGCNA, 
          limma, 
          mvtnorm,
-         phyloseq ,
+         phyloseq,
          utils
 biocViews: GeneExpression, RNASeq, SingleCell, Sequencing, DNASeq
 RoxygenNote: 6.1.1

diff --git a/NAMESPACE b/NAMESPACE
@@ -2,7 +2,7 @@
 
 export(SPsimSeq)
 export(configExperiment)
-export(dSPsimSeq)
+export(evaluateDensities)
 import(methods)
 importFrom(Hmisc,cut2)
 importFrom(SingleCellExperiment,SingleCellExperiment)

diff --git a/R/SPsimSeq.R b/R/SPsimSeq.R
@@ -83,7 +83,7 @@
 #' data("zhang.data.sub")
 #'
 #' # filter genes with sufficient expression (important step to avoid bugs)
-#' zhang.counts <- zhang.data.sub$counts[rowSums(zhang.data.sub$counts > 0)>=5, ]
+#' zhang.counts <- zhang.data.sub$counts
 #' MYCN.status  <- zhang.data.sub$MYCN.status
 #'
 #' # We simulate only a single data (n.sim = 1) with the following property
@@ -100,13 +100,11 @@
 #'
 #' # simulate data
 #' set.seed(6452)
-#' zhang.counts2 <- zhang.counts[sample(nrow(zhang.counts), 2000), ]
 #' sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts,
 #'                           group = MYCN.status, n.genes = 2000, batch.config = 1,
 #'                           group.config = c(0.5, 0.5), tot.samples = 20,
 #'                           pDE = 0.1, lfc.thrld = 0.5, result.format = "list")
 #'
-#'
 #' head(sim.data.bulk$counts[[1]][, seq_len(5)])  # count data
 #' head(sim.data.bulk$colData)        # sample info
 #' head(sim.data.bulk$rowData)        # gene info
@@ -131,14 +129,13 @@
 #' data("scNGP.data")
 #'
 #' # filter genes with sufficient expression (important step to avoid bugs)
-#' scNGP.data2 <- scNGP.data[rowSums(counts(scNGP.data) > 0)>=5, ]
-#' treatment <- ifelse(scNGP.data2$characteristics..treatment=="nutlin",2,1)
+#' treatment <- ifelse(scNGP.data$characteristics..treatment=="nutlin",2,1)
 #'
 #' set.seed(654321)
-#' scNGP.data2 <- scNGP.data2[sample(nrow(scNGP.data2), 2000), ]
+#' scNGP.data <- scNGP.data[sample(nrow(scNGP.data), 2000), ]
 #'
 #' # simulate data (we simulate here only a single data, n.sim = 1)
-#' sim.data.sc <- SPsimSeq(n.sim = 1, s.data = scNGP.data2, group = treatment,
+#' sim.data.sc <- SPsimSeq(n.sim = 1, s.data = scNGP.data, group = treatment,
 #'  n.genes = 2000, batch.config = 1,
 #'                       group.config = c(0.5, 0.5), tot.samples = 100,
 #'                      pDE = 0.1, lfc.thrld = 0.5, model.zero.prob = TRUE,

diff --git a/R/configExperiment.R b/R/configExperiment.R
@@ -12,7 +12,6 @@
 #' and the experiment configuration 
 #' @export
 #' @examples 
-#' \donttest{
 #' batch = sample(LETTERS[1:3], 20, replace = TRUE)
 #' group = sample(1:3, 20, replace = TRUE)
 #' #---- a design with a total of 10 samples/cells from 1 batch and 1 group
@@ -34,7 +33,6 @@
 #' # respectively.
 #' configExperiment(batch.config = c(5/30, 10/30, 15/30), group.config = c(0.5, 0.5),
 #'  tot.samples = 30, batch = batch, group = group)
-#' }
 configExperiment <- function(batch.config, group.config, tot.samples, batch, group){
   #Sort, such that largest groups and largest batches match with simulation
   batch.config = sort(batch.config, decreasing = TRUE)

diff --git a/R/dSPsimSeq.R b/R/dSPsimSeq.R
diff --git a/README.Rmd b/README.Rmd
@@ -21,7 +21,7 @@ BioConductor installation
 
 ```{r biocinstall, eval = FALSE}
 library(BiocManager)
-BiocManager::install("SPsimSeq", update = FALSE)
+BiocManager::install("SPsimSeq")
 ```
 
 ```{r loadspsimseqpackage}

diff --git a/README.md b/README.md
@@ -68,8 +68,6 @@ sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts2,
 
     ## Estimating featurewise correlations ...
 
-    ## 
-
     ## Selecting candidate DE genes ...
 
     ## Note: The number of DE genes detected in the source data is 79 and the number of DE genes required to be included in the simulated data is 200. Therefore, candidiate DE genes are sampled with replacement.

diff --git a/data/scNGP.data.rda b/data/scNGP.data.rda
diff --git a/data/zhang.data.sub.rda b/data/zhang.data.sub.rda
diff --git a/man/SPsimSeq.Rd b/man/SPsimSeq.Rd
diff --git a/man/configExperiment.Rd b/man/configExperiment.Rd
diff --git a/man/dSPsimSeq.Rd b/man/dSPsimSeq.Rd
diff --git a/vignettes/SPsimSeq.Rmd b/vignettes/SPsimSeq.Rmd
@@ -1,4 +1,7 @@
 ---
+title: "SPsimSeq: semi-parametric simulation for bulk and single cell RNA-seq data"
+author: "Alemu Takele Assefa and Stijn Hawinkel"
+date: "`r Sys.Date()`"
 output: 
   rmarkdown::html_vignette:
     toc: true
@@ -51,7 +54,6 @@ library(SPsimSeq)
  MYCN.status  <- zhang.data.sub$MYCN.status 
  
  set.seed(6452)
- zhang.counts2 <- zhang.counts[sample(nrow(zhang.counts), 3000), ]
  # We simulate only a single data (n.sim = 1) with the following property
  # - 3000 genes ( n.genes = 3000) 
  # - 172 samples (tot.samples = 172) -- equal to the source data
@@ -65,7 +67,7 @@ library(SPsimSeq)
  #    estimation (model.zero.prob = FALSE)
  
  # simulate data
- sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts2,
+ sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts,
                           group = MYCN.status, n.genes = 3000, batch.config = 1,
                           group.config = c(0.5, 0.5), tot.samples = 172, 
                           pDE = 0.1, lfc.thrld = 0.5, result.format = "list")
@@ -181,14 +183,11 @@ points(quantile(cor.vec.Y0, seq(0, 1, 0.001)), quantile(cor.vec.Y1, seq(0, 1, 0.
   data("scNGP.data")
  
  # filter genes with sufficient expression (important step to avoid bugs) 
- scNGP.data2 <- scNGP.data[rowSums(counts(scNGP.data) > 0)>=5, ]  
- treatment <- ifelse(scNGP.data2$characteristics..treatment=="nutlin",2,1)
+ treatment <- ifelse(scNGP.data$characteristics..treatment=="nutlin",2,1)
  
  set.seed(654321)
- scNGP.data2 <- scNGP.data2[sample(nrow(scNGP.data2), 5000), ]
- 
  # simulate data (we simulate here only a single data, n.sim = 1)
- sim.data.sc <- SPsimSeq(n.sim = 1, s.data = scNGP.data2,
+ sim.data.sc <- SPsimSeq(n.sim = 1, s.data = scNGP.data,
                          group = treatment, n.genes = 5000, batch.config = 1,
                          group.config = c(0.5, 0.5), tot.samples = 100, 
                          pDE = 0.1, lfc.thrld = 0.5, model.zero.prob = TRUE,
@@ -205,7 +204,7 @@ points(quantile(cor.vec.Y0, seq(0, 1, 0.001)), quantile(cor.vec.Y1, seq(0, 1, 0.
  # and fraction of zero counts per gene
  
 # normalize counts for comparison  
-Y0.log.cpm  <- log2(edgeR::cpm(counts(scNGP.data2))+1)
+Y0.log.cpm  <- log2(edgeR::cpm(counts(scNGP.data))+1)
 Y1.log.cpm  <- log2(edgeR::cpm(counts(sim.data.sc1))+1)
 Y0.log.cpm  <- Y0.log.cpm[rowMeans(Y0.log.cpm>0)>=0.1, ]
 Y1.log.cpm  <- Y1.log.cpm[rowMeans(Y1.log.cpm>0)>=0.1, ]
@@ -216,7 +215,7 @@ rowCVs <- function(X){apply(X, 1, function(x) sd(x, na.rm=TRUE)/mean(x, na.rm=TR
 rowZeroFrac <- function(X){apply(X, 1, function(x) mean(x==0, na.rm=TRUE))}
  
 par(mfrow=c(1, 3))
-boxplot(list(real.data=colSums(counts(scNGP.data2)), 
+boxplot(list(real.data=colSums(counts(scNGP.data)), 
              simulated.data=colData(sim.data.sc1)$sim.Lib.Size), 
         main="library size") 
 boxplot(list(real.data=rowMeans(Y0.log.cpm), 
@@ -324,4 +323,4 @@ sessionInfo()
 
 # References
 
-References missing?
+\printbibliography
-Original file line number
+Diff line change
@@ Expand Up / @@ -68,8 +68,6 @@ sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts2, @@
         ## Estimating featurewise correlations ...
-        ##
         ## Selecting candidate DE genes ...
         ## Note: The number of DE genes detected in the source data is 79 and the number of DE genes required to be included in the simulated data is 200. Therefore, candidiate DE genes are sampled with replacement.
@@ Expand Down @@