Skip to content

Commit

Permalink
Reducing data size
Browse files Browse the repository at this point in the history
Former-commit-id: 95c169c
Former-commit-id: ac586a15c9ab547b03c782d2fec6fb73615e4421
Former-commit-id: f02bb2f3ed0917a5cb97e8eb1f650064a3f729cc
Former-commit-id: a0aa2f8
Former-commit-id: cbbaf371263a0cd187be254b1140dbd0a2e32522
Former-commit-id: 714267e
  • Loading branch information
sthawinke committed Mar 27, 2020
1 parent ff684a9 commit 864f8d0
Show file tree
Hide file tree
Showing 14 changed files with 21 additions and 122 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ Imports:
WGCNA,
limma,
mvtnorm,
phyloseq ,
phyloseq,
utils
biocViews: GeneExpression, RNASeq, SingleCell, Sequencing, DNASeq
RoxygenNote: 6.1.1
Expand Down
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

export(SPsimSeq)
export(configExperiment)
export(dSPsimSeq)
export(evaluateDensities)
import(methods)
importFrom(Hmisc,cut2)
importFrom(SingleCellExperiment,SingleCellExperiment)
Expand Down
11 changes: 4 additions & 7 deletions R/SPsimSeq.R
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@
#' data("zhang.data.sub")
#'
#' # filter genes with sufficient expression (important step to avoid bugs)
#' zhang.counts <- zhang.data.sub$counts[rowSums(zhang.data.sub$counts > 0)>=5, ]
#' zhang.counts <- zhang.data.sub$counts
#' MYCN.status <- zhang.data.sub$MYCN.status
#'
#' # We simulate only a single dataset (n.sim = 1) with the following properties
Expand All @@ -100,13 +100,11 @@
#'
#' # simulate data
#' set.seed(6452)
#' zhang.counts2 <- zhang.counts[sample(nrow(zhang.counts), 2000), ]
#' sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts2,
#' group = MYCN.status, n.genes = 2000, batch.config = 1,
#' group.config = c(0.5, 0.5), tot.samples = 20,
#' pDE = 0.1, lfc.thrld = 0.5, result.format = "list")
#'
#'
#' head(sim.data.bulk$counts[[1]][, seq_len(5)]) # count data
#' head(sim.data.bulk$colData) # sample info
#' head(sim.data.bulk$rowData) # gene info
Expand All @@ -131,14 +129,13 @@
#' data("scNGP.data")
#'
#' # filter genes with sufficient expression (important step to avoid bugs)
#' scNGP.data2 <- scNGP.data[rowSums(counts(scNGP.data) > 0)>=5, ]
#' treatment <- ifelse(scNGP.data2$characteristics..treatment=="nutlin",2,1)
#' treatment <- ifelse(scNGP.data$characteristics..treatment=="nutlin",2,1)
#'
#' set.seed(654321)
#' scNGP.data2 <- scNGP.data2[sample(nrow(scNGP.data2), 2000), ]
#' scNGP.data <- scNGP.data[sample(nrow(scNGP.data), 2000), ]
#'
#' # simulate data (here we simulate only a single dataset, n.sim = 1)
#' sim.data.sc <- SPsimSeq(n.sim = 1, s.data = scNGP.data2, group = treatment,
#' sim.data.sc <- SPsimSeq(n.sim = 1, s.data = scNGP.data, group = treatment,
#' n.genes = 2000, batch.config = 1,
#' group.config = c(0.5, 0.5), tot.samples = 100,
#' pDE = 0.1, lfc.thrld = 0.5, model.zero.prob = TRUE,
Expand Down
2 changes: 0 additions & 2 deletions R/configExperiment.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
#' and the experiment configuration
#' @export
#' @examples
#' \donttest{
#' batch = sample(LETTERS[1:3], 20, replace = TRUE)
#' group = sample(1:3, 20, replace = TRUE)
#' #---- a design with a total of 10 samples/cells from 1 batch and 1 group
Expand All @@ -34,7 +33,6 @@
#' # respectively.
#' configExperiment(batch.config = c(5/30, 10/30, 15/30), group.config = c(0.5, 0.5),
#' tot.samples = 30, batch = batch, group = group)
#' }
configExperiment <- function(batch.config, group.config, tot.samples, batch, group){
#Sort, such that largest groups and largest batches match with simulation
batch.config = sort(batch.config, decreasing = TRUE)
Expand Down
45 changes: 0 additions & 45 deletions R/dSPsimSeq.R

This file was deleted.

2 changes: 1 addition & 1 deletion README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ BioConductor installation

```{r biocinstall, eval = FALSE}
library(BiocManager)
BiocManager::install("SPsimSeq", update = FALSE)
BiocManager::install("SPsimSeq")
```

```{r loadspsimseqpackage}
Expand Down
2 changes: 0 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,6 @@ sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts2,

## Estimating featurewise correlations ...

##

## Selecting candidate DE genes ...

## Note: The number of DE genes detected in the source data is 79 and the number of DE genes required to be included in the simulated data is 200. Therefore, candidiate DE genes are sampled with replacement.
Expand Down
2 changes: 1 addition & 1 deletion data/scNGP.data.rda.REMOVED.git-id
Original file line number Diff line number Diff line change
@@ -1 +1 @@
a4a3ab7f514bfc91ae71bd58dfb04b4529885753
4bd374a6e2e00cc4b65d0d8f85ce43ad41f2fdce
Binary file added data/zhang.data.sub.rda
Binary file not shown.
1 change: 0 additions & 1 deletion data/zhang.data.sub.rda.REMOVED.git-id

This file was deleted.

11 changes: 4 additions & 7 deletions man/SPsimSeq.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 0 additions & 2 deletions man/configExperiment.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

42 changes: 0 additions & 42 deletions man/dSPsimSeq.Rd

This file was deleted.

19 changes: 9 additions & 10 deletions vignettes/SPsimSeq.Rmd
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
---
title: "SPsimSeq: semi-parametric simulation for bulk and single cell RNA-seq data"
author: "Alemu Takele Assefa and Stijn Hawinkel"
date: "`r Sys.Date()`"
output:
rmarkdown::html_vignette:
toc: true
Expand Down Expand Up @@ -51,7 +54,6 @@ library(SPsimSeq)
MYCN.status <- zhang.data.sub$MYCN.status
set.seed(6452)
zhang.counts2 <- zhang.counts[sample(nrow(zhang.counts), 3000), ]
# We simulate only a single dataset (n.sim = 1) with the following properties
# - 3000 genes ( n.genes = 3000)
# - 172 samples (tot.samples = 172) -- equal to the source data
Expand All @@ -65,7 +67,7 @@ library(SPsimSeq)
# estimation (model.zero.prob = FALSE)
# simulate data
sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts2,
sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts,
group = MYCN.status, n.genes = 3000, batch.config = 1,
group.config = c(0.5, 0.5), tot.samples = 172,
pDE = 0.1, lfc.thrld = 0.5, result.format = "list")
Expand Down Expand Up @@ -181,14 +183,11 @@ points(quantile(cor.vec.Y0, seq(0, 1, 0.001)), quantile(cor.vec.Y1, seq(0, 1, 0.
data("scNGP.data")
# filter genes with sufficient expression (important step to avoid bugs)
scNGP.data2 <- scNGP.data[rowSums(counts(scNGP.data) > 0)>=5, ]
treatment <- ifelse(scNGP.data2$characteristics..treatment=="nutlin",2,1)
treatment <- ifelse(scNGP.data$characteristics..treatment=="nutlin",2,1)
set.seed(654321)
scNGP.data2 <- scNGP.data2[sample(nrow(scNGP.data2), 5000), ]
# simulate data (here we simulate only a single dataset, n.sim = 1)
sim.data.sc <- SPsimSeq(n.sim = 1, s.data = scNGP.data2,
sim.data.sc <- SPsimSeq(n.sim = 1, s.data = scNGP.data,
group = treatment, n.genes = 5000, batch.config = 1,
group.config = c(0.5, 0.5), tot.samples = 100,
pDE = 0.1, lfc.thrld = 0.5, model.zero.prob = TRUE,
Expand All @@ -205,7 +204,7 @@ points(quantile(cor.vec.Y0, seq(0, 1, 0.001)), quantile(cor.vec.Y1, seq(0, 1, 0.
# and fraction of zero counts per gene
# normalize counts for comparison
Y0.log.cpm <- log2(edgeR::cpm(counts(scNGP.data2))+1)
Y0.log.cpm <- log2(edgeR::cpm(counts(scNGP.data))+1)
Y1.log.cpm <- log2(edgeR::cpm(counts(sim.data.sc1))+1)
Y0.log.cpm <- Y0.log.cpm[rowMeans(Y0.log.cpm>0)>=0.1, ]
Y1.log.cpm <- Y1.log.cpm[rowMeans(Y1.log.cpm>0)>=0.1, ]
Expand All @@ -216,7 +215,7 @@ rowCVs <- function(X){apply(X, 1, function(x) sd(x, na.rm=TRUE)/mean(x, na.rm=TR
rowZeroFrac <- function(X){apply(X, 1, function(x) mean(x==0, na.rm=TRUE))}
par(mfrow=c(1, 3))
boxplot(list(real.data=colSums(counts(scNGP.data2)),
boxplot(list(real.data=colSums(counts(scNGP.data)),
simulated.data=colData(sim.data.sc1)$sim.Lib.Size),
main="library size")
boxplot(list(real.data=rowMeans(Y0.log.cpm),
Expand Down Expand Up @@ -324,4 +323,4 @@ sessionInfo()

# References

References missing?
\printbibliography

0 comments on commit 864f8d0

Please sign in to comment.