Skip to content

Commit

Permalink
Reducing data size
Browse files Browse the repository at this point in the history
  • Loading branch information
sthawinke committed Mar 27, 2020
1 parent 14eb5cf commit 95c169c
Show file tree
Hide file tree
Showing 13 changed files with 20 additions and 120 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ Imports:
WGCNA,
limma,
mvtnorm,
phyloseq ,
phyloseq,
utils
biocViews: GeneExpression, RNASeq, SingleCell, Sequencing, DNASeq
RoxygenNote: 6.1.1
Expand Down
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

export(SPsimSeq)
export(configExperiment)
export(dSPsimSeq)
export(evaluateDensities)
import(methods)
importFrom(Hmisc,cut2)
importFrom(SingleCellExperiment,SingleCellExperiment)
Expand Down
11 changes: 4 additions & 7 deletions R/SPsimSeq.R
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@
#' data("zhang.data.sub")
#'
#' # filter genes with sufficient expression (important step to avoid bugs)
#' zhang.counts <- zhang.data.sub$counts[rowSums(zhang.data.sub$counts > 0)>=5, ]
#' zhang.counts <- zhang.data.sub$counts
#' MYCN.status <- zhang.data.sub$MYCN.status
#'
#' # We simulate only a single dataset (n.sim = 1) with the following properties
Expand All @@ -100,13 +100,11 @@
#'
#' # simulate data
#' set.seed(6452)
#' zhang.counts2 <- zhang.counts[sample(nrow(zhang.counts), 2000), ]
#' sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts2,
#' group = MYCN.status, n.genes = 2000, batch.config = 1,
#' group.config = c(0.5, 0.5), tot.samples = 20,
#' pDE = 0.1, lfc.thrld = 0.5, result.format = "list")
#'
#'
#' head(sim.data.bulk$counts[[1]][, seq_len(5)]) # count data
#' head(sim.data.bulk$colData) # sample info
#' head(sim.data.bulk$rowData) # gene info
Expand All @@ -131,14 +129,13 @@
#' data("scNGP.data")
#'
#' # filter genes with sufficient expression (important step to avoid bugs)
#' scNGP.data2 <- scNGP.data[rowSums(counts(scNGP.data) > 0)>=5, ]
#' treatment <- ifelse(scNGP.data2$characteristics..treatment=="nutlin",2,1)
#' treatment <- ifelse(scNGP.data$characteristics..treatment=="nutlin",2,1)
#'
#' set.seed(654321)
#' scNGP.data2 <- scNGP.data2[sample(nrow(scNGP.data2), 2000), ]
#' scNGP.data <- scNGP.data[sample(nrow(scNGP.data), 2000), ]
#'
#' # simulate data (we simulate here only a single dataset, n.sim = 1)
#' sim.data.sc <- SPsimSeq(n.sim = 1, s.data = scNGP.data2, group = treatment,
#' sim.data.sc <- SPsimSeq(n.sim = 1, s.data = scNGP.data, group = treatment,
#' n.genes = 2000, batch.config = 1,
#' group.config = c(0.5, 0.5), tot.samples = 100,
#' pDE = 0.1, lfc.thrld = 0.5, model.zero.prob = TRUE,
Expand Down
2 changes: 0 additions & 2 deletions R/configExperiment.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
#' and the experiment configuration
#' @export
#' @examples
#' \donttest{
#' batch = sample(LETTERS[1:3], 20, replace = TRUE)
#' group = sample(1:3, 20, replace = TRUE)
#' #---- a design with a total of 10 samples/cells from 1 batch and 1 group
Expand All @@ -34,7 +33,6 @@
#' # respectively.
#' configExperiment(batch.config = c(5/30, 10/30, 15/30), group.config = c(0.5, 0.5),
#' tot.samples = 30, batch = batch, group = group)
#' }
configExperiment <- function(batch.config, group.config, tot.samples, batch, group){
#Sort, such that largest groups and largest batches match with simulation
batch.config = sort(batch.config, decreasing = TRUE)
Expand Down
45 changes: 0 additions & 45 deletions R/dSPsimSeq.R

This file was deleted.

2 changes: 1 addition & 1 deletion README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ BioConductor installation

```{r biocinstall, eval = FALSE}
library(BiocManager)
BiocManager::install("SPsimSeq", update = FALSE)
BiocManager::install("SPsimSeq")
```

```{r loadspsimseqpackage}
Expand Down
2 changes: 0 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,6 @@ sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts2,

## Estimating featurewise correlations ...

##

## Selecting candidate DE genes ...

## Note: The number of DE genes detected in the source data is 79 and the number of DE genes required to be included in the simulated data is 200. Therefore, candidiate DE genes are sampled with replacement.
Expand Down
Binary file modified data/scNGP.data.rda
Binary file not shown.
Binary file modified data/zhang.data.sub.rda
Binary file not shown.
11 changes: 4 additions & 7 deletions man/SPsimSeq.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 0 additions & 2 deletions man/configExperiment.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

42 changes: 0 additions & 42 deletions man/dSPsimSeq.Rd

This file was deleted.

19 changes: 9 additions & 10 deletions vignettes/SPsimSeq.Rmd
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
---
title: "SPsimSeq: semi-parametric simulation for bulk and single cell RNA-seq data"
author: "Alemu Takele Assefa and Stijn Hawinkel"
date: "`r Sys.Date()`"
output:
rmarkdown::html_vignette:
toc: true
Expand Down Expand Up @@ -51,7 +54,6 @@ library(SPsimSeq)
MYCN.status <- zhang.data.sub$MYCN.status
set.seed(6452)
zhang.counts2 <- zhang.counts[sample(nrow(zhang.counts), 3000), ]
# We simulate only a single dataset (n.sim = 1) with the following properties
# - 3000 genes ( n.genes = 3000)
# - 172 samples (tot.samples = 172) -- equal to the source data
Expand All @@ -65,7 +67,7 @@ library(SPsimSeq)
# estimation (model.zero.prob = FALSE)
# simulate data
sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts2,
sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts,
group = MYCN.status, n.genes = 3000, batch.config = 1,
group.config = c(0.5, 0.5), tot.samples = 172,
pDE = 0.1, lfc.thrld = 0.5, result.format = "list")
Expand Down Expand Up @@ -181,14 +183,11 @@ points(quantile(cor.vec.Y0, seq(0, 1, 0.001)), quantile(cor.vec.Y1, seq(0, 1, 0.
data("scNGP.data")
# filter genes with sufficient expression (important step to avoid bugs)
scNGP.data2 <- scNGP.data[rowSums(counts(scNGP.data) > 0)>=5, ]
treatment <- ifelse(scNGP.data2$characteristics..treatment=="nutlin",2,1)
treatment <- ifelse(scNGP.data$characteristics..treatment=="nutlin",2,1)
set.seed(654321)
scNGP.data2 <- scNGP.data2[sample(nrow(scNGP.data2), 5000), ]
# simulate data (we simulate here only a single dataset, n.sim = 1)
sim.data.sc <- SPsimSeq(n.sim = 1, s.data = scNGP.data2,
sim.data.sc <- SPsimSeq(n.sim = 1, s.data = scNGP.data,
group = treatment, n.genes = 5000, batch.config = 1,
group.config = c(0.5, 0.5), tot.samples = 100,
pDE = 0.1, lfc.thrld = 0.5, model.zero.prob = TRUE,
Expand All @@ -205,7 +204,7 @@ points(quantile(cor.vec.Y0, seq(0, 1, 0.001)), quantile(cor.vec.Y1, seq(0, 1, 0.
# and fraction of zero counts per gene
# normalize counts for comparison
Y0.log.cpm <- log2(edgeR::cpm(counts(scNGP.data2))+1)
Y0.log.cpm <- log2(edgeR::cpm(counts(scNGP.data))+1)
Y1.log.cpm <- log2(edgeR::cpm(counts(sim.data.sc1))+1)
Y0.log.cpm <- Y0.log.cpm[rowMeans(Y0.log.cpm>0)>=0.1, ]
Y1.log.cpm <- Y1.log.cpm[rowMeans(Y1.log.cpm>0)>=0.1, ]
Expand All @@ -216,7 +215,7 @@ rowCVs <- function(X){apply(X, 1, function(x) sd(x, na.rm=TRUE)/mean(x, na.rm=TR
rowZeroFrac <- function(X){apply(X, 1, function(x) mean(x==0, na.rm=TRUE))}
par(mfrow=c(1, 3))
boxplot(list(real.data=colSums(counts(scNGP.data2)),
boxplot(list(real.data=colSums(counts(scNGP.data)),
simulated.data=colData(sim.data.sc1)$sim.Lib.Size),
main="library size")
boxplot(list(real.data=rowMeans(Y0.log.cpm),
Expand Down Expand Up @@ -324,4 +323,4 @@ sessionInfo()

# References

References missing?
\printbibliography

0 comments on commit 95c169c

Please sign in to comment.