-
Notifications
You must be signed in to change notification settings - Fork 7
/
FPR_mocks_eps_usoskin.Rmd
169 lines (138 loc) · 4.67 KB
/
FPR_mocks_eps_usoskin.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
---
title: "Case study on Usoskin dataset - FPR on mocks"
author: "Fanny Perraudeau"
date: "`r Sys.Date()`"
output:
  html_document:
    fig_height: 7
    fig_width: 7
    toc: yes
    code_folding: hide
    toc_float: yes
---
```{r options, echo=FALSE, results="hide", message=FALSE, error=FALSE, include=FALSE, autodep=TRUE}
# knitr defaults shared by the chunks of this document: centred figures,
# caching switched on, messages hidden and warnings shown.
knitr::opts_chunk$set(
  fig.align = "center",
  cache = TRUE,
  error = FALSE,
  message = FALSE,
  warning = TRUE
)
library(edgeR)
library(zinbwave)
```
We want to evaluate the impact of the ridge penalty ($\epsilon$) of the ZINB-WaVE model on the false positive rate (FPR) for the mock comparisons from the Usoskin dataset.
# Data
The dataset comes from the supplementary data accompanying the original paper, downloaded from http://linnarssonlab.org/drg/.
```{r data}
# Folder holding the Usoskin data files, relative to this document.
path <- '../../datasets/usoskin/'

# The .RData file is expected to define `eset`, which is used right below.
load(paste0(path, 'esetUsoskin.RData'))

# Keep the genes that have a non-zero count in at least 20 cells.
n_expressing_cells <- rowSums(exprs(eset) > 0)
eset <- eset[n_expressing_cells >= 20, ]

# Switch the expression values to integer storage, one column at a time.
to_integer <- function(x) {
  storage.mode(x) <- 'integer'
  x
}
exprs(eset) <- apply(exprs(eset), 2, to_integer)

# Table whose rows are later used as column indices into `eset`.
file <- paste0(path, "subsetMatrixUsoskinFPR_randomCellTypes.txt")
subsets <- read.table(file)
```
# Influence of epsilon
## Compute ZINB-WaVE observational weights
We add the batch (picking session) as a covariate in the ZINB-WaVE model.
```{r dataEloop}
# Work on one mock comparison: row `i` of `subsets` gives the columns of
# `eset` to keep.
i <- 1
eLoop <- eset[, as.numeric(subsets[i, ])]

# Report, then drop, the genes whose counts sum to zero over these columns.
is_all_zero <- rowSums(exprs(eLoop)) == 0
cat('Removing ', sum(is_all_zero), " genes with only 0's")
eLoop <- eLoop[!is_all_zero, ]

# Mock grouping: two conditions of 45 samples each, and within each condition
# three picking sessions of 15 samples (assumes the columns selected above
# follow this order -- TODO confirm against how `subsets` was generated).
condition <- factor(rep(c("A", "B"), each = 45))
pickingSession <- factor(
  rep(rep(c("Cold", "rt1", "rt2"), each = 15), times = 2)
)
pData(eLoop)$condition <- condition
pData(eLoop)$pickingSession <- pickingSession

# Design matrix: intercept, condition effect, picking-session effects.
design <- model.matrix(~ condition + pickingSession)

# Penalty values to compare: 10^0, 10^2, ..., 10^14.
epsVec <- 10^seq(from = 0, to = 14, by = 2)
```
```{r zinbesp,eval=FALSE}
library(BiocParallel)
library(doParallel)

# Register a doParallel backend with NCORES cores, then register a
# DoparParam with BiocParallel.
NCORES <- 2
registerDoParallel(NCORES)
register(DoparParam())

# Fit one ZINB-WaVE model per penalty value in `epsVec`, always on the same
# counts and design matrix.
fit_for_epsilon <- function(eps) {
  zinbFit(exprs(eLoop), X = design, epsilon = eps)
}
zinbList <- lapply(epsVec, fit_for_epsilon)

# Write the list of fits to disk.
save(zinbList, file = 'zinbList_epsilon.rda')
```
```{r}
# Load the stored fits; the file is expected to define `zinbList`.
load('zinbList_epsilon.rda')
# Keep one fit per value in `epsVec`. seq_along() yields an empty index when
# `epsVec` is empty, whereas 1:length() would yield c(1, 0).
zinbList <- zinbList[seq_along(epsVec)]
```
```{r pvals}
# For every fitted model, compute the observational weights of the counts
# stored in `eLoop`.
weightsList <- lapply(
  zinbList,
  function(zinb_fit) {
    computeObservationalWeights(zinb_fit, exprs(eLoop))
  }
)
```
## Compute p-values
We use edgeR with ZINB-WaVE posterior probabilities as observation weights.
```{r}
# For each set of observational weights, run a weighted edgeR analysis and
# keep the p-values of the test on coefficient 2 of `design` (the condition
# effect).
pvalsList <- lapply(weightsList, function(w) {
  d <- DGEList(exprs(eLoop))
  d <- edgeR::calcNormFactors(d)
  # Replace missing weights by 1, exactly as the BCV chunk does, so that the
  # p-values and the BCV plots are computed from the same weights.
  w[is.na(w)] <- 1
  d$weights <- w
  d <- estimateDisp(d, design)
  fit <- glmFit(d, design)
  lrt <- glmWeightedF(fit, coef = 2, independentFiltering = TRUE)
  lrt$table$PValue
})
```
## Biological coefficient of variation (BCV)
```{r mocksEspilonBCV}
# One BCV plot per penalty value, laid out on a 3 x 4 grid.
par(mfrow = c(3, 4))
# seq_along() avoids the c(1, 0) index that 1:length() produces on an empty
# list.
myplot <- lapply(seq_along(weightsList), function(i) {
  d <- DGEList(exprs(eLoop))
  d <- edgeR::calcNormFactors(d)
  w <- weightsList[[i]]
  # Missing weights are replaced by 1 before the dispersions are estimated.
  w[is.na(w)] <- 1
  d$weights <- w
  d <- estimateDisp(d, design)
  plotBCV(d, main = paste0('epsilon=', epsVec[i]), ylim = c(0, 6))
})
# Back to a single-panel layout.
par(mfrow = c(1, 1))
```
## Posterior probabilities
```{r mocksEspilonWeights}
# One histogram per penalty value of the weights given to the zero counts,
# laid out on a 3 x 4 grid with a common y-axis range.
par(mfrow = c(3, 4))
# seq_along() avoids the c(1, 0) index that 1:length() produces on an empty
# list.
hh <- lapply(seq_along(weightsList), function(i) {
  hist(
    weightsList[[i]][exprs(eLoop) == 0],
    main = paste0('epsilon=', epsVec[i]),
    ylim = c(0, 5e5)
  )
})
# Back to a single-panel layout.
par(mfrow = c(1, 1))
```
## FPR
```{r fdrEpsUsoskinEps}
# Fraction of p-values at or below 0.05 for each penalty value. vapply()
# guarantees a numeric vector, whereas sapply() changes its return type with
# the input (e.g. a list for empty input).
fpr <- vapply(pvalsList, function(x) mean(x <= 0.05), numeric(1))
print(length(fpr))
plot(
  log10(epsVec), fpr,
  main = '', type = 'o',
  xlab = expression(paste(epsilon, ' (log10)')), ylab = 'PCER',
  cex.lab = 1.5, cex.axis = 1.5
)
# Reference line at the 0.05 threshold used above.
abline(h = 0.05, col = 'red')
```
## Histogram of p-values.
We expect the p-values to be uniformly distributed because the two groups contain cells that were randomly sampled from all the cells; therefore, the true number of DE genes is zero. When $\epsilon < 10^6$, the p-values are not uniformly distributed.
```{r fdrEpsUsoskinPval}
# One p-value histogram per penalty value, laid out on a 3 x 3 grid with a
# common y-axis range.
par(mfrow = c(3, 3))
# Iterate over `pvalsList` itself (the list being indexed) rather than over
# `weightsList`, and use seq_along() so that an empty list gives no iteration.
hh <- lapply(seq_along(pvalsList), function(i) {
  hist(
    pvalsList[[i]],
    main = paste0('epsilon=', epsVec[i]),
    ylim = c(0, 2000),
    cex.lab = 1.5, cex.axis = 1.5, cex.main = 2,
    xlab = 'p-values'
  )
})
# Back to a single-panel layout.
par(mfrow = c(1, 1))
```
## Plot paper
```{r fdrEpsUsoskin,fig.width=10,fig.height=5}
# Figure layout: panel 1 fills the left 3 x 3 block of cells, panels 2 to 10
# fill the right 3 x 3 block row by row.
par(mar = c(4.1, 4.25, 3, 1), bty = "l")
layout(cbind(matrix(1, 3, 3), matrix(2:10, 3, 3, byrow = TRUE)))

# pcer = f(eps)
# vapply() guarantees a numeric vector, whereas sapply() changes its return
# type with the input.
fpr <- vapply(pvalsList, function(x) mean(x <= 0.05), numeric(1))
print(length(fpr))
plot(
  log10(epsVec), fpr,
  main = '', type = 'o',
  xlab = expression(paste(epsilon, ' (log10)')), ylab = 'PCER',
  cex.lab = 1.5, cex.axis = 1.5, cex.main = 1.5
)
# Reference line at the 0.05 threshold used above.
abline(h = 0.05, col = 'red')
# Panel label.
mtext("a", at = -2, font = 2, padj = -1, cex = 4/3)

# hist pvalues
# seq_along() avoids the c(1, 0) index that 1:length() produces on an empty
# vector.
hh <- lapply(seq_along(epsVec), function(i) {
  hist(
    pvalsList[[i]],
    main = paste0('epsilon=', epsVec[i]),
    ylim = c(0, 2000),
    cex.lab = 1.5, cex.axis = 1.5, cex.main = 1.5,
    xlab = 'p-values'
  )
  # Only the first histogram carries the panel label.
  if (i == 1) mtext("b", at = -1, padj = -1, font = 2, cex = 4/3)
})
```
## Conclusion
$\epsilon>10^6$ seems to be reasonable.
# sessionInfo()
```{r}
# Print the R session details (R version and loaded packages) for
# reproducibility.
sessionInfo()
```