-
Notifications
You must be signed in to change notification settings - Fork 46
/
Copy pathtransform_counts_demo.Rmd
executable file
·183 lines (142 loc) · 7.07 KB
/
transform_counts_demo.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
---
title: "Process sc-RNAseq read counts for scLVM"
author: "Florian Buettner, Kedar N. Natarajan, F. Paolo Casale, Valentina Proserpio, Antonio Scialdone, Fabian J. Theis, Sarah A. Teichmann, John C. Marioni and Oliver Stegle"
#date: "7. Oktober 2014"
output: html_document
---
In order to run scLVM in python as outlined in the demo notebook, it is necessary to pre-process the data which is typically done in R. In the following exmaple script we illustrate how this pre-processing was performed for the T-cell data.
First, we need to load some required packages.
```{r,message=FALSE}
library(statmod)
library(gdata)
library(genefilter)
library(EBImage)
library(rhdf5)
library(DESeq)
library(statmod)
library(hom.Hs.inp.db)
library(AnnotationDbi)
library(org.Mm.eg.db)
```
Now, we load two data frames containing, one containing a list of cell cycle genes annotated in Cyclebase, the other one containing mapped read counts for the 81 T-cells described in the paper.
```{r}
load('./data_Tcells.Rdata')
```
Next, we look for the spike-ins which we then use to normalise the data. We omitted the normalization for cell size as proposed in Brennecke et al. 2013, because the computational correction by scLVM yielded much better results . This is likely explained by noting that cell size and cell cycle are correlated, thus the normalization proposed by Brennecke et al. reduces the amount of information available for inferring cell-cell correlations due to cell cycle
```{r}
dataMouse[ 1:5, 1:4 ]
geneTypes <- factor( c( ENSM="ENSM", ERCC="ERCC" )[
substr( rownames(dataMouse), 1, 4 ) ] )
#2. calculate normalisation for counts
countsMmus <- dataMouse[ which( geneTypes=="ENSM" ), ]
countsERCC <- dataMouse[ which( geneTypes=="ERCC" ), ]
lengthsMmus <- dataMouse[ which( geneTypes=="ENSM" ), 1 ]
lengthsERCC <- dataMouse[ which( geneTypes=="ERCC" ), 1 ]
sfERCC <- estimateSizeFactorsForMatrix( countsERCC )
sfMmus <- sfERCC #also use ERCC size factor for endogenous genes
#normalise read counts
nCountsERCC <- t( t(countsERCC) / sfERCC )
nCountsMmus <- t( t(countsMmus) / sfMmus )
```
The next step will be fitting the technical noise model proposed by Brennecke et al 2013.
```{r}
#normalized counts (brennecke)
meansMmus <- rowMeans( nCountsMmus )
varsMmus <- rowVars( nCountsMmus )
cv2Mmus <- varsMmus / meansMmus^2
meansERCC <- rowMeans( nCountsERCC )
varsERCC <- rowVars( nCountsERCC )
cv2ERCC <- varsERCC / meansERCC^2
#Do fitting of technical noise
#normalised counts (with size factor)
minMeanForFitA <- unname( quantile( meansERCC[ which( cv2ERCC > .3 ) ], .8 ) )
useForFitA <- meansERCC >= minMeanForFitA
fitA <- glmgam.fit( cbind( a0 = 1, a1tilde = 1/meansERCC[useForFitA] ),
cv2ERCC[useForFitA] )
#plot fit
plot( meansERCC, cv2ERCC, log="xy", col=1+useForFitA)
xg <- 10^seq( -3, 5, length.out=100 )
lines( xg, coefficients(fitA)["a0"] + coefficients(fitA)["a1tilde"]/xg )
segments( meansERCC[useForFitA], cv2ERCC[useForFitA],
meansERCC[useForFitA], fitA$fitted.values, col="gray" )
```
Once we have fitted the technical noise model we can test for genes where the biological variation is significantly larger than technical noise. Here, we follow Brennecke et al. 2013.
```{r}
#perfrom statistical test
minBiolDisp <- .5^2
xi <- mean( 1 / sfERCC )
m <- ncol(countsMmus)
psia1thetaA <- mean( 1 / sfERCC ) +
( coefficients(fitA)["a1tilde"] - xi ) * mean( sfERCC / sfMmus )
cv2thA <- coefficients(fitA)["a0"] + minBiolDisp + coefficients(fitA)["a0"] * minBiolDisp
testDenomA <- ( meansMmus * psia1thetaA + meansMmus^2 * cv2thA ) / ( 1 + cv2thA/m )
pA <- 1 - pchisq( varsMmus * (m-1) / testDenomA, m-1 )
padjA <- p.adjust( pA, "BH" )
table( padjA < .1 )
#plot mean/cv2 relationship and
plot( meansMmus, cv2Mmus, log="xy", col=1+(padjA<0.1),ylim=c(0.1,95), xlab='Mean Counts', ylab='CV2 Counts')
xg <- 10^seq( -3, 5, length.out=100 )
lines( xg, coefficients(fitA)["a0"] + coefficients(fitA)["a1tilde"]/xg,lwd=2,col='blue' )
points(meansERCC, cv2ERCC,col='blue',pch=15,cex=1.1)
#points(meansMmus[cc_gene_indices], cv2Mmus[cc_gene_indices],col=rgb(0,255,0,100,maxColorValue=255),pch=2,cex=0.75)
#points(meansMmus[ccCBall_gene_indices], cv2Mmus[ccCBall_gene_indices],col=rgb(0,255,0,20,maxColorValue=255),pch=2,cex=0.8)
legend('bottomleft',c('T-cells (padj >= 0.1)','T-cells (padj<0.1)','ERCC','Cell Cycle genes'),pch=c(1,1,15),col=c('black','red','blue','green'),cex=0.7)
```
So far we have fitted the noise model in the count space. As scLVM works in log space, we need to transform the counts and the variance to the log space.
```{r}
#4. Transform to log-space and propagate error
eps=1
LogNcountsMmus=log10(nCountsMmus+eps)
dLogNcountsMmus=1/(meansMmus+eps)
var_techMmus=(coefficients(fitA)["a0"] + coefficients(fitA)["a1tilde"]/meansMmus)*meansMmus^2
LogVar_techMmus=(dLogNcountsMmus*sqrt(var_techMmus))^2 #error propagation
```
In order to fit the latent cell cycle factor we need to retrieve cell cycle genes annotated in Cyclebase and GO. First, we process the Cyclebase data and get the mouse homologs.
```{r, message=FALSE}
#gene names in the T-cell data.
gene_names=rownames(nCountsMmus)
gene_names_het=gene_names[which(padjA<0.1)]
#all Cycle base genes homologs (top 600 genes)
hu2musAll=inpIDMapper(dataCB[1:600,3],'HOMSA','MUSMU',srcIDType='ENSEMBL',destIDType='ENSEMBL')
ccCBall_gene_indices=match(unlist(hu2musAll),rownames(nCountsMmus))
#lenCB=unlist(((lapply(hu2musAll,function(x){length(x)}))))
```
In addition we retrieve cell cycle genes from GO:
```{r}
#get cell cycle genes from GO
xxGO <- as.list(org.Mm.egGO2EG)
cell_cycleEG <-unlist(xxGO['GO:0007049'])
#get ENSEMBLE ids
x <- org.Mm.egENSEMBL
mapped_genes <- mappedkeys(x)
xxE <- as.list(x[mapped_genes])
ens_ids_cc<-unlist(xxE[cell_cycleEG])
cc_gene_indices <- na.omit(match(ens_ids_cc, rownames(dataMouse)))
```
Now we just need to convert ENSEMBL IDs to gene symbols and then save everything as hdf5 file.
```{r}
#ensemble IDs to gene symbols
x <- org.Mm.egSYMBOL
# Get the gene symbol that are mapped to an entrez gene identifiers
mapped_genes <- mappedkeys(x)
# Convert to a list
xx <- as.list(x[mapped_genes])
xxenseg <- as.list(org.Mm.egENSEMBL2EG)
gene_syms=unlist(xx[unlist(xxenseg[gene_names])])
gene_names_list<-(lapply(xxenseg[gene_names],function(x){if(is.null(x)){x=NA}else{x=x[1]}}))
sym_names=unlist(lapply(xx[unlist(gene_names_list)],function(x){if(is.null(x)){x=NA}else{x=x[1]}}))
sym_names[is.na(sym_names)]=gene_names[is.na(sym_names)]
sym_names_het=sym_names[which(padjA<0.1)] #gene symbols of variable genes
```
Rename a few variables and save data. The saved hdf5 file can then be used in scLVM s illustrated in the ipython notebook.
```{r}
#rename a few variables...
cellcyclegenes <- ens_ids_cc
cellcyclegenes_filter <- cc_gene_indices
cell_names <- colnames(nCountsMmus)
Y <- nCountsMmus
genes_heterogen <- (padjA<0.1)*1
countsERCC_mat=as.matrix(countsERCC * 1)
countsMmus_mat = as.matrix(countsMmus * 1)
h5save(ccCBall_gene_indices,gene_names,sym_names,sym_names_het,cellcyclegenes_filter,cellcyclegenes,cell_names,nCountsMmus,genes_heterogen,LogVar_techMmus,LogNcountsMmus,countsMmus_mat,sfERCC,countsERCC_mat,file='normCountsMMus_final_test.h5f')
```