-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata preprocessing.rmd
201 lines (167 loc) · 6.25 KB
/
data preprocessing.rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
---
title: "RNA-Seq WGCNA"
output: html_document
editor_options:
  chunk_output_type: console
---
```{r setup, include=FALSE}
library(WGCNA)
library(DESeq2)
library(tidyverse); library(limma)
```
```{r Data Preprocessing }
# Load the raw count table; the `target_id` column supplies the row names.
counts_ALL <- read.csv("combined_All_counts.csv")
rownames(counts_ALL) <- counts_ALL$target_id
# Drop the first column so that only count columns remain
# (assumes `target_id` is the first column of the CSV -- TODO confirm).
counts_ALL <- counts_ALL[, -1]

# Keep a feature only when fewer than 18 samples have a count below 100.
# NOTE(review): 18 is presumably 90% of the 20 sample columns indexed below;
# revisit this constant if the number of samples changes.
filtered.counts <- counts_ALL[rowSums(counts_ALL < 100) < 18, ]

# Split the samples into organ types by column position.
leaf <- filtered.counts[, c(3, 7, 11)]
stem <- filtered.counts[, c(1, 5, 9)]
root <- filtered.counts[, c(4, 8, 12, 15, 16, 17, 18)]
flower <- filtered.counts[, c(2, 6, 10, 13, 14, 19, 20)]

# Normalise each organ with limma's voom() and transpose the resulting
# expression matrix ($E), so the columns of each count subset become rows.
leafvoom <- t(voom(leaf)$E)
stemvoom <- t(voom(stem)$E)
rootvoom <- t(voom(root)$E)
flowervoom <- t(voom(flower)$E)

# Checks for genes and samples with too many missing values
# (disabled: `datExpr0` is not created anywhere in this document)
# gsg = goodSamplesGenes(datExpr0)
# gsg$allOK

# Persist the filtered counts and the per-organ normalised matrices.
# `datExpr0` used to be listed here although it is never defined, which made
# save() stop with "object 'datExpr0' not found"; it is therefore left out.
save(
  filtered.counts, leafvoom, stemvoom, rootvoom, flowervoom,
  file = "dataInput.RData"
)
```
```{r}
# Four organ-specific expression sets are analysed side by side.
setLabels <- c("Leaf", "Stem", "Root", "Flower")
# The short labels used on plots are the same as the full labels.
shortLabels <- setLabels
nSets <- 4

# Build the multi-set structure: one list(data = <data frame>) per organ, in
# the same order as `setLabels`. Row and column names are copied explicitly
# from each voom matrix.
multiExpr <- lapply(
  list(leafvoom, stemvoom, rootvoom, flowervoom),
  function(voomMatrix) {
    exprFrame <- as.data.frame(voomMatrix)
    names(exprFrame) <- colnames(voomMatrix)
    rownames(exprFrame) <- rownames(voomMatrix)
    list(data = exprFrame)
  }
)

# Check that the data has the format expected by the WGCNA functions that
# operate on multiple sets.
exprSize <- checkSets(multiExpr)
```
# Cluster the samples (gene clustering comes later)
The main purpose is to check for outlier samples.
```{r}
# One average-linkage hierarchical clustering of the samples per set, built
# from the distances between the rows of that set's expression data.
sampleTrees <- lapply(seq_len(nSets), function(set) {
  hclust(dist(multiExpr[[set]]$data), method = "average")
})
```
```{r}
# Write the sample dendrograms of every set to a PDF file.
# pdf() cannot open its file when the target directory is missing, so make
# sure "Plots" exists first (this is a no-op when it is already there).
dir.create("Plots", showWarnings = FALSE)
pdf(file = "Plots/SampleClustering.pdf", width = 12, height = 12)
# Two dendrograms per page, stacked vertically, with no bottom margin.
par(mfrow = c(2, 1))
par(mar = c(0, 4, 2, 0))
for (set in seq_len(nSets)) {
  plot(
    sampleTrees[[set]],
    main = paste("Sample clustering on all genes in", setLabels[set]),
    xlab = "", sub = "", cex = 0.7
  )
}
dev.off()
```
# Plot to see what soft-thresholding powers would be appropriate
```{r}
# Candidate soft-thresholding powers to evaluate.
powers <- c(seq(4, 10, by = 1), seq(12, 20, by = 2))

# One entry per set; `data` holds the second element of the list returned by
# pickSoftThreshold(), i.e. the table of fit statistics per power.
powerTables <- vector(mode = "list", length = nSets)
for (set in seq_len(nSets)) {
  powerTables[[set]] <- list(
    data = pickSoftThreshold(
      multiExpr[[set]]$data,
      powerVector = powers,
      verbose = 2
    )[[2]]
  )
}
collectGarbage()

# Plot settings: one colour per set, and the columns of the fit tables that
# are shown (one panel per entry of `plotCols`).
colors <- c("black", "red", "blue", "green")
plotCols <- c(2, 5, 6, 7)
colNames <- c(
  "Scale Free Topology Model Fit", "Mean connectivity",
  "Median connectivity", "Max connectivity"
)

# Values drawn in panel `col` for one fit table. The first panel shows the
# fit in column 2 signed by the negated sign of column 3; every other panel
# shows the raw column selected by `plotCols`.
plotValues <- function(fitTable, col) {
  if (col == 1) {
    -sign(fitTable[, 3]) * fitTable[, 2]
  } else {
    fitTable[, plotCols[col]]
  }
}

# Per-panel y-limits across all sets. They are computed from the values that
# are actually plotted; previously the first panel's limits came from the
# unsigned column 2, so negative signed values could fall outside the axis.
ylim <- matrix(NA_real_, nrow = 2, ncol = length(plotCols))
for (col in seq_along(plotCols)) {
  panelValues <- unlist(
    lapply(powerTables, function(tab) plotValues(tab$data, col))
  )
  ylim[, col] <- range(panelValues, na.rm = TRUE)
}

# Plot the quantities in the chosen columns against the soft-thresholding
# power. Output goes straight to the PDF device, so no on-screen window is
# opened beforehand; the "Plots" directory is created if it is missing.
dir.create("Plots", showWarnings = FALSE)
pdf(file = "Plots/scaleFreeAnalysis.pdf", width = 8, height = 6)
par(mfcol = c(2, 2))
par(mar = c(4.2, 4.2, 2.2, 0.5))
cex1 <- 0.7
for (col in seq_along(plotCols)) {
  for (set in seq_len(nSets)) {
    fitTable <- powerTables[[set]]$data
    yValues <- plotValues(fitTable, col)
    if (set == 1) {
      # The first set only establishes the axes (type = "n" draws no points).
      plot(
        fitTable[, 1], yValues,
        xlab = "Soft Threshold (power)", ylab = colNames[col], type = "n",
        ylim = ylim[, col], main = colNames[col]
      )
      addGrid()
    }
    # Each point is drawn as the power it corresponds to, coloured by set.
    text(fitTable[, 1], yValues, labels = powers, cex = cex1, col = colors[set])
  }
  # One legend per panel, drawn after all sets have been added.
  legend(
    if (col == 1) "bottomright" else "topright",
    legend = setLabels, col = colors, pch = 20
  )
}
dev.off()
```
Leaf and stem do not have enough samples (three each). Try with just flower and root.
```{r}
# Two-set version of the expression data: root first, then flower. Each
# element is list(data = <data frame>) with row and column names copied
# explicitly from the corresponding voom matrix.
multiExprFR <- lapply(
  list(rootvoom, flowervoom),
  function(voomMatrix) {
    exprFrame <- as.data.frame(voomMatrix)
    names(exprFrame) <- colnames(voomMatrix)
    rownames(exprFrame) <- rownames(voomMatrix)
    list(data = exprFrame)
  }
)

# Store the root/flower structure on disk.
save(list = "multiExprFR", file = "multiExprFR.RData")
```
# Build the modules
```{r}
# Consensus module detection across the root and flower sets.
# The result is kept in `net`.
net <- blockwiseConsensusModules(
  multiExprFR,
  # network construction
  power = 10,
  # module detection
  deepSplit = 2,
  minModuleSize = 30,
  pamRespectsDendro = FALSE,
  minKMEtoStay = 0,
  # module merging and labelling
  mergeCutHeight = 0.25,
  numericLabels = TRUE,
  # output options
  saveTOMs = TRUE,
  verbose = 5
)

# Tabulate how many genes carry each module label.
table(net$colors)
```
# Important info from previous step output
```{r}
# Pull the pieces of the module-detection result that are used further on.
moduleLabels <- net[["colors"]]
# Colour names corresponding to the numeric module labels.
moduleColors <- labels2colors(moduleLabels)
consMEs <- net[["multiMEs"]]
# Dendrogram of the first block only.
consTree <- net[["dendrograms"]][[1]]
```
# Plot the first block only (~5000 genes)
```{r}
# Draw the gene dendrogram of the first block with its module colours.
# Output goes straight to the PDF device, so no on-screen window is opened;
# the "Plots" directory is created if it is missing.
dir.create("Plots", showWarnings = FALSE)
pdf(file = "Plots/ConsensusDendrogram-auto.pdf", width = 8, height = 6)
plotDendroAndColors(
  dendro = consTree,
  # Select the colours of exactly the genes that belong to block 1. The
  # block's gene indices are taken from the module-detection result instead
  # of assuming that the block consists of the first 4999 genes.
  colors = moduleColors[net$blockGenes[[1]]],
  groupLabels = "Module colors",
  dendroLabels = FALSE, hang = 0.03,
  addGuide = TRUE, guideHang = 0.05,
  main = "Consensus gene dendrogram and module colors"
)
dev.off()
```
# Save data for future use
```{r}
# Write the consensus eigengenes, module labels, module colours and the
# first-block dendrogram to a single .RData file.
save(
  list = c("consMEs", "moduleLabels", "moduleColors", "consTree"),
  file = "Consensus-NetworkConstruction-auto.RData"
)
```