data preprocessing.rmd

---
title: "RNA-Seq WGCNA"
output: html_document
editor_options: 
  chunk_output_type: console
---

```{r setup, include=FALSE}
library(WGCNA)
library(DESeq2)
library(tidyverse); library(limma)
```

```{r Data Preprocessing }
# Load the combined count table and use the `target_id` column as row names.
# NOTE(review): assumes `target_id` is the first column and that every
# remaining column is one sample -- confirm against the CSV.
counts_ALL <- read.csv("combined_All_counts.csv")
rownames(counts_ALL) <- counts_ALL$target_id
counts_ALL <- counts_ALL[, -1]

# Drop low-count features: a feature is kept only when fewer than 18 samples
# have under 100 reads (18 is 90% of the 20 sample columns indexed below).
filtered.counts <- counts_ALL[rowSums(counts_ALL < 100) < 18, ]

# Split the sample columns by organ type (column positions in the count table).
leaf <- filtered.counts[, c(3, 7, 11)]
stem <- filtered.counts[, c(1, 5, 9)]
root <- filtered.counts[, c(4, 8, 12, 15, 16, 17, 18)]
flower <- filtered.counts[, c(2, 6, 10, 13, 14, 19, 20)]

# Normalize each organ separately with limma's voom() and keep its `E`
# component, transposed so that rows are samples and columns are features.
leafvoom <- t(voom(leaf)$E)
stemvoom <- t(voom(stem)$E)
rootvoom <- t(voom(root)$E)
flowervoom <- t(voom(flower)$E)

# Checks for genes and samples with too many missing values.
# Disabled: `datExpr0` is not created anywhere in the visible part of this file.
# gsg = goodSamplesGenes(datExpr0)
# gsg$allOK

# Save the filtered counts and the per-organ normalized matrices.
# `datExpr0` used to be listed here as well, but it is never defined, which
# makes save() stop with "object 'datExpr0' not found".
save(filtered.counts, leafvoom, stemvoom, rootvoom, flowervoom,
     file = "dataInput.RData")
```

```{r}
# Number of expression sets: one per organ type.
nSets <- 4
# Descriptive set names, used for labelling plots. Order matches `multiExpr`.
setLabels <- c("Leaf", "Stem", "Root", "Flower")
shortLabels <- setLabels

# Build the multi-set expression structure: a list with one entry per set,
# each entry holding that set's expression data frame in a `data` component.
multiExpr <- lapply(
  list(leafvoom, stemvoom, rootvoom, flowervoom),
  function(exprMat) {
    exprFrame <- as.data.frame(exprMat)
    # Carry the matrix dimnames over explicitly.
    names(exprFrame) <- colnames(exprMat)
    rownames(exprFrame) <- rownames(exprMat)
    list(data = exprFrame)
  }
)

# Verify that the structure has the format expected by multi-set functions.
exprSize <- checkSets(multiExpr)
```

# Cluster the samples (gene clustering comes later)
This is mainly a check for outlier samples.

```{r}
# One average-linkage hierarchical clustering tree per set, built from the
# pairwise distances between the rows of that set's expression data.
sampleTrees <- lapply(seq_len(nSets), function(set) {
  hclust(dist(multiExpr[[set]]$data), method = "average")
})
```

```{r}
# Write the sample dendrogram of every set to a PDF so that outlier samples
# can be spotted.
# Create the output directory first: pdf() cannot open a file inside a
# directory that does not exist. Nothing happens if it is already there.
dir.create("Plots", showWarnings = FALSE)
pdf(file = "Plots/SampleClustering.pdf", width = 12, height = 12)
# Two dendrograms per page, stacked vertically; further sets continue on the
# following pages of the PDF.
par(mfrow = c(2, 1))
par(mar = c(0, 4, 2, 0))
for (set in seq_len(nSets)) {
  plot(sampleTrees[[set]],
       main = paste("Sample clustering on all genes in", setLabels[set]),
       xlab = "", sub = "", cex = 0.7)
}
dev.off()
```

# Plot to see what soft-thresholding powers would be appropriate

```{r}
# Candidate soft-thresholding powers: 4 to 10 in steps of 1, then 12 to 20 in
# steps of 2.
powers <- c(seq(4, 10, by = 1), seq(12, 20, by = 2))

# Run the network topology analysis for each set and keep the second element
# of the pickSoftThreshold() result (the table of fit indices).
powerTables <- vector(mode = "list", length = nSets)
for (set in seq_len(nSets)) {
  powerTables[[set]] <- list(
    data = pickSoftThreshold(multiExpr[[set]]$data,
                             powerVector = powers, verbose = 2)[[2]]
  )
}
collectGarbage()

# One colour per set for the plots below.
colors <- c("black", "red", "blue", "green")
# Columns of the fit tables to plot, and the matching panel titles.
plotCols <- c(2, 5, 6, 7)
colNames <- c("Scale Free Topology Model Fit", "Mean connectivity",
              "Median connectivity", "Max connectivity")

# Common y-axis range per plotted column across all sets
# (row 1 = minimum, row 2 = maximum).
ylim <- matrix(NA, nrow = 2, ncol = length(plotCols))
for (set in seq_len(nSets)) {
  for (col in seq_along(plotCols)) {
    ylim[1, col] <- min(ylim[1, col], powerTables[[set]]$data[, plotCols[col]],
                        na.rm = TRUE)
    ylim[2, col] <- max(ylim[2, col], powerTables[[set]]$data[, plotCols[col]],
                        na.rm = TRUE)
  }
}

# Plot the chosen columns against the soft-thresholding power, one panel per
# column, straight into a PDF. No on-screen window is opened: the previous
# sizeGrWindow() call left an unused device open because pdf() took over
# immediately afterwards.
# Create the output directory first so pdf() does not fail when it is missing.
dir.create("Plots", showWarnings = FALSE)
pdf(file = "Plots/scaleFreeAnalysis.pdf", width = 8, height = 6)
par(mfcol = c(2, 2))
par(mar = c(4.2, 4.2, 2.2, 0.5))
cex1 <- 0.7
for (col in seq_along(plotCols)) {
  for (set in seq_len(nSets)) {
    fitTable <- powerTables[[set]]$data
    # Column 2 multiplied by the negated sign of column 3.
    signedFit <- -sign(fitTable[, 3]) * fitTable[, 2]
    if (set == 1) {
      # The first set opens an empty panel (type = "n"); the points of every
      # set are then added as text labels.
      plot(fitTable[, 1], signedFit,
           xlab = "Soft Threshold (power)", ylab = colNames[col], type = "n",
           ylim = ylim[, col], main = colNames[col])
      addGrid()
    }
    # The first panel shows the signed fit; the others show the raw column.
    yValues <- if (col == 1) signedFit else fitTable[, plotCols[col]]
    text(fitTable[, 1], yValues, labels = powers, cex = cex1, col = colors[set])
  }
  # Draw the legend once per panel, after all sets have been added.
  legend(if (col == 1) "bottomright" else "topright",
         legend = setLabels, col = colors, pch = 20)
}
dev.off()
```

Leaf and stem do not have enough samples (three each). Try again with only root and flower.

```{r}
# Two-set expression structure for the consensus analysis:
# set 1 = root, set 2 = flower.
multiExprFR <- lapply(
  list(rootvoom, flowervoom),
  function(exprMat) {
    exprFrame <- as.data.frame(exprMat)
    # Carry the matrix dimnames over explicitly.
    names(exprFrame) <- colnames(exprMat)
    rownames(exprFrame) <- rownames(exprMat)
    list(data = exprFrame)
  }
)

# Keep a copy on disk for later sessions.
save(multiExprFR, file = "multiExprFR.RData")
```

# Build the modules

```{r}
# Block-wise consensus module detection on the root and flower sets.
net <- blockwiseConsensusModules(
  multiExprFR,
  # Network construction
  power = 10,
  # Module detection on the dendrogram
  deepSplit = 2,
  minModuleSize = 30,
  pamRespectsDendro = FALSE,
  minKMEtoStay = 0,
  # Module merging and labelling
  mergeCutHeight = 0.25,
  numericLabels = TRUE,
  # Output and logging
  saveTOMs = TRUE,
  verbose = 5
)

# Number of genes per module label.
table(net$colors)
```

# Important info from previous step output

```{r}
# Pull the pieces needed downstream out of the consensus network result.
# Dendrogram of the first block only.
consTree <- net$dendrograms[[1]]
# Module eigengenes for each set.
consMEs <- net$multiMEs
# Numeric module labels, and the same labels converted to colour names.
moduleLabels <- net$colors
moduleColors <- labels2colors(moduleLabels)
```

# Plot the first block only (~5000 genes)

```{r}
# Plot the gene dendrogram of the first block with its module colours,
# straight into a PDF. No on-screen window is opened: the previous
# sizeGrWindow() call left an unused device open because pdf() took over
# immediately afterwards.
# Create the output directory first so pdf() does not fail when it is missing.
dir.create("Plots", showWarnings = FALSE)
pdf(file = "Plots/ConsensusDendrogram-auto.pdf", width = 8, height = 6)
# `consTree` covers only the genes of block 1, so the colours have to be
# selected with that block's gene indices. Taking the first 4999 entries of
# `moduleColors` only lines up when block 1 happens to hold the first 4999
# input genes in order.
plotDendroAndColors(dendro = consTree,
                    colors = moduleColors[net$blockGenes[[1]]],
                    groupLabels = "Module colors",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05,
                    main = "Consensus gene dendrogram and module colors")
dev.off()
```

# Save data for future use

```{r}
# Persist the consensus eigengenes, module labels/colours and the block-1
# dendrogram so later analyses can load them without rebuilding the network.
save(
  consMEs, moduleLabels, moduleColors, consTree,
  file = "Consensus-NetworkConstruction-auto.RData"
)
```