taxa-summary.rmd

---
title: "Speeding-up-science-metatranscriptomics-taxa-summary"
author: "Zeya Xue"
date: "5/9/2019"
output: rmarkdown::github_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(fig.path = "figs/")
```

#  Speeding up science metatranscriptomics taxa summary
* Written by Zhengyao "Zeya" Xue, [ORCID](https://orcid.org/0000-0002-4930-8212) 
* The data files and R scripts can be found in this [GitHub repo](https://github.com/zeyaxue/speeding-up-science-binder)
* [Launch Binder](https://mybinder.org/v2/gh/zeyaxue/speeding-up-science-binder/master?urlpath=rstudio)
* [html version](https://github.com/zeyaxue/speeding-up-science-binder/blob/master/taxa-summary.md)

* Thumbnail of expected Heatmap  

```{r echo=FALSE, out.width='50%'}
knitr::include_graphics("https://raw.githubusercontent.com/zeyaxue/speeding-up-science-binder/master/figs/taxa_heat_thumb.png")
```

* Thumbnail of expected bar plot   

```{r echo=FALSE, out.width='50%'}
knitr::include_graphics("https://raw.githubusercontent.com/zeyaxue/speeding-up-science-binder/master/figs/unnamed-chunk-8-1.png")
```
 
* P.S. The demonstration shown here is using data from [this paper](https://aem.asm.org/content/84/1/e02026-17.short)

## Introduction
The starting point of the workflow is 
  + A count table, normalized or not. Looks like this:
```{r echo=FALSE}
TabTPM <- read.table(file.path("example_data/sample_TPM.tsv"),
                     header = TRUE, sep = "\t")
head(TabTPM)
```
  + A annotation or taxonomy table 
```{r echo=FALSE}
Tabanno <- read.table(file.path("example_data/sample_annotation_classifications.tsv"),
                      header = TRUE, sep = "\t", na.strings = "<NA>")
head(Tabanno)
```
  + A sample metainfo table 
```{r echo=FALSE, warning=FALSE}
samdf <- read.csv(file.path("example_data/Samdf.csv"))
head(samdf)
```


## Load packages and setting up
```{r message=FALSE, warning=FALSE}
library(phyloseq);packageVersion("phyloseq")
library(DESeq2);packageVersion("DESeq2")
library(ggplot2)
library(reshape2)
library(superheat)
library(plyr)
library(dplyr)
library(RColorBrewer)
```

## Import files to create phyloseq object
```{r}
# The otu table slot of phyloseq object 
TabTPM <- read.table(file.path("example_data/sample_TPM.tsv"),
                     header = TRUE, sep = "\t")
row.names(TabTPM) <- TabTPM$CDS_ID
TabTPM <- TabTPM[,-1] # remove CDS_ID column. User defined.
TabTPM <- as.matrix.data.frame(TabTPM)

# The tax table slot of phyloseq object
Tabanno <- read.table(file.path("example_data/sample_annotation_classifications.tsv"),
                      header = TRUE, sep = "\t", na.strings = "<NA>")
rownames(Tabanno) <- Tabanno$CDS_ID
Tabanno <- Tabanno[,c(-1,-2)] # remove CDS_ID and KOID columns. User defined.
Tabanno <- as.matrix.data.frame(Tabanno)

# The sample data slot of phyloseq object
samdf <- read.csv(file.path("example_data/Samdf.csv"))
rownames(samdf) <- samdf$SampleID

ps <- phyloseq(otu_table(TabTPM, taxa_are_rows = TRUE), 
               tax_table(Tabanno), sample_data(samdf))
ps # 20000 taxa and 4 samples 
```

## Optional taxonomy level clean up
```{r warning=FALSE}
# Define function to get the deepest taxa assignment level
RECps <- function(ps) {
  TaxTab2 <- as.data.frame(ps@tax_table)
  
  list.s = as.character(TaxTab2$Species)
  list.g = as.character(TaxTab2$Genus)
  list.f = as.character(TaxTab2$Family)
  list.o = as.character(TaxTab2$Order)
  list.c = as.character(TaxTab2$Class)
  list.p = as.character(TaxTab2$Phylum)
  list.k = as.character(TaxTab2$Kingdom)
  list.d = as.character(TaxTab2$Domain)
  list.REC = character(length(as.character(TaxTab2$Domain)))
  
  for(i in 1:dim(TaxTab2)[1]){
    S = which(TaxTab2$Species[i] == "" | is.na(TaxTab2$Species[i]))
    G = which(TaxTab2$Genus[i] == "" | is.na(TaxTab2$Genus[i]))
    Fa = which(TaxTab2$Family[i] == "" | is.na(TaxTab2$Family[i]))
    O = which(TaxTab2$Order[i] == "" | is.na(TaxTab2$Order[i]))
    C = which(TaxTab2$Class[i] == "" | is.na(TaxTab2$Class[i]))
    P = which(TaxTab2$Phylum[i] == "" | is.na(TaxTab2$Phylum[i]))
    K = which(TaxTab2$Kingdom[i] == "" | is.na(TaxTab2$Kingdom[i]))
    D = which(TaxTab2$Domain[i] == "" | is.na(TaxTab2$Domain[i]))
    if(length(S) == 0){
      list.REC[i] <- list.s[i]
    } else if(length(G) == 0){
      list.REC[i] <- list.g[i]
    } else if(length(Fa) == 0){
      list.REC[i] <- list.f[i]
    } else if(length(O) == 0){
      list.REC[i] <- list.o[i]
    } else if(length(C) == 0){
      list.REC[i] <- list.c[i]
    } else if(length(P) == 0){
      list.REC[i] <- list.p[i]
    } else if(length(K) == 0){
      list.REC[i] <- list.k[i]
    } else if(length(D) == 0){
      list.REC[i] <- list.d[i]
    } else {
      list.REC[i] <- "meow"
    }
  }
  
  TaxTab2$REC <- list.REC
  TaxTab2$REC <- factor(TaxTab2$REC)
  phyloseq(otu_table(ps), sample_data(ps),
           TaxTab2 %>% as.matrix() %>% tax_table())
}

ps.REC <- RECps(ps)
ps.REC # 20000 taxa and 4 samples 
```


## Heat map 
```{r, fig.height = 13, fig.width = 10}
# Clean up the taxonomy 
ps.REC.glom <- ps.REC %>% tax_glom(taxrank = "REC", NArm = FALSE)
# Run the next line if want relative abundance 
ps.REC.per <- ps.REC.glom %>% transform_sample_counts(function(x) x/sum(x) )  
taxa.df <- psmelt(ps.REC.per)  # melt ps object 
# aggregate for REC level plot
taxa.agg <- aggregate(Abundance ~ REC + SampleID,
                      data = taxa.df,
                      mean)
taxa.cast <- dcast(taxa.agg, REC ~ SampleID, mean, value.var = "Abundance")

# Define palette 
my_palette <- colorRampPalette(c("red", "yellow", "green"))(n = 299)
# defines the color breaks manually for a "skewed" color transition
col_breaks = c(seq(-1,0,length=100),      # for red
               seq(0.01,0.8,length=100),  # for yellow
               seq(0.81,1,length=100))    # for green

# only plot the top 30 most abundant taxa 
# need to change results from factor to numeric because of R
row.names(taxa.cast) <- taxa.cast$REC
taxa.cast <- taxa.cast[, -1]
indx <- sapply(taxa.cast, is.factor)
taxa.cast[indx] <- lapply(taxa.cast[indx], function(x) as.numeric(as.character(x))) 
taxa.cast30 <- cbind(taxa.cast, total = rowSums(taxa.cast)) #  need numeric values
taxa.cast30$taxa <- rownames(taxa.cast30)
taxa.cast30 <- head(arrange(taxa.cast30,desc(total)), n = 30)
row.names(taxa.cast30) <- taxa.cast30$taxa
taxa.cast30 <- taxa.cast30[, -c(5,6)] # remove total and taxa name colums
  
superheat(taxa.cast30,
          # retain original order of rows/cols
          pretty.order.rows = TRUE,
          pretty.order.cols = TRUE,
          row.dendrogram = TRUE,
          col.dendrogram = TRUE,
          grid.hline = TRUE,
          row.title = "Annotation",
          column.title = "SampleID",
          left.label.text.size = 4,
          bottom.label.text.size = 5,
          left.label.size = 0.5,
          # change the grid color to white (more pretty on a dark background)
          grid.hline.col = "white",
          grid.vline.col = "white") 

```


## Stack bar plot 
```{r}
# Clean up the taxonomy 
ps.REC.glom <- ps.REC %>% tax_glom(taxrank = "REC", NArm = FALSE)
# Run the next line if want relative abundance 
ps.REC.per <- ps.REC.glom %>% transform_sample_counts(function(x) x/sum(x) )  
taxa.df <- psmelt(ps.REC.per)  # melt ps object 
# aggregate for REC level plot
taxa.agg <- aggregate(Abundance ~ REC + SampleID,
                      data = taxa.df,
                      mean)

# Get the names of the most abundant 15 taxa 
ps.Notop15 <- prune_taxa(names(sort(taxa_sums(ps.REC.per), TRUE)[16:nrow(ps.REC.per@tax_table)]), ps.REC.per)
taxa_names_filt <- ps.Notop15@tax_table[,8] %>% as.character() # 8 for REC level 
# convert REC colum to a character vector from a factor because R
taxa.agg$REC <- as.character(taxa.agg$REC)
# change the less abundant taxa names to "Other"
taxa.agg[taxa.agg$REC %in% taxa_names_filt,]$REC <- "Other"

# Set colors for plotting
mycol = colorRampPalette(brewer.pal(12, "Paired"))(16)

# Set levels of taxon for pretty plots 
## I do not know this beforehand, modified after 1st generating plot to know the 
## taxa names
taxa.agg$REC = factor(taxa.agg$REC, levels = c("Alteromonas_macleodii",
                                               "Anaerophaga_thermohalophila",
                                               "Aureispira_sp._CCB-QB1",
                                               "Bacteroides_fragilis",
                                               "Dyadobacter_alkalitolerans",
                                               "Escherichia_coli",
                                               "Haliscomenobacter_hydrossis",
                                               "Lacinutrix_himadriensis",
                                               "Lewinella_cohaerens",
                                               "Nitrosomonas_communis",
                                               "Phaeodactylibacter_xiamenensis",
                                               "Salinibacter_ruber",
                                               "Saprospira_grandis",
                                                "Synechococcus_sp._BL107",
                                                "Synechococcus_sp._CC9605",
                                                "Other"))

ggplot(taxa.agg, aes(x = SampleID, y = Abundance, fill = REC)) + 
  geom_bar(stat = "identity") +  #position = "fill" is for making the bar 100% 
  scale_fill_manual(values = mycol)+
  theme(axis.title.x = element_blank()) +   # Remove x axis title
  guides(fill = guide_legend(reverse = FALSE, keywidth = 1, keyheight = 1)) +
  ylab("Relative Abundance \n")

```