differential_abundance_limma.Rmd

---
title: "Differential Abundance"
author:
  - name: Lan Huong Nguyen
    affiliation: Institute for Computational and Mathematical Engineering, Stanford University, CA 94305
  - name: JA Grembi
    affiliation: Department of Civil & Environmental Engineering, Stanford University, CA 94305
keywords: microbiome, gPCA, PCA, clustering, differential abundance.
date: "`r BiocStyle::doc_date()`"
output: 
  BiocStyle::html_document:
    toc_float: true
    df_print: paged
vignette: >
  %\VignetteIndexEntry{diet microbiome}
  %\VignetteEngine{knitr::rmarkdown}
---


```{r setup, include=FALSE}
# Global knitr defaults for every chunk in this document: echo the code,
# hide warnings and messages, and write figures (centred, 8 x 6) under
# ../figs/diff_abund/cohort2/. The pdf device option is left disabled.
knitr::opts_chunk$set(
  echo = TRUE, warning = FALSE, message = FALSE, 
  fig.path = "../figs/diff_abund/cohort2/", #dev='pdf',
  fig.align = 'center', fig.wide = TRUE, 
  fig.width = 8, fig.height = 6
)

```

# Set up

```{r}
# Packages used throughout the notebook. The attach order is kept as is:
# plyr comes after tidyverse, which is presumably why later code spells out
# dplyr::select() / dplyr::summarise() explicitly.
.packages <- c("gridExtra","RColorBrewer", 
               "tidyverse", "plyr",
               "phyloseq", "limma", "DESeq2")
# library() stops immediately when a package is missing; require() would only
# return FALSE and let the notebook fail much later with an unrelated error.
invisible(lapply(.packages, library, character.only = TRUE))

options(stringsAsFactors = FALSE)

# Default ggplot2 theme for all figures: black-and-white theme, larger text,
# light grey facet strips.
theme_set(theme_bw())
theme_update(
  text = element_text(size = 15),
  strip.background = element_rect(fill="grey97"))

# Stand-alone theme with the same text/strip settings plus smaller strip
# labels; it can be added to an individual plot.
theme_white_strip <- theme(
  text = element_text(size = 15), 
  strip.text = element_text(size = 11),
  strip.background = element_rect(fill="grey97")) 

# RdBu palette without its three middle (near-white) colours.
rdbucols <- brewer.pal(9, "RdBu")[-(4:6)]
# Number of facet columns (not referenced elsewhere in this file).
ncol_facets  <- 5
```


# Functions
```{r, echo = FALSE}
# Modified voom using arcsinh transformations
source("voom_ihs.R")

# Filter out taxa not present in at least 'thresh' distinct subjects.
#
# ps:     phyloseq object whose sample data contains a "SubjectID" column.
# thresh: minimum number of distinct subjects in which a taxon must have a
#         positive count in order to be kept.
#
# Returns `ps` pruned to the taxa that reach the threshold.
filter_subject_prevalence <- function(ps, thresh = 2) {
  smdata <- data.frame(ps@sam_data)
  seqtab <- as(ps@otu_table, "matrix")
  # The row-wise scan below needs taxa in rows; transpose a
  # samples-by-taxa table instead of silently scanning samples.
  if (!taxa_are_rows(ps)) {
    seqtab <- t(seqtab)
  }

  # For every taxon, count the distinct subjects having at least one sample
  # with a positive count. vapply() + seq_len() keep the result a plain
  # integer vector, including for an empty table.
  subjects_prevalence <- vapply(
    seq_len(nrow(seqtab)),
    function(i) {
      length(unique(smdata[seqtab[i, ] > 0, "SubjectID"]))
    },
    integer(1)
  )
  prune_taxa(rownames(seqtab)[subjects_prevalence >= thresh], ps)
}

# Tip agglomeration based on hierarchical clustering of the phylogenetic
# (cophenetic) distances between the tips of the tree.
#
# physeq: phyloseq object with a phylogenetic tree.
# h:      height at which the clustering tree is cut.
# hcfun:  clustering function applied to the distance object; its result
#         must be convertible with as.hclust().
# ...:    further arguments forwarded to `hcfun`.
#
# Returns the cluster membership produced by cutree(), one entry per tip.
clust_tip_glom <- function (physeq, h = 0.2, hcfun = cluster::agnes, ...) {
  tip_dist <- as.dist(ape::cophenetic.phylo(phy_tree(physeq)))
  tip_hclust <- as.hclust(hcfun(tip_dist, ...))
  cutree(tip_hclust, h = h)
}

# Merge the taxa that were assigned to the same cluster into a single taxon.
#
# physeq:  phyloseq object; its tax_table must contain Genus and Species
#          columns, and only its first seven columns are carried over.
# psclust: cluster id for every taxon, named by taxon and in the same order
#          as the rows of the tax_table (e.g. the output of clust_tip_glom()).
#
# Returns the phyloseq object with one taxon per cluster, named "Tip<id>".
# Its tax_table gains three columns:
#   OrgIncluded - distinct "Genus Species" labels in the cluster, "/"-joined
#                 ("Unknown" when none is available),
#   SeqIncluded - names of the original taxa in the cluster, "/"-joined,
#   Freq        - number of original taxa in the cluster.
merge_tips <- function(physeq, psclust) {
  # Per-cluster summary of the organisms and sequences being collapsed.
  taxtab_collapse <- physeq@tax_table %>%
    as.data.frame() %>%
    rownames_to_column("SeqName") %>%
    select(SeqName, Genus, Species) %>%
    mutate(Tip = paste0("Tip", psclust)) %>%
    group_by(Tip) %>%
    mutate( 
      Org = paste(Genus, Species),
      # paste() would yield the string "NA NA"; keep a real NA instead.
      Org = ifelse(is.na(Genus) & is.na(Species), NA, Org)
    ) %>%
    dplyr::summarise(
      OrgIncluded = paste0(unique(Org[!is.na(Org)]), collapse = "/"),
      SeqIncluded = paste0(SeqName[!is.na(SeqName)], collapse = "/"),
      Freq = n()
    ) %>%
    mutate(
      OrgIncluded = ifelse(OrgIncluded == "", "Unknown", OrgIncluded)
    )
  # Label every taxon with its cluster so the label survives the merge.
  physeq@tax_table <- tax_table(
    cbind(Tip = paste0("Tip", psclust), physeq@tax_table[, 1:7]))
  freqtab <- data.frame(table(psclust))
  # The cluster-id column produced by table(psclust) is named "psclust";
  # the original read the non-existent column "pclust".
  freqtab$Tip <- paste0("Tip", freqtab$psclust)
  # Only clusters holding more than one taxon need merging.
  cliques <- freqtab %>% filter(Freq > 1) %>% 
    .[["psclust"]] %>% as.character()
  for (i in cliques) {
    physeq <- merge_taxa(physeq, 
                         eqtaxa = names(psclust)[psclust == i])
  }
  # Attach the per-cluster summary and rename the taxa after their cluster.
  taxtab <- physeq@tax_table %>%
    as.data.frame() %>%
    left_join(taxtab_collapse, by = "Tip") %>%
    column_to_rownames("Tip")
  physeq@tax_table <- tax_table(as(taxtab, "matrix"))
  taxa_names(physeq) <- rownames(taxtab)
  return(physeq)
} 

# limma DA: differential abundance with limma on voom_ihs-transformed counts,
# accounting for repeated measures through a blocking factor.
#
# seqtab:    count matrix passed to voom_ihs().
# smpdata:   sample data frame used to build the design matrix.
# dsgn:      model formula evaluated on `smpdata`.
# contrasts: character vector of design-matrix coefficient names to test.
#            Names must be given as they look after make.names(), e.g.
#            "ColorBlue.Percent.12Mo" rather than "ColorBlue:Percent.12Mo".
# sizefac:   per-sample size factors, passed to voom_ihs() as lib.size.
# block:     blocking factor (one entry per sample) for
#            duplicateCorrelation() / lmFit().
# taxtable:  taxonomy data frame with a SeqName column, joined onto results.
# alpha:     adjusted p-value cutoff applied by topTable().
#
# Returns one data frame with the rows of all tested coefficients stacked;
# the Contrast column identifies the coefficient. Coefficients without any
# row below `alpha` contribute nothing.
limma_fit <- function(seqtab, smpdata, 
                      dsgn, contrasts, sizefac, block, 
                      taxtable, alpha = 0.05){
  mm <- model.matrix(dsgn, smpdata)
  colnames(mm) <- make.names(colnames(mm))
  # First pass: weights without the correlation structure, used only to
  # estimate the consensus within-block correlation.
  v <- voom_ihs(seqtab, mm, plot = FALSE, 
                lib.size = sizefac)
  corfit <- duplicateCorrelation(v, mm, block = block)
  # Second pass: recompute the weights given the estimated correlation.
  v <- voom_ihs(seqtab, mm, plot = FALSE, block = block,
                correlation = corfit$consensus,
                lib.size = sizefac)
  fit <- lmFit(v, mm, block = block, correlation = corfit$consensus)
  fit.ebayes <- eBayes(fit)
  res <- lapply(contrasts, function(coef_name) {
    # Full argument names (no partial matching); BH-adjusted p-values.
    tp <- topTable(fit.ebayes, adjust.method = "BH", number = Inf, 
                   p.value = alpha, coef = coef_name)
    if (nrow(tp) == 0) {
      return(NULL)
    }
    data.frame(
      SeqName = rownames(tp), 
      Contrast = coef_name, tp) %>%
      left_join(taxtable, by = "SeqName")
  })
  names(res) <- contrasts
  ldply(res, function(x) x, .id = "Contrast")
}

# We can plot normalized and transformed counts to see the trends.
#
# seqtab:       count matrix with taxa in rows and samples in columns.
# smpdata:      sample data frame; joined on the columns it shares with the
#               long count table.
#               NOTE(review): presumably the key is a MeasID column holding
#               the sample names - confirm. data.frame() below may also alter
#               non-syntactic sample names before the join.
# size_factors: size factors named by sample; subset to colnames(seqtab).
# limma_res:    result of limma_fit() (needs SeqName and Contrast columns).
#
# Returns a long data frame with one row per (taxon, sample, matching
# contrast): asinh-transformed, size-factor-normalized `count` plus sample
# and limma columns. Only rows whose sample `Color` appears in the contrast
# name are kept. An empty `limma_res` yields an empty data frame.
get_plot_data <- function(seqtab, smpdata, size_factors, limma_res){
  sf <- size_factors[colnames(seqtab)]
  # Divide each sample (column) by its size factor, then asinh-transform.
  cnts_trans <- asinh(sweep(seqtab, 2, sf, "/"))
  
  cnts_trans.long <- data.frame(cnts_trans) %>%
    rownames_to_column("SeqName") %>%
    gather(MeasID, count, -SeqName) %>%
    left_join(smpdata) 
  
  DF <- cnts_trans.long %>%
    filter(SeqName %in% limma_res$SeqName) %>%
    left_join(limma_res)
  # Row-wise test "does the contrast name mention this sample's Color?".
  # seq_len() + vapply() make this a no-op for zero rows, where
  # sapply(1:nrow(DF), ...) would iterate over c(1, 0) and fail.
  DF$Include <- vapply(
    seq_len(nrow(DF)),
    function(i) grepl(DF$Color[i], DF$Contrast[i]),
    logical(1)
  )
  DF %>% filter(Include)
}


# Scatter plot of transformed counts against percent weight loss at 12 months,
# one facet per Diet x OrgName combination (free axis scales).
#
# DF:             data frame with columns SeqName, OrgName, logFC, adj.P.Val,
#                 SubjectID, Percent.12Mo, Color, Diet and count.
# sbj_markersize: point size of the per-subject means.
# smp_markersize: point size of the individual samples.
#
# Returns a ggplot object: grey points for every sample, overlaid with
# shape-17 points showing each subject's mean count, coloured by logFC on a
# red-blue gradient that is symmetric around zero.
plot_percent <- function(DF, sbj_markersize = 3, smp_markersize = 1.5){
  # Mean count per taxon and subject.
  subject_means <- DF %>%
    dplyr::select(
      SeqName, OrgName, logFC, adj.P.Val,
      SubjectID, Percent.12Mo, Color, Diet, count
    ) %>%
    group_by(
      SeqName, OrgName, logFC, adj.P.Val,
      SubjectID, Percent.12Mo, Color, Diet
    ) %>%
    dplyr::summarise(count = mean(count))

  # Symmetric colour limits so that zero logFC sits mid-palette.
  fc_bound <- max(abs(DF$logFC))
  gradient_cols <- colorRampPalette(brewer.pal(9, "RdBu")[-(4:6)])(100)

  sample_points <- geom_point(
    color = "grey66", size = smp_markersize, alpha = 0.9
  )
  subject_points <- geom_point(
    data = subject_means, aes(color = logFC),
    size = sbj_markersize, shape = 17
  )

  ggplot(DF, aes(x = Percent.12Mo, y = count)) +
    sample_points +
    subject_points +
    scale_color_gradientn(
      colors = gradient_cols,
      limits = c(-fc_bound, fc_bound)
    ) +
    ylab("transformed counts") +
    xlab("Percent weight loss at 12 months [%]") +
    facet_wrap(~ Diet + OrgName, scales = "free")
}
```


# Data processing and filtering

## Load data
We chose to use the agglomerated dataset because testing differential abundance of specific sequence variants is unlikely to yield biologically important results: we do not expect many individual sequence variants to be present in a large number of people. We will include the sequence-level analysis in the supplementary information.


```{r}
# Load the phyloseq object; the outer parentheses also print it.
(ps <- readRDS("../data/phyloseq/diet_participants_with_adherence.rds"))
# Taxonomy as a data frame: drop the Seq column and move the row names
# into a SeqName column.
taxtable <- data.frame(ps@tax_table)  %>%
  dplyr::select(-Seq) %>%
  rownames_to_column("SeqName")
# Number of samples per cohort, with missing cohort values counted too.
table(ps@sam_data$Cohort, useNA = "always")

```


Comment: there are 28 samples without a cohort assignment or any subject data.

```{r}
# Keep samples whose total count exceeds minSampleDepth, then drop taxa that
# have no counts left in the retained samples (and print the result).
# minSampleDepth is reused later when filtering the tip-agglomerated data.
minSampleDepth <- 10000
ps <- subset_samples(ps, sample_sums(ps) > minSampleDepth)
(ps <- subset_taxa(ps, taxa_sums(ps) > 0))

```


## Tip agglomerated data

```{r, eval = FALSE}
# This chunk is not evaluated when knitting (eval = FALSE); it documents how
# the cached tip-agglomerated object was produced.
# h: height at which the clustering of the tree tips is cut.
h <- 0.1
# Tip agglomerated physeq
psclust <- clust_tip_glom(ps, h = h)
pstips <- merge_tips(ps, psclust)
# [ 989 taxa and 491 samples ]
saveRDS(pstips, "../data/phyloseq/diet_participants_tipglom_with_adherence.rds")
```


```{r, echo = FALSE}
# Read the cached tip-agglomerated phyloseq object and print it.
pstips <- readRDS("../data/phyloseq/diet_participants_tipglom_with_adherence.rds")
pstips
```

```{r}
# Something was wrong with the "10 Week" sample(s) of subject 1139: they look
# like outliers for size factor estimation, so they are removed.
pstips <- subset_samples(pstips, !(SubjectID == "1139" & Timepoint == "10 Week"))
# Drop taxa left without any counts and print the result.
(pstips <- subset_taxa(pstips, taxa_sums(pstips) > 0))
```

## Estimate size factors

```{r}
# Estimate DESeq2 size factors on the tip-agglomerated data, using an
# intercept-only design and the "poscount" estimator.
ddstips <- phyloseq_to_deseq2(pstips, design = ~ 1)
ddstips <- estimateSizeFactors(ddstips, type = "poscount")
size_factors_tips <- sizeFactors(ddstips)
summary(size_factors_tips)

# Since we do not want to artificially inflate the counts in the voom
# transformation, we rescale the size factors so that the smallest equals 1.
size_factors_tips <- size_factors_tips/min(size_factors_tips)
summary(size_factors_tips)
```


```{r}
# ggplot(data.frame(depth = sample_sums(pstips),
#                   Size_Factor = size_factors,
#                   Subject = pstips@sam_data$SubjectID,
#                   Sample = pstips@sam_data$SampleID)) +
#   geom_text(aes(x = depth, Size_Factor, label = Subject))
```

## Filter samples and sparse tips.

```{r}
# Filter samples: Cohort 2 only. Each step prints the intermediate object.
(pstips1 <- subset_samples(pstips, Cohort == 2))

# Baseline time point only.
(pstips1 <- subset_samples(pstips1, Timepoint == "Baseline"))

# Filter out any samples without a 12-month weight-loss category
(pstips1 <- subset_samples(pstips1, !is.na(Cat.12Mo)))

# Apply the sample depth threshold to the tip-agglomerated data.
(pstips1 <- subset_samples(pstips1, sample_sums(pstips1) > minSampleDepth))

# Filter taxa: drop those with no counts in the remaining samples
(pstips1 <- subset_taxa(pstips1, taxa_sums(pstips1) > 0))

# Prevalence threshold: 10% of the number of distinct subjects remaining.
# This global is reused later for the sequence-level filtering.
minSubjPrev <- .1*length(unique(pstips1@sam_data$SubjectID))
# Filter out rare taxa not present in at least 10% of subjects
(pstips1 <- filter_subject_prevalence(pstips1, thresh = minSubjPrev))

# Inputs for limma_fit(): count matrix and sample data.
seqtabtip <- as(pstips1@otu_table, "matrix")
smpdata <- data.frame(pstips1@sam_data) 
# Fix the category order, with "US" as the reference level.
smpdata$Cat.12Mo <- factor(
  smpdata$Cat.12Mo, levels = c("US", "MS", "VS"))

# Taxonomy lookup for the results; OrgName is set to the cluster name.
# NOTE(review): built from `pstips` (before the filters above), not `pstips1`;
# harmless for a left join keyed on SeqName, but confirm this is intended.
taxtabletip <- data.frame(pstips@tax_table)  %>%
  dplyr::select(Kingdom:Species) %>%
  rownames_to_column("SeqName") %>%
  mutate(OrgName = SeqName)
```


# Differential abundance with voom

We apply the limma + voom method to data for participants in Cohort 2.
We choose limma because our dataset has replicates, and limma is 
the only differential abundance testing package that can estimate
a within-subject random effect. Also, we have more than a dozen
samples available, so there is no need for very sensitive methods
such as DESeq2, which is designed for datasets with very few samples.
Using limma + voom was in fact the advice we received from Michael Love,
a developer of DESeq2.

## Percent Test 
Taxa differentially abundant across Percent weight-loss at 12Mo (Percent.12Mo)

```{r}
# Design without intercept: one level per Color, a separate Percent.12Mo
# slope per Color, and Lane as a covariate.
dsgnPercent <- ~ 0 + Color + Color:Percent.12Mo + Lane 
# Coefficients to test, named as the design-matrix columns look after
# limma_fit() applies make.names() (":" becomes ".").
contrastPercent <- c("ColorBlue.Percent.12Mo", "ColorPurple.Percent.12Mo")

# alpha = 1 keeps every taxon; the significance filter is applied in the next
# chunk. The join adds the per-cluster SeqIncluded / OrgIncluded columns.
limmaPercentTip <- limma_fit(
  seqtabtip, smpdata, dsgnPercent, 
  contrasts = contrastPercent,
  sizefac = size_factors_tips[rownames(smpdata)], 
  block = smpdata$SubjectID, 
  taxtable = taxtabletip,
  alpha = 1)  %>%
  left_join(data.frame(pstips1@tax_table) %>%
              rownames_to_column("SeqName") %>%
              dplyr::select(SeqName, SeqIncluded, OrgIncluded))
# write.csv(limmaPercentTip,
#           file = "../results/limma/limma_cohort2_tip_percent_all.csv")
```


```{r}
# Significance level for adjusted p-values; this global is reused by the
# later filtering chunks as well.
alpha <- 0.05
limmaPercentTip <- limmaPercentTip %>%
  filter(adj.P.Val < alpha)

# Number of significant taxa per tested coefficient.
table(limmaPercentTip$Contrast)

# write.csv(limmaPercentTip,
#           file = "../results/limma/limma_cohort2_tip_percent.csv")
```

```{r}
# Print the significant tip-level results of the percent test.
limmaPercentTip
```

```{r limma-percent-tip, fig.width=8, fig.height=4}
# Plot the significant clusters against percent weight loss; "Tip" is shown
# as "Cluster" in the facet labels.
DF <- get_plot_data(seqtabtip, smpdata, size_factors_tips, limmaPercentTip)
DF$OrgName <- gsub("Tip", "Cluster", DF$OrgName)
plot_percent(DF) + 
  theme(text = element_text(size = 15),
        strip.text = element_text(size = 11)) + labs(y = "Normalized ASV-cluster count")
```

## Categorical Test
Taxa differentially abundant across weight-loss categories at 12Mo (Cat.12Mo), specifically comparing the VS with the US groups.

```{r}
# Category order with "US" as the reference level.
smpdata$Cat.12Mo <- factor(
  smpdata$Cat.12Mo, levels = c("US", "MS", "VS"))

# Design without intercept: one level per Color, Cat.12Mo effects within each
# Color, and Lane as a covariate.
dsgnCat <- ~ 0 + Color + Color:Cat.12Mo + Lane
# Coefficients to test (MS and VS within each Color), named as the
# design-matrix columns look after make.names() in limma_fit().
contrastsCat <-  c(
  "ColorBlue.Cat.12MoMS", "ColorPurple.Cat.12MoMS",
  "ColorBlue.Cat.12MoVS", "ColorPurple.Cat.12MoVS")


# alpha = 1 keeps every taxon; the significance filter is applied in the next
# chunk. The join adds the per-cluster SeqIncluded / OrgIncluded columns.
limmaCatTip <- limma_fit(
  seqtabtip, smpdata, 
  dsgn = dsgnCat, 
  contrasts = contrastsCat,
  sizefac = size_factors_tips[rownames(smpdata)], 
  block = smpdata$SubjectID, 
  taxtable = taxtabletip,
  alpha = 1) %>%
  left_join(data.frame(pstips1@tax_table) %>%
              rownames_to_column("SeqName") %>%
              dplyr::select(SeqName, SeqIncluded, OrgIncluded))

# write.csv(limmaCatTip,
#           file = "../results/limma/limma_cohort2_tip_categorical_all.csv")
```


```{r}
# Keep significant results (using the global `alpha`) and count them per
# tested coefficient.
limmaCatTip <- limmaCatTip %>%
  filter(adj.P.Val < alpha)
table(limmaCatTip$Contrast)

# Stack the significant percent-test and categorical-test results; only the
# commented-out export below uses this table.
limmaTip <- rbind(limmaPercentTip, limmaCatTip)

# write.csv(limmaCatTip,
#           file = "../results/limma/limma_cohort2_tip_categorical.csv")
# write.csv(limmaTip, file = "../results/limma/limma_cohort2_tip.csv")
```

```{r}
# Keep only the coefficients involving the VS category, ordered by logFC,
# and print them.
limmaCatTip_fltr <- limmaCatTip %>% 
    filter(grepl("VS", Contrast)) %>%
    arrange(logFC)
limmaCatTip_fltr
```


```{r limma-cat-tip, fig.width=10, fig.height=4}
# Show "Tip" as "Cluster" in the facet labels, then plot the clusters that
# are significant for the VS coefficients.
limmaCatTip_fltr$OrgName <- gsub(
  "Tip", "Cluster", limmaCatTip_fltr$OrgName
)
DF <- get_plot_data(
  seqtabtip, smpdata, size_factors_tips, limmaCatTip_fltr) 
plot_percent(DF) + labs(y = "Normalized ASV-cluster count")
```


# Supplementary materials

## Sequence-level analysis

### Estimate size factors

```{r}
# Estimate DESeq2 size factors on the sequence-level data, using an
# intercept-only design and the "poscount" estimator.
dds <- phyloseq_to_deseq2(ps, design = ~ 1)
dds <- estimateSizeFactors(dds, type = "poscount")
size_factors <- sizeFactors(dds)
summary(size_factors)

# Since we do not want to artificially inflate the counts in the voom
# transformation, we rescale the size factors so that the smallest equals 1.
size_factors <- size_factors/min(size_factors)
summary(size_factors)
```

```{r}
# Size factor against sample depth, each sample labelled with its subject id,
# to spot outlying samples.
ggplot(data.frame(depth = sample_sums(ps),
                  Size_Factor = size_factors,
                  Subject = ps@sam_data$SubjectID,
                  Sample = ps@sam_data$SampleID)) +
  geom_text(aes(x = depth, Size_Factor, label = Subject))

```

```{r}
# Sample data of the samples whose rescaled size factor exceeds 20.
data.frame(ps@sam_data)[size_factors > 20, ]
```

```{r}
# Remove the "10 Week" sample(s) of subject 1139 (the same exclusion as for
# the tip-agglomerated data), then drop taxa left without counts and print.
ps <- subset_samples(ps, !(SubjectID == "1139" & Timepoint == "10 Week"))
(ps <- subset_taxa(ps, taxa_sums(ps) > 0))
```


```{r}
# Re-estimate the size factors now that the outlying sample(s) are removed
# (intercept-only design, "poscount" estimator).
dds <- phyloseq_to_deseq2(ps, design = ~ 1)
dds <- estimateSizeFactors(dds, type = "poscount")
size_factors <- sizeFactors(dds)
summary(size_factors)

# Since we do not want to artificially inflate the counts in the voom
# transformation, we rescale the size factors so that the smallest equals 1.
size_factors <- size_factors/min(size_factors)
summary(size_factors)
```

```{r}
# Same diagnostic as before the exclusion: re-estimated size factor against
# sample depth, labelled by subject id.
ggplot(data.frame(depth = sample_sums(ps),
                  Size_Factor = size_factors,
                  Subject = ps@sam_data$SubjectID,
                  Sample = ps@sam_data$SampleID)) +
  geom_text(aes(x = depth, Size_Factor, label = Subject))

```


### Filter samples and sequences

We first analyze the data from Cohort 2, and leave Cohort 3 for validation.

```{r}
# Filter samples: Cohort 2 only. Each step prints the intermediate object.
(ps1 <- subset_samples(ps, Cohort == 2))

# Baseline time point only.
(ps1 <- subset_samples(ps1, Timepoint == "Baseline"))

# Drop samples without a weight-loss category, whether it is a real NA or
# the literal string "NA".
(ps1 <- subset_samples(ps1, !is.na(Cat.12Mo) & Cat.12Mo != "NA"))


# Filter out rare taxa present in less than 10% of all participants
# NOTE(review): minSubjPrev is the value computed from the tip-agglomerated
# data earlier in the document, not recomputed from ps1 - confirm intended.
(ps1 <- filter_subject_prevalence(ps1, thresh = minSubjPrev))

# Inputs for limma_fit(). These overwrite the `smpdata` and `taxtable`
# objects used in the tip-level analysis above.
seqtab <- as(ps1@otu_table, "matrix")
smpdata <- data.frame(ps1@sam_data) 
smpdata$Percent.12Mo <- as.numeric(smpdata$Percent.12Mo)

taxtable <- data.frame(ps1@tax_table)  %>%
  dplyr::select(Kingdom:Species, OrgName) %>%
  rownames_to_column("SeqName") 
```


### Percent Test

```{r}
# Sequence-level percent test, reusing dsgnPercent and contrastPercent from
# the tip-level analysis. alpha = 1 keeps every taxon; the significance
# filter is applied in the next chunk.
limmaPercent <- limma_fit(
  seqtab, smpdata, dsgnPercent, 
  contrasts = contrastPercent,
  sizefac = size_factors[rownames(smpdata)], 
  block = smpdata$SubjectID, 
  taxtable = taxtable,
  alpha = 1)

# write.csv(limmaPercent, 
#           file = "../results/limma/limma_cohort2_seq_percent_all.csv")

```


```{r}
# Keep significant results (using the global `alpha`) and count them per
# tested coefficient.
limmaPercent <- limmaPercent %>%
  filter(adj.P.Val < alpha)

table(limmaPercent$Contrast)
```


```{r}
# Print the significant sequence-level results of the percent test.
limmaPercent
```

```{r}
# write.csv(limmaPercent, 
#           file = "../results/limma/limma_cohort2_seq_percent.csv")
```


```{r limma-percent-seq, fig.width=10, fig.height=4}
##No data in the filtered DF so we get an error with this code

# DF <- get_plot_data(seqtab, smpdata, size_factors, limmaPercent)
# plot_percent(DF)

```

### Categorical Test

```{r}
# Category order with "US" as the reference level (smpdata was rebuilt from
# ps1, so the factor has to be set again).
smpdata$Cat.12Mo <- factor(
  smpdata$Cat.12Mo, levels = c("US", "MS", "VS"))

# Sequence-level categorical test, reusing dsgnCat and contrastsCat from the
# tip-level analysis. alpha = 1 keeps every taxon; the significance filter is
# applied in the next chunk.
limmaCat <- limma_fit(
  seqtab, smpdata,
  dsgn = dsgnCat, 
  contrasts = contrastsCat,
  sizefac = size_factors[rownames(smpdata)], 
  block = smpdata$SubjectID, 
  taxtable = taxtable,
  alpha = 1)

# write.csv(limmaCat, 
#           file = "../results/limma/limma_cohort2_seq_categorical_all.csv")
```


```{r}

# Keep significant results (using the global `alpha`) and count them per
# tested coefficient.
limmaCat <- limmaCat %>% 
  filter(adj.P.Val < alpha)
table(limmaCat$Contrast)

# write.csv(limmaCat, 
#           file = "../results/limma/limma_cohort2_seq_categorical.csv")
# 
# limmaSeq <- rbind(limmaPercent, limmaCat)
# write.csv(limmaSeq, file = "../results/limma/limma_cohort2_seq.csv")
```

```{r}
# Order the significant sequence-level results by logFC and print them.
limmaCat <- limmaCat %>%
  arrange(logFC)
limmaCat # %>% filter(grepl("VS", Contrast))   #no need to filter as only VS
```


```{r limma-cat-seq, fig.width=8, fig.height=6}
# Plot the sequences that are significant for the VS coefficients.
# The facet_wrap() added here replaces the faceting set inside
# plot_percent(), so this figure does not use scales = "free".
DF <- get_plot_data(
  seqtab, smpdata, size_factors, 
  limmaCat %>% filter(grepl("VS", Contrast))) #%>% mutate(OrgName = factor(OrgName, levels = limmaCat$OrgName))
plot_percent(DF) +
  facet_wrap(~ Diet + OrgName) +
  theme(text = element_text(size = 15),
        strip.text = element_text(size = 11)) + labs(y = "Normalized ASV count")
```


```{r}
##########################################
##########################################  Lan don't delete this!  
## I want to add it in here because I used this code to get info that I have included in the differential abundance section write-up in the manuscript, so I wanted to keep it together with the rest of the differential abundance stuff.  I just need to modify it a bit to get it working in this notebook (ensure the data.frame names are the same, etc.) and haven't had time to do that yet.
##########################################
# tax.table.blue <- as(bl.psasinh.blue@tax_table, "matrix")
# otu.table.blue <- as(bl.psasinh.blue@otu_table, "matrix")
# ##Number of samples that each taxa is found in
# numSamples.byTaxa <- rowSums(otu.table.blue != 0)
# 
# #Looking at Ruminococcaceae_UCG-002
# Rumino_002 <- data.frame(tax.table.blue) %>%
#   filter(Genus == "Ruminococcaceae_UCG-002") %>%
#   mutate(Seqs = gsub(" ", "", gsub(":", "", substr(OrgName,1,7))))
# Rumino_002.otu <- data.frame(otu.table.blue) %>%
#   rownames_to_column() %>%
#   filter(rowname %in% Rumino_002$Seqs)
# Rumino_002.otu.numSamples.byTaxa <- rowSums(Rumino_002.otu != 0)
# names(Rumino_002.otu.numSamples.byTaxa) <- Rumino_002.otu$rowname
# Rumino_002.otu.numSamples.byTaxa
# 
# Rumino_013 <- data.frame(tax.table.blue) %>%
#   filter(Genus == "Ruminococcaceae_UCG-013") %>%
#   mutate(Seqs = gsub(" ", "", gsub(":", "", substr(OrgName,1,7))))
# Rumino_013.otu <- data.frame(otu.table.blue) %>%
#   rownames_to_column() %>%
#   filter(rowname %in% Rumino_013$Seqs)
# Rumino_013.otu.numSamples.byTaxa <- rowSums(Rumino_013.otu != 0)
# names(Rumino_013.otu.numSamples.byTaxa) <- Rumino_013.otu$rowname

```