forked from megaptera-helvetiae/MAGrepresentation-Binder
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathanalysis.Rmd
193 lines (157 loc) · 6.04 KB
/
analysis.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
---
title: "MAG Representation"
author: "Marian L Schmidt & Ian Morelan"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
html_document:
code_folding: show
highlight: default
keep_md: no
theme: journal
toc: yes
toc_float:
collapsed: no
smooth_scroll: yes
toc_depth: 3
editor_options:
chunk_output_type: console
---
# Setup
```{r setup, include=FALSE}
knitr::opts_chunk$set(eval = TRUE,
echo = TRUE,
cache = FALSE,
include = TRUE,
warning = FALSE,
collapse = FALSE,
message = FALSE,
engine = "R", # Chunks will always have R code, unless noted
error = TRUE,
fig.height = 7, fig.width = 7,
fig.path="./Figures/") # Set the figure options
```
# Load Packages & Functions
```{r load-packages-funs}
# Load functions and packages that are necessary
source("functions.R")
library(plotly)
# We'd like to plot with pretty colors based on national park posters :)
# install.packages("devtools")
#devtools::install_github("katiejolly/nationalparkcolors")
library(nationalparkcolors)
# Assign the 4 colors for the 4 different depths
colors4 <- park_palette("Saguaro", 4)
```
# Load Data
```{r import-data}
# Import the tax data
tax_physeq <- import_gtdbtk_taxonomy_and_checkm(
taxonomy_filename = "Tara_Oceans_Med/TOBG-MED-READCOUNTMATCH.bac120.tsv",
checkm_filename = "Tara_Oceans_Med/TOBG-MED_qa.txt")
# Import the metadata
meta_physeq <- import_metadata(province_filename = "Tara_Oceans_Med/Sample-Province.tsv",
sizeFraction_filename = "Tara_Oceans_Med/Sample-SizeFraction.tsv") %>%
mutate(names = rownames(.)) %>%
separate(., col = names, into = c("station", "fraction", "depth"), sep = "_") %>%
mutate(r_names = paste(station, fraction, depth, sep = "_")) %>%
column_to_rownames(var = "r_names") %>%
dplyr::select(-c(station, fraction)) %>%
sample_data()
# Import the readcounts
mag_table <- import_readcounts(readcounts_filename = "Tara_Oceans_Med/TOBG-MED-TOTAL.readcounts")
# Import the tree
mag_tree <- import_tree(tree_filename = "Tara_Oceans_Med/GToTree_output.newick")
# Put into phyloseq object
tara_physeq <- phyloseq(meta_physeq, tax_physeq, mag_table, mag_tree)
```
# Normalization
```{r data-normalized, fig.height=10, fig.width=12}
# Normalizing the matrix
# Dividing rows by the genome size for the bin
# Genome size is different from Bin size
# Genome size is BinSize * BinCompletion
# Read in checkm data to calculate the expected
checkm <- read.csv("Tara_Oceans_Med/TOBG-MED_qa.txt", sep = "\t", as.is = TRUE) %>%
dplyr::select("Bin.Id", "Completeness", "Genome.size..bp.") %>%
dplyr::rename(est_genome_size = "Genome.size..bp.") %>%
# Make a new column for the expected genome size based on est_genome_size * completeness
mutate(exp_genome_size = round(est_genome_size/(Completeness/100)))
ordered_checkm <- data.frame(Bin.Id = rownames(mag_table)) %>%
left_join(checkm, by = "Bin.Id")
#Sanity check
stopifnot(rownames(mag_table) == ordered_checkm$Bin.Id)
# Do the normalization
t_mat <- t(as.matrix(mag_table))
# divide the matrix columns by the genome size of each MAG
norm_mat <- t_mat/ordered_checkm$exp_genome_size
t_norm_mat <- t(norm_mat)
# Combine into a normalized phyloseq object
tara_norm_physeq <- phyloseq(meta_physeq, tax_physeq, mag_tree,
otu_table(t_norm_mat, taxa_are_rows = TRUE))
```
## Heatmap
```{r heatmap, fig.height=8, fig.width=10}
# A clustered heatmap
heatmap(otu_table(tara_norm_physeq))
# Visualize only the top 50 taxa
top_50MAGs <- names(sort(taxa_sums(tara_norm_physeq), decreasing = TRUE))[1:50]
top_50MAGs_physeq <- prune_taxa(top_50MAGs, tara_norm_physeq)
# Subset only the bacterial samples
girus_top50MAGs <- subset_samples(top_50MAGs_physeq, size_fraction =="girus")
# Melt into long format and fix zeros to avoid infinity values
otu_long_50_girus <- psmelt(otu_table(girus_top50MAGs)) %>%
mutate(log_abund = log2(Abundance + 0.0000001))
# Make a "heatmap" with geom_tile that works with plotly :)
heat_plot <- otu_long_50_girus %>%
ggplot(aes(x=Sample, y=OTU)) +
geom_tile(aes(fill = log_abund)) +
scale_fill_distiller(palette = "YlGnBu") +
labs(title = "Top 50 MAGs") +
theme(axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1))
# Plot the plotly plot!
ggplotly(heat_plot)
```
## Beta diversity plots
```{r betadiv, fig.width = 9}
# Calculate the bray curtis distances for all samples
tara_norm.ord <- ordinate(tara_norm_physeq, method = "PCoA", distance = "bray")
# Make a faceted plot by size fraction
sizeFrac_PCoA_prov <-
plot_ordination(tara_norm_physeq, tara_norm.ord, color = "province") +
facet_wrap(~size_fraction) +
theme_minimal() +
scale_color_brewer(palette = "Paired")
# Make it a plotly plot
ggplotly(sizeFrac_PCoA_prov)
```
```{r betadiv2, fig.height = 4, fig.width=5}
# Subset the bacterial samples and color by depth
bact <- subset_samples(tara_norm_physeq, size_fraction == "bact")
# Calculate the bray curtis distances for a PCoA
bact.ord <- ordinate(bact, method = "PCoA", distance = "bray")
# Make the plot
bact_PCoA_depth <- plot_ordination(bact, bact.ord, color = "depth")+
theme_minimal() +
geom_point(size = 2) +
scale_color_manual(values = colors4)
# Make it a plotly plot
ggplotly(bact_PCoA_depth)
```
## Abundance plots
```{r abundance plots}
# Select the top 4 taxa
top_taxa <- names(sort(taxa_sums(bact),decreasing = TRUE))[1:4]
top_bact <- prune_taxa(top_taxa, bact)
# Melt the data to the long format
top_bact_df <- psmelt(top_bact)
# Plot it! :)
abundplot <- ggplot(top_bact_df, aes(x=province, y= Abundance, color = depth)) +
facet_wrap(~OTU, scales = "free_y") +
geom_jitter(size = 2) + theme_minimal()+
labs(y = "Normalized MAG Abundance") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
axis.title.x = element_blank()) +
scale_color_manual(values = colors4)
# Make it interactive
ggplotly(abundplot)
```