-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTotalRNA_phyloFlash_NTUabundance_to_phyloseq.R
119 lines (91 loc) · 4.16 KB
/
TotalRNA_phyloFlash_NTUabundance_to_phyloseq.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# Analysis of microbial communities from total RNA data processed using phyloFlash.
# Load libraries
library(dplyr)
library(microbiome)
library(microViz)
library(viridis)
library(vegan)
library(here)
# Normalize library sizes by scaling every sample to a common depth.
#
# Arguments:
#   physeq  phyloseq object holding the counts to rescale
#   n       target library size applied to every sample
#
# Each sample's counts are converted to n * count / sum(counts), rounded down
# to whole numbers, and taxa whose total across samples is then zero are
# removed. Returns the rescaled phyloseq object.
scale_reads <- function(physeq, n) {
  rescale_sample <- function(counts) {
    n * counts / sum(counts)
  }
  rescaled <- transform_sample_counts(physeq, rescale_sample)

  # Round the scaled values down to whole counts
  otu_table(rescaled) <- floor(otu_table(rescaled))

  # Keep only taxa that still have at least one count after flooring
  keep_taxa <- taxa_sums(rescaled) > 0
  prune_taxa(keep_taxa, rescaled)
}
# Make a list of paths of all .csv files from the phyloFlash output
# (*NTUabundance.csv). `pattern` is a regular expression, not a shell glob,
# so match the literal ".csv" extension at the end of the file name.
all_paths <- list.files(
  path = "path/to/phyloFlash/TotalRNA/NTUabundance/files",
  pattern = "\\.csv$",
  full.names = TRUE
)
# Fail early with a clear message instead of a cryptic error further down
if (length(all_paths) == 0) {
  stop("No .csv files found in the NTUabundance directory", call. = FALSE)
}

# Import all files (no header row, so columns are named V1, V2, ...).
# stringsAsFactors = FALSE keeps the taxonomy column as character on R < 4.0,
# which strsplit() below requires.
all_content <- lapply(
  all_paths,
  read.table,
  sep = ",",
  encoding = "UTF-8",
  stringsAsFactors = FALSE
)

# Combine all files: full outer join on the taxonomy string in column V1
NTUtable <- Reduce(
  function(x, y) merge(x, y, by = "V1", all = TRUE),
  all_content
)

# Get the file names and strip path and extension to obtain sample names
all_filenames <- sub(
  "\\.phyloFlash\\.NTUabundance\\.csv$",
  "",
  basename(all_paths)
)

# Add an (empty) entry for the first column and replace the column names
# with the sample names
all_filenames <- c(L1 = "", all_filenames)
colnames(NTUtable) <- all_filenames

# Create tax-table: split every taxonomy string into its path components
path_split <- strsplit(NTUtable[, 1], ";")

# Import the taxa map from the SILVA version used to annotate with phyloFlash
silva <- read.table(
  "tax_slv_ssu_138.1.txt",
  header = FALSE,
  sep = "\t",
  stringsAsFactors = FALSE
)

# Prepare the taxa map in the format expected by the parsing function:
#   path  full taxonomic path without the trailing ";"
#   node  last element of the path
#   rank  third column of the SILVA file
# The columns are stored as factors because the parsing function reads the
# rank levels from this data frame.
silva_map <- data.frame(
  path = gsub(";$", "", silva$V1),
  node = vapply(
    strsplit(silva$V1, ";"),
    function(x) x[length(x)],
    character(1)
  ),
  rank = silva$V3,
  stringsAsFactors = TRUE
)
# Map split taxonomic paths onto the ranks of a SILVA taxonomy map.
# (Parsing function provided by Christiane Hassenrück, @chassenr on GitHub.)
#
# Arguments:
#   tax    list of character vectors; each vector is one taxonomic path split
#          into its elements (e.g. the output of strsplit(paths, ";"))
#   SILVA  data frame with columns `path` (";"-joined path), `node` (name of
#          the last element of that path) and `rank` (rank of that node);
#          columns may be factors or character vectors
#
# Returns a character matrix with one row per element of `tax` and one column
# per rank level (levels of `SILVA$rank`). A cell holds the node name found
# for that rank, or NA when no prefix of the path maps to that rank.
SILVAtaxopath <- function(tax, SILVA) {
  # as.factor() leaves a factor untouched and gives the same sorted levels
  # for a character column, so both input types are accepted
  rank_levels <- levels(as.factor(SILVA$rank))

  # Convert the lookup columns once instead of on every iteration
  known_paths <- as.character(SILVA$path)
  known_nodes <- as.character(SILVA$node)
  known_ranks <- as.character(SILVA$rank)

  output <- matrix(
    NA_character_,
    nrow = length(tax),
    ncol = length(rank_levels),
    dimnames = list(NULL, rank_levels)
  )

  # seq_along() keeps the loop from running on empty input
  for (i in seq_along(tax)) {
    lineage <- tax[[i]]
    # Never look deeper than the path itself (no NA-padded prefixes), and
    # keep the original upper bound of one prefix per rank level
    max_depth <- min(length(lineage), length(rank_levels))
    for (j in seq_len(max_depth)) {
      prefix <- paste(lineage[seq_len(j)], collapse = ";")
      hits <- which(known_paths == prefix)
      if (length(hits) > 0) {
        output[i, known_ranks[hits]] <- known_nodes[hits]
      }
    }
  }

  output
}
# Use function to organize taxonomic calls: one row per NTU, one column per
# SILVA rank
TAXmat <- SILVAtaxopath(path_split, silva_map)

# Create row names corresponding to NTUs (NTU1, NTU2, ...). seq_len() yields
# an empty sequence for an empty table, where seq(1:0) would yield 1:2.
prefix <- "NTU"
suffix <- seq_len(nrow(NTUtable))
NTU.names <- paste0(prefix, suffix)
row.names(TAXmat) <- NTU.names

# Keep the ranks used downstream; drop = FALSE keeps a matrix even when the
# table has a single row
TAXmat <- TAXmat[
  ,
  c(
    "domain", "major_clade", "kingdom", "phylum",
    "class", "order", "family", "genus"
  ),
  drop = FALSE
]

# Create phyloseq tax table
TAX <- tax_table(TAXmat)
# Create the OTU matrix for phyloseq import: every column of the merged table
# except the first (taxonomy) one, with one row per NTU
df <- NTUtable[, -1]
rownames(df) <- NTU.names

# Prefix all sample names with the project identifier
sample_prefix <- "IS19"
names(df) <- paste0(sample_prefix, "_", names(df))

# Show the current names of columns 16 to 18, then replace them with the
# names of the blank / negative control samples
names(df)[16:18]
names(df)[16:18] <- c("Field-BL", "Lab-BL", "NEG")

# NTUs missing from a sample after the merge have a count of zero
df[is.na(df)] <- 0

NTUmat <- as.matrix(df)
NTU <- otu_table(NTUmat, taxa_are_rows = TRUE)
# Create sample data for phyloseq; the first column of the CSV supplies the
# row (sample) names
SAMPLE_table <- read.csv(
  file = "path/to/SAMPLE_data.csv",
  header = TRUE,
  row.names = 1,
  sep = ","
)
SAMPLE_DATA <- sample_data(SAMPLE_table)

# Check OTU table names vs mapping file (sample_data) names (if they don't
# match, samples won't be included in the final phyloseq object).
all(colnames(NTU) %in% rownames(SAMPLE_DATA))

# Name the offending samples so a mismatch can be fixed without guessing
unmatched_samples <- setdiff(colnames(NTU), rownames(SAMPLE_DATA))
if (length(unmatched_samples) > 0) {
  warning(
    "Samples in the OTU table without an entry in the sample data: ",
    paste(unmatched_samples, collapse = ", "),
    call. = FALSE
  )
}

# Create phyloseq object using the created tables
TotalRNA <- phyloseq(NTU, TAX, SAMPLE_DATA)
# Fix the taxa table to replace unknowns etc. (using the microViz package).
# First pass: tax_fix() with min_length = 4 and every other argument left at
# its default.
TotalRNA_fixed<-tax_fix(TotalRNA, min_length = 4)
# Second pass: tax_fix() again, on the output of the first pass, this time
# with an explicit list of labels to treat as unknown, a space as separator,
# anon_unique = TRUE and "classified" as suffix_rank.
# NOTE(review): presumably the second pass catches labels the default pass
# leaves behind -- confirm that both passes are needed.
TotalRNA_fixed<- TotalRNA_fixed %>%
tax_fix(
min_length = 4,
unknowns = c("uncultured", "uncultured class", "uncultured order", "uncultured family", "uncultured phylum", "Unknown Family family","Unknown", "endosymbionts", "Incertae Sedis", "Incertae Sedis class", "Incertae Sedis order","Incertae Sedis family"),
sep = " ", anon_unique = TRUE,
suffix_rank = "classified"
)
# Keep only taxa whose genus is not exactly "Unknown Family family"
TotalRNA_fixed<-subset_taxa(TotalRNA_fixed,genus != "Unknown Family family")
# Save phyloseq objects for posterity: both the object as assembled
# (TotalRNA) and the taxonomy-fixed one (TotalRNA_fixed) are written as .rds
# files to paths built with here()
saveRDS(TotalRNA, here("TotalRNA.rds"))
saveRDS(TotalRNA_fixed, here("TotalRNA_fixed.rds"))