Skip to content

Commit

Permalink
add in alternative dada2 error functions (these require dplyr)
Browse files Browse the repository at this point in the history
  • Loading branch information
cjfields committed Jan 13, 2025
1 parent cfd0f83 commit e867f49
Showing 1 changed file with 305 additions and 0 deletions.
305 changes: 305 additions & 0 deletions lib/dada2_error_models.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,305 @@
# For context, see https://github.com/benjjneb/dada2/issues/1307. Note these are
# likely being subsumed by a new error model recently introduced in DADA2:
# https://github.com/benjjneb/dada2/issues/1307#issuecomment-2521790524
# Alternative DADA2 error-estimation function, variant 1: for each of the 12
# substitution types, fit a loess curve of log10(error rate) against quality
# score, weighted by log10(total counts) and with span = 2 (Guillem Salazar's
# solution, https://github.com/benjjneb/dada2/issues/938).
#
# Args:
#   trans: numeric matrix of transition counts. Rows are indexed by the names
#     "A2A", "A2C", ..., "T2T"; colnames must be parseable as numeric quality
#     scores. At least 40 columns are required (see the monotonicity step).
#
# Returns:
#   A 16 x ncol(trans) numeric matrix with rownames "A2A", "A2C", ..., "T2T"
#   and the colnames of `trans`. The 12 substitution rows hold the fitted,
#   clamped error rates; each self-transition row is 1 minus the sum of the
#   three substitution rows for the same original nucleotide.
loessErrfun_mod1 <- function(trans) {
  nts <- c("A", "C", "G", "T")
  qq <- as.numeric(colnames(trans))

  # The monotonicity step below floors every rate at the value found in the
  # 40th quality column, so fail early and clearly if that column is missing.
  ref_col <- 40L
  if (length(qq) < ref_col) {
    stop(
      "loessErrfun_mod1() needs at least ", ref_col,
      " quality-score columns in `trans`, got ", length(qq),
      call. = FALSE
    )
  }

  est <- matrix(0, nrow = 0, ncol = length(qq))
  for (nti in nts) {
    for (ntj in nts) {
      if (nti != ntj) {
        errs <- trans[paste0(nti, "2", ntj), ]
        tot <- colSums(trans[paste0(nti, "2", nts), ])
        # 1 pseudocount for each err; tot = 0 yields an infinite value, which
        # is recoded to NA so loess drops that quality score.
        rlogp <- log10((errs + 1) / tot)
        rlogp[is.infinite(rlogp)] <- NA
        df <- data.frame(q = qq, errs = errs, tot = tot, rlogp = rlogp)

        mod.lo <- loess(rlogp ~ q, df, weights = log10(tot), span = 2)

        pred <- predict(mod.lo, qq)
        # Carry the first/last non-NA prediction outwards over any NA ends.
        maxrli <- max(which(!is.na(pred)))
        minrli <- min(which(!is.na(pred)))
        pred[seq_along(pred) > maxrli] <- pred[[maxrli]]
        pred[seq_along(pred) < minrli] <- pred[[minrli]]
        est <- rbind(est, 10^pred)
      }
    }
  }

  # HACKY: clamp the fitted rates into a fixed range.
  MAX_ERROR_RATE <- 0.25
  MIN_ERROR_RATE <- 1e-7
  est[est > MAX_ERROR_RATE] <- MAX_ERROR_RATE
  est[est < MIN_ERROR_RATE] <- MIN_ERROR_RATE

  # Enforce monotonicity (https://github.com/benjjneb/dada2/issues/791): no
  # rate may fall below the rate in the reference column of the same row.
  # This is the base-R equivalent of the former dplyr
  # `mutate_all(funs(case_when(. < X40 ~ X40, . >= X40 ~ .)))`, where `X40`
  # was the 40th column of the unnamed `est` matrix (a position, not a name).
  # NOTE(review): with colnames "0".."40" the 40th column is Q39, not Q40 --
  # confirm which one was intended.
  est[] <- pmax(est, est[, ref_col])

  # Expand the err matrix with the self-transition probs
  err <- rbind(
    1 - colSums(est[1:3, ]), est[1:3, ],
    est[4, ], 1 - colSums(est[4:6, ]), est[5:6, ],
    est[7:8, ], 1 - colSums(est[7:9, ]), est[9, ],
    est[10:12, ], 1 - colSums(est[10:12, ])
  )
  rownames(err) <- paste0(rep(nts, each = 4), "2", nts)
  colnames(err) <- colnames(trans)
  err
}

# Alternative DADA2 error-estimation function, variant 2: for each of the 12
# substitution types, fit a loess curve of log10(error rate) against quality
# score, weighted by the total counts (`weights = tot`) with loess defaults
# otherwise, then clamp the rates and enforce monotonicity.
#
# Args:
#   trans: numeric matrix of transition counts. Rows are indexed by the names
#     "A2A", "A2C", ..., "T2T"; colnames must be parseable as numeric quality
#     scores. At least 40 columns are required (see the monotonicity step).
#
# Returns:
#   A 16 x ncol(trans) numeric matrix with rownames "A2A", "A2C", ..., "T2T"
#   and the colnames of `trans`. The 12 substitution rows hold the fitted,
#   clamped error rates; each self-transition row is 1 minus the sum of the
#   three substitution rows for the same original nucleotide.
loessErrfun_mod2 <- function(trans) {
  nts <- c("A", "C", "G", "T")
  qq <- as.numeric(colnames(trans))

  # The monotonicity step below floors every rate at the value found in the
  # 40th quality column, so fail early and clearly if that column is missing.
  ref_col <- 40L
  if (length(qq) < ref_col) {
    stop(
      "loessErrfun_mod2() needs at least ", ref_col,
      " quality-score columns in `trans`, got ", length(qq),
      call. = FALSE
    )
  }

  est <- matrix(0, nrow = 0, ncol = length(qq))
  for (nti in nts) {
    for (ntj in nts) {
      if (nti != ntj) {
        errs <- trans[paste0(nti, "2", ntj), ]
        tot <- colSums(trans[paste0(nti, "2", nts), ])
        # 1 pseudocount for each err; tot = 0 yields an infinite value, which
        # is recoded to NA so loess drops that quality score.
        rlogp <- log10((errs + 1) / tot)
        rlogp[is.infinite(rlogp)] <- NA
        df <- data.frame(q = qq, errs = errs, tot = tot, rlogp = rlogp)

        mod.lo <- loess(rlogp ~ q, df, weights = tot)

        pred <- predict(mod.lo, qq)
        # Carry the first/last non-NA prediction outwards over any NA ends.
        maxrli <- max(which(!is.na(pred)))
        minrli <- min(which(!is.na(pred)))
        pred[seq_along(pred) > maxrli] <- pred[[maxrli]]
        pred[seq_along(pred) < minrli] <- pred[[minrli]]
        est <- rbind(est, 10^pred)
      }
    }
  }

  # HACKY: clamp the fitted rates into a fixed range.
  MAX_ERROR_RATE <- 0.25
  MIN_ERROR_RATE <- 1e-7
  est[est > MAX_ERROR_RATE] <- MAX_ERROR_RATE
  est[est < MIN_ERROR_RATE] <- MIN_ERROR_RATE

  # Enforce monotonicity (https://github.com/benjjneb/dada2/issues/791): no
  # rate may fall below the rate in the reference column of the same row.
  # This is the base-R equivalent of the former dplyr
  # `mutate_all(funs(case_when(. < X40 ~ X40, . >= X40 ~ .)))`, where `X40`
  # was the 40th column of the unnamed `est` matrix (a position, not a name).
  # NOTE(review): with colnames "0".."40" the 40th column is Q39, not Q40 --
  # confirm which one was intended.
  est[] <- pmax(est, est[, ref_col])

  # Expand the err matrix with the self-transition probs
  err <- rbind(
    1 - colSums(est[1:3, ]), est[1:3, ],
    est[4, ], 1 - colSums(est[4:6, ]), est[5:6, ],
    est[7:8, ], 1 - colSums(est[7:9, ]), est[9, ],
    est[10:12, ], 1 - colSums(est[10:12, ])
  )
  rownames(err) <- paste0(rep(nts, each = 4), "2", nts)
  colnames(err) <- colnames(trans)
  err
}

# Alternative DADA2 error-estimation function, variant 3: for each of the 12
# substitution types, fit a loess curve of log10(error rate) against quality
# score. Only the weights are changed (to log10(total counts)); the span and
# degree stay at the loess defaults.
#
# Args:
#   trans: numeric matrix of transition counts. Rows are indexed by the names
#     "A2A", "A2C", ..., "T2T"; colnames must be parseable as numeric quality
#     scores. At least 40 columns are required (see the monotonicity step).
#
# Returns:
#   A 16 x ncol(trans) numeric matrix with rownames "A2A", "A2C", ..., "T2T"
#   and the colnames of `trans`. The 12 substitution rows hold the fitted,
#   clamped error rates; each self-transition row is 1 minus the sum of the
#   three substitution rows for the same original nucleotide.
loessErrfun_mod3 <- function(trans) {
  nts <- c("A", "C", "G", "T")
  qq <- as.numeric(colnames(trans))

  # The monotonicity step below floors every rate at the value found in the
  # 40th quality column, so fail early and clearly if that column is missing.
  ref_col <- 40L
  if (length(qq) < ref_col) {
    stop(
      "loessErrfun_mod3() needs at least ", ref_col,
      " quality-score columns in `trans`, got ", length(qq),
      call. = FALSE
    )
  }

  est <- matrix(0, nrow = 0, ncol = length(qq))
  for (nti in nts) {
    for (ntj in nts) {
      if (nti != ntj) {
        errs <- trans[paste0(nti, "2", ntj), ]
        tot <- colSums(trans[paste0(nti, "2", nts), ])
        # 1 pseudocount for each err; tot = 0 yields an infinite value, which
        # is recoded to NA so loess drops that quality score.
        rlogp <- log10((errs + 1) / tot)
        rlogp[is.infinite(rlogp)] <- NA
        df <- data.frame(q = qq, errs = errs, tot = tot, rlogp = rlogp)

        # only change the weights
        mod.lo <- loess(rlogp ~ q, df, weights = log10(tot))

        pred <- predict(mod.lo, qq)
        # Carry the first/last non-NA prediction outwards over any NA ends.
        maxrli <- max(which(!is.na(pred)))
        minrli <- min(which(!is.na(pred)))
        pred[seq_along(pred) > maxrli] <- pred[[maxrli]]
        pred[seq_along(pred) < minrli] <- pred[[minrli]]
        est <- rbind(est, 10^pred)
      }
    }
  }

  # HACKY: clamp the fitted rates into a fixed range.
  MAX_ERROR_RATE <- 0.25
  MIN_ERROR_RATE <- 1e-7
  est[est > MAX_ERROR_RATE] <- MAX_ERROR_RATE
  est[est < MIN_ERROR_RATE] <- MIN_ERROR_RATE

  # Enforce monotonicity (https://github.com/benjjneb/dada2/issues/791): no
  # rate may fall below the rate in the reference column of the same row.
  # This is the base-R equivalent of the former dplyr
  # `mutate_all(funs(case_when(. < X40 ~ X40, . >= X40 ~ .)))`, where `X40`
  # was the 40th column of the unnamed `est` matrix (a position, not a name).
  # NOTE(review): with colnames "0".."40" the 40th column is Q39, not Q40 --
  # confirm which one was intended.
  est[] <- pmax(est, est[, ref_col])

  # Expand the err matrix with the self-transition probs
  err <- rbind(
    1 - colSums(est[1:3, ]), est[1:3, ],
    est[4, ], 1 - colSums(est[4:6, ]), est[5:6, ],
    est[7:8, ], 1 - colSums(est[7:9, ]), est[9, ],
    est[10:12, ], 1 - colSums(est[10:12, ])
  )
  rownames(err) <- paste0(rep(nts, each = 4), "2", nts)
  colnames(err) <- colnames(trans)
  err
}

# Alternative DADA2 error-estimation function, variant 4: for each of the 12
# substitution types, fit a loess curve of log10(error rate) against quality
# score, weighted by log10(total counts) with degree = 1 and span = 0.95
# (jonalim's solution, https://github.com/benjjneb/dada2/issues/938).
#
# NOTE(review): loessErrfun_mod4 is defined a second time further down in this
# file; when the file is sourced, the later definition is the one in effect.
#
# Args:
#   trans: numeric matrix of transition counts. Rows are indexed by the names
#     "A2A", "A2C", ..., "T2T"; colnames must be parseable as numeric quality
#     scores. At least 40 columns are required (see the monotonicity step).
#
# Returns:
#   A 16 x ncol(trans) numeric matrix with rownames "A2A", "A2C", ..., "T2T"
#   and the colnames of `trans`. The 12 substitution rows hold the fitted,
#   clamped error rates; each self-transition row is 1 minus the sum of the
#   three substitution rows for the same original nucleotide.
loessErrfun_mod4 <- function(trans) {
  nts <- c("A", "C", "G", "T")
  qq <- as.numeric(colnames(trans))

  # The monotonicity step below floors every rate at the value found in the
  # 40th quality column, so fail early and clearly if that column is missing.
  ref_col <- 40L
  if (length(qq) < ref_col) {
    stop(
      "loessErrfun_mod4() needs at least ", ref_col,
      " quality-score columns in `trans`, got ", length(qq),
      call. = FALSE
    )
  }

  est <- matrix(0, nrow = 0, ncol = length(qq))
  for (nti in nts) {
    for (ntj in nts) {
      if (nti != ntj) {
        errs <- trans[paste0(nti, "2", ntj), ]
        tot <- colSums(trans[paste0(nti, "2", nts), ])
        # 1 pseudocount for each err; tot = 0 yields an infinite value, which
        # is recoded to NA so loess drops that quality score.
        rlogp <- log10((errs + 1) / tot)
        rlogp[is.infinite(rlogp)] <- NA
        df <- data.frame(q = qq, errs = errs, tot = tot, rlogp = rlogp)

        mod.lo <- loess(
          rlogp ~ q, df,
          weights = log10(tot), degree = 1, span = 0.95
        )

        pred <- predict(mod.lo, qq)
        # Carry the first/last non-NA prediction outwards over any NA ends.
        maxrli <- max(which(!is.na(pred)))
        minrli <- min(which(!is.na(pred)))
        pred[seq_along(pred) > maxrli] <- pred[[maxrli]]
        pred[seq_along(pred) < minrli] <- pred[[minrli]]
        est <- rbind(est, 10^pred)
      }
    }
  }

  # HACKY: clamp the fitted rates into a fixed range.
  MAX_ERROR_RATE <- 0.25
  MIN_ERROR_RATE <- 1e-7
  est[est > MAX_ERROR_RATE] <- MAX_ERROR_RATE
  est[est < MIN_ERROR_RATE] <- MIN_ERROR_RATE

  # Enforce monotonicity (https://github.com/benjjneb/dada2/issues/791): no
  # rate may fall below the rate in the reference column of the same row.
  # This is the base-R equivalent of the former dplyr
  # `mutate_all(funs(case_when(. < X40 ~ X40, . >= X40 ~ .)))`, where `X40`
  # was the 40th column of the unnamed `est` matrix (a position, not a name).
  # NOTE(review): with colnames "0".."40" the 40th column is Q39, not Q40 --
  # confirm which one was intended.
  est[] <- pmax(est, est[, ref_col])

  # Expand the err matrix with the self-transition probs
  err <- rbind(
    1 - colSums(est[1:3, ]), est[1:3, ],
    est[4, ], 1 - colSums(est[4:6, ]), est[5:6, ],
    est[7:8, ], 1 - colSums(est[7:9, ]), est[9, ],
    est[10:12, ], 1 - colSums(est[10:12, ])
  )
  rownames(err) <- paste0(rep(nts, each = 4), "2", nts)
  colnames(err) <- colnames(trans)
  err
}

# NOTE(review): this redefines loessErrfun_mod4, which is already defined
# earlier in this file; being the later definition, this is the one in effect
# after the file is sourced. Consider removing one of the two copies.
#
# Alternative DADA2 error-estimation function, variant 4: for each of the 12
# substitution types, fit a loess curve of log10(error rate) against quality
# score, weighted by log10(total counts) with degree = 1 and span = 0.95
# (jonalim's solution, https://github.com/benjjneb/dada2/issues/938).
#
# Args:
#   trans: numeric matrix of transition counts. Rows are indexed by the names
#     "A2A", "A2C", ..., "T2T"; colnames must be parseable as numeric quality
#     scores. At least 40 columns are required (see the monotonicity step).
#
# Returns:
#   A 16 x ncol(trans) numeric matrix with rownames "A2A", "A2C", ..., "T2T"
#   and the colnames of `trans`. The 12 substitution rows hold the fitted,
#   clamped error rates; each self-transition row is 1 minus the sum of the
#   three substitution rows for the same original nucleotide.
loessErrfun_mod4 <- function(trans) {
  nts <- c("A", "C", "G", "T")
  qq <- as.numeric(colnames(trans))

  # The monotonicity step below floors every rate at the value found in the
  # 40th quality column, so fail early and clearly if that column is missing.
  ref_col <- 40L
  if (length(qq) < ref_col) {
    stop(
      "loessErrfun_mod4() needs at least ", ref_col,
      " quality-score columns in `trans`, got ", length(qq),
      call. = FALSE
    )
  }

  est <- matrix(0, nrow = 0, ncol = length(qq))
  for (nti in nts) {
    for (ntj in nts) {
      if (nti != ntj) {
        errs <- trans[paste0(nti, "2", ntj), ]
        tot <- colSums(trans[paste0(nti, "2", nts), ])
        # 1 pseudocount for each err; tot = 0 yields an infinite value, which
        # is recoded to NA so loess drops that quality score.
        rlogp <- log10((errs + 1) / tot)
        rlogp[is.infinite(rlogp)] <- NA
        df <- data.frame(q = qq, errs = errs, tot = tot, rlogp = rlogp)

        mod.lo <- loess(
          rlogp ~ q, df,
          weights = log10(tot), degree = 1, span = 0.95
        )

        pred <- predict(mod.lo, qq)
        # Carry the first/last non-NA prediction outwards over any NA ends.
        maxrli <- max(which(!is.na(pred)))
        minrli <- min(which(!is.na(pred)))
        pred[seq_along(pred) > maxrli] <- pred[[maxrli]]
        pred[seq_along(pred) < minrli] <- pred[[minrli]]
        est <- rbind(est, 10^pred)
      }
    }
  }

  # HACKY: clamp the fitted rates into a fixed range.
  MAX_ERROR_RATE <- 0.25
  MIN_ERROR_RATE <- 1e-7
  est[est > MAX_ERROR_RATE] <- MAX_ERROR_RATE
  est[est < MIN_ERROR_RATE] <- MIN_ERROR_RATE

  # Enforce monotonicity (https://github.com/benjjneb/dada2/issues/791): no
  # rate may fall below the rate in the reference column of the same row.
  # This is the base-R equivalent of the former dplyr
  # `mutate_all(funs(case_when(. < X40 ~ X40, . >= X40 ~ .)))`, where `X40`
  # was the 40th column of the unnamed `est` matrix (a position, not a name).
  # NOTE(review): with colnames "0".."40" the 40th column is Q39, not Q40 --
  # confirm which one was intended.
  est[] <- pmax(est, est[, ref_col])

  # Expand the err matrix with the self-transition probs
  err <- rbind(
    1 - colSums(est[1:3, ]), est[1:3, ],
    est[4, ], 1 - colSums(est[4:6, ]), est[5:6, ],
    est[7:8, ], 1 - colSums(est[7:9, ]), est[9, ],
    est[10:12, ], 1 - colSums(est[10:12, ])
  )
  rownames(err) <- paste0(rep(nts, each = 4), "2", nts)
  colnames(err) <- colnames(trans)
  err
}

# Check what the mod4 error model looks like by learning errors with it.
# This runs at the top level whenever the file is sourced and reads `filtFs`,
# which is not defined in this file, so it is only attempted when the caller
# has already created `filtFs`; otherwise sourcing continues with a message
# instead of aborting.
# NOTE(review): `filtFs` is presumably the set of filtered forward-read files
# and `learnErrors` the DADA2 function -- confirm against the calling script.
if (exists("filtFs")) {
  errF_4 <- learnErrors(
    filtFs,
    multithread = TRUE,
    nbases = 1e10,
    errorEstimationFunction = loessErrfun_mod4,
    verbose = TRUE
  )
} else {
  message(
    "`filtFs` is not defined; skipping the loessErrfun_mod4 learnErrors() check"
  )
}

0 comments on commit e867f49

Please sign in to comment.