Skip to content

Commit bb97fcf

Browse files
authoredSep 26, 2024··
Merge pull request #7 from brendanf/amplicon_extract
Amplicon extract
2 parents 51de0bc + bd1e205 commit bb97fcf

15 files changed

+662
-156
lines changed
 

‎.github/workflows/check-bioc.yml

+13-5
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,6 @@ jobs:
5454
config:
5555
- { os: ubuntu-latest, r: '4.3', bioc: '3.18', cont: "bioconductor/bioconductor_docker:RELEASE_3_18", rspm: "https://packagemanager.rstudio.com/cran/__linux__/jammy/latest" }
5656
- { os: macOS-latest, r: '4.3', bioc: '3.18'}
57-
- { os: windows-latest, r: '4.3', bioc: '3.18'}
5857
## Check https://github.com/r-lib/actions/tree/master/examples
5958
## for examples using the http-user-agent
6059
env:
@@ -143,6 +142,10 @@ jobs:
143142
## Required for tcltk
144143
brew install xquartz --cask
145144
145+
## install infernal
146+
brew tap brewsci/bio
147+
brew install infernal
148+
146149
- name: Install Windows system dependencies
147150
if: runner.os == 'Windows'
148151
run: |
@@ -174,14 +177,19 @@ jobs:
174177
install.packages(c("rcmdcheck", "BiocCheck"), repos = BiocManager::repositories())
175178
176179
## Pass #1 at installing dependencies
180+
## This pass uses AnVIL-powered fast binaries
181+
## details at https://github.com/nturaga/bioc2021-bioconductor-binaries
182+
## The speed gains only apply to the docker builds.
177183
message(paste('****', Sys.time(), 'pass number 1 at installing dependencies: local dependencies ****'))
178-
remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = FALSE, upgrade = TRUE)
184+
remotes::install_local(dependencies = TRUE, repos = gha_repos, build_vignettes = FALSE, upgrade = TRUE)
179185
continue-on-error: true
180186
shell: Rscript {0}
181187

182188
- name: Install dependencies pass 2
183189
run: |
184190
## Pass #2 at installing dependencies
191+
## This pass does not use AnVIL and will thus update any packages
192+
## that have since been updated in Bioconductor
185193
message(paste('****', Sys.time(), 'pass number 2 at installing dependencies: any remaining dependencies ****'))
186194
remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = TRUE, upgrade = TRUE, force = TRUE)
187195
shell: Rscript {0}
@@ -260,7 +268,7 @@ jobs:
260268
run: R CMD INSTALL .
261269

262270
- name: Build pkgdown site
263-
if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux'
271+
if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux'
264272
run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
265273
shell: Rscript {0}
266274
## Note that you need to run pkgdown::deploy_to_branch(new_process = FALSE)
@@ -269,12 +277,12 @@ jobs:
269277
## makes the git history recognizable by pkgdown.
270278

271279
- name: Install deploy dependencies
272-
if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux'
280+
if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux'
273281
run: |
274282
apt-get update && apt-get -y install rsync
275283
276284
- name: Deploy pkgdown site to GitHub pages 🚀
277-
if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux'
285+
if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux'
278286
uses: JamesIves/github-pages-deploy-action@releases/v4
279287
with:
280288
clean: false

‎DESCRIPTION

+4-5
Original file line numberDiff line numberDiff line change
@@ -19,26 +19,24 @@ Imports:
1919
ShortRead,
2020
dplyr,
2121
futile.logger,
22-
inferrnal (>= 0.99.7),
22+
inferrnal (>= 0.99.8),
2323
IRanges,
2424
methods,
2525
purrr,
26-
readr,
2726
rlang,
2827
stats,
2928
stringr,
3029
stringi,
3130
tibble,
3231
tidyr,
33-
tidyselect,
34-
utils
32+
tidyselect
3533
Suggests:
3634
rITSx (>= 0.0.5),
3735
knitr,
3836
rmarkdown,
3937
covr,
4038
testthat (>= 2.1.0)
41-
RoxygenNote: 7.2.3
39+
RoxygenNote: 7.3.2
4240
biocViews:
4341
SequenceMatching,
4442
ThirdPartyClient,
@@ -55,3 +53,4 @@ Remotes:
5553
brendanf/rITSx
5654
Depends:
5755
R (>= 4.1)
56+
Roxygen: list(markdown = TRUE)

‎NAMESPACE

+14
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,19 @@
11
# Generated by roxygen2: do not edit by hand
22

3+
S3method(extract_LSU,MultipleAlignment)
4+
S3method(extract_LSU,character)
5+
S3method(extract_rf_region,XString)
6+
S3method(extract_rf_region,character)
7+
S3method(map_position,MultipleAlignment)
8+
S3method(map_position,character)
9+
S3method(protect_names,ShortRead)
10+
S3method(protect_names,character)
11+
S3method(protect_names,default)
12+
S3method(repair_unmatched_secondary_structure,BString)
13+
S3method(repair_unmatched_secondary_structure,character)
14+
export(find_amplicon)
15+
export(find_primer)
16+
export(gap_fill)
317
export(lsux)
418
export(merge_5_8S)
519
export(repair_unmatched_secondary_structure)

‎NEWS.md

+6
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
# Development version
22

33
* Added tests for most functionality.
4+
* Reimplemented `truncate_alignment()` using `narrow()` method for Stockholm
5+
alignments from `inferrnal` 0.99.8. It now handles interleaved Stockholm
6+
files.
7+
* Added `find_amplicon()` to locate and mark or extract the region defined by a
8+
primer pair. Also exported are helper functions `find_primer()` and
9+
`gap_fill()`.
410

511
# LSUx 0.99.6
612

‎R/amplicon_extract.R

+335
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,335 @@
1+
#' Convert range from ungapped sequence to gapped sequence
#'
#' Walks through `gaps` from first to last, keeping a running total of gap
#' width, and shifts `rstart` and `rend` by the total width of the gaps which
#' precede them. The walk stops at the first gap which does not precede `rend`,
#' so `gaps` must be sorted by position and non-overlapping, and `rstart`
#' should not be greater than `rend`.
#'
#' @param rstart
#' (`integer` scalar)
#' the start point of the range
#' @param rend
#' (`integer` scalar)
#' the end point of the range
#' @param gaps
#' ([`IRanges`][IRanges::IRanges-constructor] object)
#' the locations of gaps in the gapped sequence.
#'
#' @return named `integer` of length two, with elements `"start"` and `"end"`,
#' giving the coordinates in the gapped sequence which correspond to coordinates
#' `"rstart"` and `"rend"` in the ungapped sequence.
#' @examples
#' # gapped sequence "AC--GTA-C": gaps at positions 3-4 and 8
#' gaps <- IRanges::IRanges(start = c(3L, 8L), width = c(2L, 1L))
#' # residues 3 to 5 of "ACGTAC" are found at positions 5 to 7
#' gap_fill(3L, 5L, gaps)
#' @export
gap_fill <- function(rstart, rend, gaps) {
  # total width of all gaps seen so far
  gap_total <- 0L
  # total gap width preceding the start and the end of the range
  shift_start <- 0L
  shift_end <- 0L
  for (k in seq_along(gaps)) {
    gap_total <- gap_total + gaps@width[k]
    # number of non-gap characters which come before the end of this gap,
    # i.e. the ungapped coordinate immediately before the gap
    residues_before <- gaps@start[k] + gaps@width[k] - 1L - gap_total
    if (residues_before < rstart) {
      shift_start <- gap_total
    }
    if (residues_before >= rend) {
      # this gap and all later ones lie beyond the end of the range
      break
    }
    shift_end <- gap_total
  }
  c(
    start = shift_start + rstart,
    end = shift_end + rend
  )
}
39+
40+
#' Find the best location for a primer sequence in a gappy alignment
#'
#' This is a relatively simple algorithm which aligns the primer sequences to
#' each sequence in the alignment, maps the aligned locations into the alignment,
#' and takes the most frequent location. A warning is issued if more than 10% of
#' the sequences in the alignment have non-consensus primer positions; a
#' frequent cause of this is if some of the sequences are fragmentary and do not
#' actually include the primer site, but in any case it is recommended to
#' manually check results.
#'
#' @param ungapped
#' ([`DNAStringSet`][Biostrings::XStringSet-class] object)
#' The unaligned, gap-free sequences in the alignment.
#' @param gaps
#' (`list` of [`IRanges`][IRanges::IRanges-class] objects)
#' The locations of gaps in each sequence of `ungapped` when aligned.
#' @param primer ([`DNAString`][Biostrings::XString-class] object)
#' The primer sequence to search for. It should be in the orientation which
#' matches the sequences in the alignment. (I.e., reverse primer sequences
#' should be reverse complemented prior to calling `find_primer()`.)
#'
#' @return named `numeric` of length three: elements `start` and `end` give the
#' most frequent start and end position of the primer in the alignment, and
#' `score` gives the highest pairwise alignment score between the primer and any
#' of the sequences.
#' @export
find_primer <- function(ungapped, gaps, primer) {
  # align the whole primer ("global") to a part of each sequence ("local")
  aln <- Biostrings::pairwiseAlignment(
    ungapped,
    primer,
    type = "local-global",
    substitutionMatrix = Biostrings::nucleotideSubstitutionMatrix()
  )
  # map the matched range of each sequence into alignment coordinates; the
  # result has one column per sequence, with the start in row 1 and the end in
  # row 2
  result <- mapply(
    gap_fill,
    aln@pattern@range@start,
    aln@pattern@range@start + aln@pattern@range@width - 1L,
    gaps
  )
  start <- result[1, ]
  end <- result[2, ]
  bestscore <- max(aln@score)
  # the most frequent start and end positions are taken independently
  beststart <- as.integer(names(which.max(table(start))))
  bestend <- as.integer(names(which.max(table(end))))
  startmismatch <- start != beststart
  endmismatch <- end != bestend
  if (sum(startmismatch | endmismatch) > 0.1 * length(start)) {
    warning("more than 10% of sequences in reference alignment have variant\n",
            "locations for primer ", as.character(primer))
  }
  c(start = beststart, end = bestend, score = bestscore)
}
89+
90+
# Coerce any supported alignment input to a Stockholm multiple alignment.
#
# `aln` may be an existing Stockholm DNA/RNA alignment (returned unchanged), a
# readable connection or a file name (read with
# `inferrnal::read_stockholm_msa()`), a character vector of equal-width aligned
# sequences, or a DNA/RNA `XStringSet`/`MultipleAlignment`.
# `name` is only used to refer to the argument in the error message raised for
# unsupported input.
as_StockholmMSA <- function(aln, name = "aln") {
  if (methods::is(aln, "StockholmDNAMultipleAlignment") ||
      methods::is(aln, "StockholmRNAMultipleAlignment")) {
    # ok, no problem
    aln
  } else if (methods::is(aln, "connection")) {
    assertthat::assert_that(summary(aln)[["can read"]] == "yes")
    inferrnal::read_stockholm_msa(aln)
  } else if (is.character(aln) &&
             length(aln) == 1 &&
             assertthat::is.readable(aln)) {
    # NOTE(review): assertthat::is.readable() appears to raise an error, rather
    # than return FALSE, when the path does not exist; if so, a length-one
    # character alignment never reaches the next branch. Confirm this is
    # intended.
    inferrnal::read_stockholm_msa(aln)
  } else if (is.character(aln)) {
    # alignment given as character strings; all rows must have the same width
    assertthat::assert_that(
      dplyr::n_distinct(nchar(aln)) == 1
    )
    # try RNA first; if the strings are not valid RNA, fall back to DNA
    # (the fallback previously retried the same RNA constructor)
    tryCatch(
      inferrnal::StockholmRNAMultipleAlignment(aln),
      error = function(e) inferrnal::StockholmDNAMultipleAlignment(aln)
    )
  } else if (methods::is(aln, "DNAStringSet") ||
             methods::is(aln, "DNAMultipleAlignment")) {
    inferrnal::StockholmDNAMultipleAlignment(aln)
  } else if (methods::is(aln, "RNAStringSet") ||
             methods::is(aln, "RNAMultipleAlignment")) {
    inferrnal::StockholmRNAMultipleAlignment(aln)
  } else {
    stop(
      "'", name, "' should be a connection, a filename, or a DNA or RNA ",
      "alignment"
    )
  }
}
121+
122+
# Mark a range of alignment columns in the reference (RF) line of a Stockholm
# alignment.
#
# `aln` is the alignment, `start` and `end` give the column range to mark, and
# `mark_char` is the character written at every non-gap RF position inside
# that range. If the alignment has no RF line, one is first created from the
# alignment consensus. Returns the modified alignment.
mark_ref_line <- function(aln, start, end, mark_char) {
  # ensure there is a valid reference line in the alignment
  if (!"RF" %in% names(aln@GC)) {
    # make a reference line out of the alignment consensus; "." gaps are
    # temporarily recoded as "-" for the consensus calculation
    aln@GC$RF <- chartr(".", "-", aln@unmasked) |>
      Biostrings::consensusString() |>
      chartr(old = "-", new = ".")
  } else if (grepl(mark_char, aln@GC$RF, fixed = TRUE)) {
    # fixed = TRUE: the mark character is a literal, not a regular expression
    # (the default marks "{" and "}" are regex metacharacters)
    warning(
      "reference line of supplied alignment includes character ",
      shQuote(mark_char), "."
    )
  }
  # find non-gap positions in the reference line, as the complement of the "."
  # positions
  refpos <- IRanges::gaps(
    Biostrings::matchPattern(".", aln@GC$RF),
    start = 1,
    end = Biostrings::nchar(aln@GC$RF)
  )

  # restrict the non-gap positions to the requested range
  mark_refpos <- IRanges::findOverlapPairs(
    refpos,
    IRanges::IRanges(start = start, end = end)
  )
  mark_refpos <- IRanges::pintersect(mark_refpos)

  # overwrite each selected stretch with a run of `mark_char` of equal width
  aln@GC$RF <- Biostrings::replaceAt(
    aln@GC$RF,
    mark_refpos,
    strrep(mark_char, mark_refpos@width)
  )
  aln
}
156+
157+
#' Find the location of an amplicon in an alignment
#'
#' Both primers are located in the alignment with [find_primer()]. Optionally
#' the primer sites are marked in the RF line of the alignment, and optionally
#' the alignment is trimmed to the amplicon.
#'
#' @param aln
#' (`connection`, `character` string giving a file name in Stockholm format,
#' [`DNAMultipleAlignment`][Biostrings::MultipleAlignment-class],
#' [`RNAMultipleAlignment`][Biostrings::MultipleAlignment-class],
#' [`DNAStringSet`][Biostrings::XStringSet-class],
#' [`RNAStringSet`][Biostrings::XStringSet-class],
#' [`StockholmMultipleAlignment`][inferrnal::StockholmMultipleAlignment-class],
#' or `character` vector)
#' DNA or RNA multiple alignment in which to search for an amplicon.
#' @param fwd_primer
#' (`character` string or [`DNAString`][Biostrings::XString-class])
#' Forward primer sequence to define the target amplicon.
#' @param rev_primer
#' (`character` string or [`DNAString`][Biostrings::XString-class])
#' Reverse primer sequence to define the target amplicon. Should be given from
#' 5' to 3' in the primer; i.e. the reverse complement of the expected sequence
#' in the alignment.
#' @param trim
#' (one of `"none"`, `"retain"`, or `"remove"`)
#' Choice of how to trim the alignment: if `"none"` (default) then the
#' alignment is not trimmed; if `"retain"` then the alignment is trimmed to the
#' amplicon, including the primer sites; if `"remove"` then the alignment is
#' trimmed to the amplicon and the primer sites are also removed.
#' @param mark
#' (`logical` flag)
#' If `TRUE` (default) the primer sites are marked in the alignment RF line.
#' @param fwd_char
#' (single `character`)
#' Character to use for marking the forward primer location in the RF line.
#' @param rev_char
#' (single `character`)
#' Character to use for marking the reverse primer location in the RF line.
#' @param outfile
#' (`character` file name or [`connection`])
#' If non-`NULL`, an output file or connection to write the result to.
#'
#' @return [`StockholmMultipleAlignment`][inferrnal::StockholmMultipleAlignment-class]
#' object with modified RF line to mark the primer locations, or if `outfile` is
#' given, `NULL` invisibly.
#' @export
find_amplicon <- function(aln, fwd_primer, rev_primer,
                          trim = c("none", "retain", "remove"),
                          mark = TRUE, fwd_char = "{",
                          rev_char = "}", outfile = NULL) {
  # resolve the choice to a single string; without this the default is the
  # whole length-three vector and the comparisons below are not scalar
  trim <- match.arg(trim)

  aln <- as_StockholmMSA(aln)

  if (is.character(fwd_primer)) fwd_primer <- Biostrings::DNAString(fwd_primer)
  assertthat::assert_that(methods::is(fwd_primer, "DNAString"))
  if (is.character(rev_primer)) rev_primer <- Biostrings::DNAString(rev_primer)
  assertthat::assert_that(methods::is(rev_primer, "DNAString"))
  # the reverse primer is searched for in the orientation of the alignment
  rev_primer <- Biostrings::reverseComplement(rev_primer)

  assertthat::assert_that(
    assertthat::is.string(fwd_char),
    nchar(fwd_char) == 1L
  )

  assertthat::assert_that(
    assertthat::is.string(rev_char),
    nchar(rev_char) == 1L
  )

  assertthat::assert_that(fwd_char != rev_char)

  # find gaps in the reference alignment. both "." and "-" are gaps
  gaps <-
    mapply(
      Biostrings::union,
      Biostrings::vmatchPattern(".", aln@unmasked),
      Biostrings::vmatchPattern("-", aln@unmasked)
    )

  # get the ungapped reference sequences, by keeping only the non-gap ranges of
  # each aligned sequence
  ungapped <-
    lapply(gaps, Biostrings::gaps, start = 1, end = ncol(aln)) |>
    mapply(
      FUN = function(x, ranges) x[ranges],
      x = aln@unmasked
    )
  if (methods::is(aln, "StockholmRNAMultipleAlignment")) {
    ungapped <- Biostrings::RNAStringSet(ungapped)
  }
  # primers are DNA, so the search is always done against DNA
  ungapped <- Biostrings::DNAStringSet(ungapped)

  # find the primers in the ungapped sequences, and map positions back into the
  # alignment
  result_fwd <- find_primer(ungapped, gaps, fwd_primer)
  result_rev <- find_primer(ungapped, gaps, rev_primer)

  if (isTRUE(mark)) {
    # replace the RF line with the new primer line.
    aln <- mark_ref_line(aln, result_fwd["start"], result_fwd["end"], fwd_char)
    aln <- mark_ref_line(aln, result_rev["start"], result_rev["end"], rev_char)
  }
  if (trim == "retain") {
    # keep everything from the first base of the forward primer to the last
    # base of the reverse primer
    truncate_alignment(
      aln,
      outfile = outfile,
      start = result_fwd["start"],
      stop = result_rev["end"]
    )
  } else if (trim == "remove") {
    # keep only what lies strictly between the two primer sites
    truncate_alignment(
      aln,
      outfile = outfile,
      start = result_fwd["end"] + 1L,
      stop = result_rev["start"] - 1L
    )
  } else if (!is.null(outfile)) {
    inferrnal::writeStockholmMultipleAlignment(aln, outfile)
    invisible(NULL)
  } else {
    aln
  }
}
275+
276+
# Copy a covariance model (CM) file, replacing its reference (RF) annotation.
#
# `infile` is a file name or a readable connection; `outfile` is a file name or
# a writable connection; `rf` is a single string whose i-th character is the
# new RF annotation for alignment column i. Connections opened here from file
# names are closed on exit; connections supplied by the caller are left open.
# Stops with an error if the input declares that it has no alignment mapping.
# Returns `NULL` invisibly.
#
# NOTE(review): the field positions used below (map positions in fields 5/6
# and RF characters in fields 9/10 of node lines; map position in field 6 and
# RF character in field 8 of HMM match lines) presumably follow the Infernal CM
# file format -- confirm against the Infernal user guide.
modify_cm_rf <- function(infile, outfile, rf) {
  if (is.character(infile)) {
    # only a file name can be tested for readability; a connection would make
    # assertthat::is.readable() fail
    assertthat::assert_that(assertthat::is.readable(infile))
    infile <- file(infile, open = "rt")
    on.exit(close(infile), add = TRUE)
  }
  assertthat::assert_that(
    methods::is(infile, "connection"),
    assertthat::is.string(rf)
  )
  if (assertthat::is.string(outfile)) {
    outfile <- file(outfile, open = "wt")
    on.exit(close(outfile), add = TRUE)
  }
  assertthat::assert_that(
    methods::is(outfile, "connection")
  )

  # split a line into its whitespace-separated fields and the runs of spaces
  # around them, so that the original spacing can be restored after editing
  split_fields <- function(line) {
    list(
      fields = strsplit(trimws(line), " +")[[1]],
      spaces = strsplit(line, "[^ ]+")[[1]]
    )
  }
  # interleave spaces and fields again: space 1, field 1, space 2, field 2, ...
  join_fields <- function(spaces, fields) {
    interleaved <- order(c(seq_along(spaces), seq_along(fields)))
    paste(c(spaces, fields)[interleaved], collapse = "")
  }

  # process the file in chunks of 1000 lines; every edit is local to one line
  while (length(l <- readLines(infile, 1000L)) > 0) {
    # make sure we have alignment mapping
    if (any(grepl("^MAP +no", l))) {
      stop("input CM does not have alignment mapping")
    }
    # if the CM didn't have an RF line before, it will when we're done with it.
    RF_lines <- which(grepl("^RF +no", l))
    l[RF_lines] <- sub("no", "yes", l[RF_lines], fixed = TRUE)

    # modify RF characters for CM
    MAT_lines <- which(grepl("\\[ +MAT[PRL] ", l))
    for (i in MAT_lines) {
      parts <- split_fields(l[i])
      fields <- parts$fields
      type <- fields[2]
      # left-emitting nodes: field 5 is the column, field 9 its RF character
      if (type %in% c("MATL", "MATP")) {
        pos <- as.integer(fields[5])
        fields[9] <- substr(rf, pos, pos)
      }
      # right-emitting nodes: field 6 is the column, field 10 its RF character
      if (type %in% c("MATR", "MATP")) {
        pos <- as.integer(fields[6])
        fields[10] <- substr(rf, pos, pos)
      }
      l[i] <- join_fields(parts$spaces, fields)
    }

    # modify RF characters for HMM
    HMM_lines <- which(grepl(" *[1-9]\\d* +([0-9]\\.[0-9]+ +){4}[1-9][0-9]* +. +. +.", l))
    for (i in HMM_lines) {
      parts <- split_fields(l[i])
      fields <- parts$fields
      pos <- as.integer(fields[6])
      fields[8] <- substr(rf, pos, pos)
      l[i] <- join_fields(parts$spaces, fields)
    }
    writeLines(l, outfile)
  }
  invisible(NULL)
}
332+
333+
# Placeholder: not implemented yet. Neither argument is used and the result is
# always `NULL`.
extract_amplicon <- function(seqs, aln) {
  NULL
}

‎R/lsux.R

+116-97
Large diffs are not rendered by default.

‎R/util.R

+3
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ protect_names <- function(seq) {
160160
UseMethod("protect_names")
161161
}
162162

163+
#' @export
163164
protect_names.ShortRead <- function(seq) {
164165
seq_id <- as.character(ShortRead::id(seq))
165166
seq@id <- Biostrings::BStringSet(as.character(seq_along(seq)))
@@ -170,6 +171,7 @@ protect_names.ShortRead <- function(seq) {
170171
)
171172
}
172173

174+
#' @export
173175
protect_names.default <- function(seq) {
174176
seq_id <- names(seq)
175177
names(seq) <- as.character(seq_along(seq))
@@ -180,6 +182,7 @@ protect_names.default <- function(seq) {
180182
)
181183
}
182184

185+
#' @export
183186
protect_names.character <- function(seq) {
184187
if (length(seq) == 1 && file.exists(seq)) {
185188
seq <- tryCatch(

‎man/find_amplicon.Rd

+61
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎man/find_primer.Rd

+33
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎man/gap_fill.Rd

+26
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎man/itsx_result.Rd

+1-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎man/lsux.Rd

+28-30
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎man/merge_5_8S.Rd

+6-6
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎man/truncate_alignment.Rd

+15-11
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎tests/testthat/test-repair_ss.R

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,6 @@ test_that(
3131
"truncate_alignment rejects bad connections",
3232
{
3333
expect_error(truncate_alignment(3, test_out, 1, 500))
34-
expect_error(truncate_alignment(test_in, NULL, 1, 500))
34+
expect_error(truncate_alignment(test_in, NA, 1, 500))
3535
}
3636
)

0 commit comments

Comments
 (0)
Please sign in to comment.