brendanf · Sep 26, 2024
diff --git a/‎.github/workflows/check-bioc.yml
+13-5 b/‎.github/workflows/check-bioc.yml
+13-5
diff --git a/‎DESCRIPTION
+4-5 b/‎DESCRIPTION
+4-5
diff --git a/‎NAMESPACE
+14 b/‎NAMESPACE
+14
diff --git a/‎NEWS.md
+6 b/‎NEWS.md
+6
diff --git a/‎R/amplicon_extract.R
+335 b/‎R/amplicon_extract.R
+335
diff --git a/‎R/lsux.R
+116-97 b/‎R/lsux.R
+116-97
diff --git a/‎R/util.R
+3 b/‎R/util.R
+3
diff --git a/‎man/find_amplicon.Rd
+61 b/‎man/find_amplicon.Rd
+61
diff --git a/‎man/find_primer.Rd
+33 b/‎man/find_primer.Rd
+33
diff --git a/‎man/gap_fill.Rd
+26 b/‎man/gap_fill.Rd
+26
diff --git a/‎man/itsx_result.Rd
+1-1 b/‎man/itsx_result.Rd
+1-1
diff --git a/‎man/lsux.Rd
+28-30 b/‎man/lsux.Rd
+28-30
diff --git a/‎man/merge_5_8S.Rd
+6-6 b/‎man/merge_5_8S.Rd
+6-6
diff --git a/‎man/truncate_alignment.Rd
+15-11 b/‎man/truncate_alignment.Rd
+15-11
diff --git a/‎tests/testthat/test-repair_ss.R
+1-1 b/‎tests/testthat/test-repair_ss.R
+1-1
@@ -54,7 +54,6 @@ jobs:
         config:
           - { os: ubuntu-latest, r: '4.3', bioc: '3.18', cont: "bioconductor/bioconductor_docker:RELEASE_3_18", rspm: "https://packagemanager.rstudio.com/cran/__linux__/jammy/latest" }
           - { os: macOS-latest, r: '4.3', bioc: '3.18'}
-          - { os: windows-latest, r: '4.3', bioc: '3.18'}
           ## Check https://github.com/r-lib/actions/tree/master/examples
           ## for examples using the http-user-agent
     env:
@@ -143,6 +142,10 @@ jobs:
           ## Required for tcltk
           brew install xquartz --cask
 
+          ## install infernal
+          brew tap brewsci/bio
+          brew install infernal
+
       - name: Install Windows system dependencies
         if: runner.os == 'Windows'
         run: |
@@ -174,14 +177,19 @@ jobs:
           install.packages(c("rcmdcheck", "BiocCheck"), repos = BiocManager::repositories())
 
           ## Pass #1 at installing dependencies
+          ## This pass uses AnVIL-powered fast binaries
+          ## details at https://github.com/nturaga/bioc2021-bioconductor-binaries
+          ## The speed gains only apply to the docker builds.
           message(paste('****', Sys.time(), 'pass number 1 at installing dependencies: local dependencies ****'))
-          remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = FALSE, upgrade = TRUE)
+          remotes::install_local(dependencies = TRUE, repos = gha_repos, build_vignettes = FALSE, upgrade = TRUE)
         continue-on-error: true
         shell: Rscript {0}
 
       - name: Install dependencies pass 2
         run: |
           ## Pass #2 at installing dependencies
+          ## This pass does not use AnVIL and will thus update any packages
+          ## that have since been updated in Bioconductor
           message(paste('****', Sys.time(), 'pass number 2 at installing dependencies: any remaining dependencies ****'))
           remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = TRUE, upgrade = TRUE, force = TRUE)
         shell: Rscript {0}
@@ -260,7 +268,7 @@ jobs:
         run: R CMD INSTALL .
 
       - name: Build pkgdown site
-        if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux'
+        if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux'
         run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
         shell: Rscript {0}
         ## Note that you need to run pkgdown::deploy_to_branch(new_process = FALSE)
@@ -269,12 +277,12 @@ jobs:
         ## makes the git history recognizable by pkgdown.
 
       - name: Install deploy dependencies
-        if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux'
+        if: github.ref == 'refs/heads/{{pkgdown_covr_branch}}' && env.run_pkgdown == 'true' && runner.os == 'Linux'
         run: |
           apt-get update && apt-get -y install rsync
 
       - name: Deploy pkgdown site to GitHub pages 🚀
-        if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux'
+        if: github.ref == 'refs/heads/{{pkgdown_covr_branch}}' && env.run_pkgdown == 'true' && runner.os == 'Linux'
         uses: JamesIves/github-pages-deploy-action@releases/v4
         with:
           clean: false
 
@@ -19,26 +19,24 @@ Imports:
     ShortRead,
     dplyr,
     futile.logger,
-    inferrnal (>= 0.99.7),
+    inferrnal (>= 0.99.8),
     IRanges,
     methods,
     purrr,
-    readr,
     rlang,
     stats,
     stringr,
     stringi,
     tibble,
     tidyr,
-    tidyselect,
-    utils
+    tidyselect
 Suggests:
     rITSx (>= 0.0.5),
     knitr,
     rmarkdown,
     covr,
     testthat (>= 2.1.0)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.2
 biocViews:
     SequenceMatching,
     ThirdPartyClient,
@@ -55,3 +53,4 @@ Remotes:
     brendanf/rITSx
 Depends: 
     R (>= 4.1)
+Roxygen: list(markdown = TRUE)
@@ -1,5 +1,19 @@
 # Generated by roxygen2: do not edit by hand
 
+S3method(extract_LSU,MultipleAlignment)
+S3method(extract_LSU,character)
+S3method(extract_rf_region,XString)
+S3method(extract_rf_region,character)
+S3method(map_position,MultipleAlignment)
+S3method(map_position,character)
+S3method(protect_names,ShortRead)
+S3method(protect_names,character)
+S3method(protect_names,default)
+S3method(repair_unmatched_secondary_structure,BString)
+S3method(repair_unmatched_secondary_structure,character)
+export(find_amplicon)
+export(find_primer)
+export(gap_fill)
 export(lsux)
 export(merge_5_8S)
 export(repair_unmatched_secondary_structure)
 
@@ -1,6 +1,12 @@
 # Development version
 
 * Added tests for most functionality.
+* Reimplemented `truncate_alignment()` using `narrow()` method for Stockholm
+  alignments from `inferrnal` 0.99.8.  It now handles interleaved Stockholm
+  files.
+* Added `find_amplicon()` to locate and mark or extract the region defined by a
+  primer pair. Also exported are helper functions `find_primer()` and
+  `gap_fill()`.
 
 # LSUx 0.99.6
 
 
@@ -0,0 +1,335 @@
+#' Translate a range from ungapped to gapped sequence coordinates
+#'
+#' @param rstart
+#' (`integer` scalar)
+#' first position of the range in the ungapped sequence
+#' @param rend
+#' (`integer` scalar)
+#' last position of the range in the ungapped sequence
+#' @param gaps
+#' ([`IRanges`][IRanges::IRanges-constructor] object)
+#' the locations of gaps in the gapped sequence.
+#'
+#' @return named `integer` of length two, with elements `"start"` and `"end"`,
+#' giving the coordinates in the gapped sequence which correspond to
+#' `rstart` and `rend` in the ungapped sequence.
+#' @export
+gap_fill <- function(rstart, rend, gaps) {
+  # total gap width up to and including each gap
+  cum_width <- cumsum(gaps@width)
+  # gapped end position of each gap minus all gap characters so far, i.e. the
+  # ungapped coordinate of the last residue preceding the end of that gap
+  residues_before <- (gaps@start + gaps@width - 1L) - cum_width
+  shift_start <- 0L
+  shift_end <- 0L
+  # NOTE(review): the scan stops at the first gap at or beyond `rend`, so
+  # `gaps` is presumably expected in increasing order -- confirm with callers
+  for (k in seq_along(gaps)) {
+    if (residues_before[k] < rstart) {
+      shift_start <- cum_width[k]
+    }
+    if (residues_before[k] >= rend) {
+      break
+    }
+    shift_end <- cum_width[k]
+  }
+  c(
+    start = shift_start + rstart,
+    end = shift_end + rend
+  )
+}
+
+#' Find the best location for a primer sequence in a gappy alignment
+#'
+#' This is a relatively simple algorithm which aligns the primer sequence to
+#' each sequence in the alignment, maps the aligned locations into the
+#' alignment, and takes the most frequent location. A warning is issued if more
+#' than 10% of the sequences in the alignment have non-consensus primer
+#' positions; a frequent cause of this is if some of the sequences are
+#' fragmentary and do not actually include the primer site, but in any case it
+#' is recommended to manually check results.
+#'
+#' @param ungapped
+#' ([`DNAStringSet`][Biostrings::XStringSet-class] object)
+#' The unaligned, gap-free sequences in the alignment.
+#' @param gaps
+#' (`list` of [`IRanges`][IRanges::IRanges-class] objects)
+#' The locations of gaps in each sequence of `ungapped` when aligned.
+#' @param primer ([`DNAString`][Biostrings::XString-class] object)
+#' The primer sequence to search for. It should be in the orientation which
+#' matches the sequences in the alignment. (I.e., reverse primer sequences
+#' should be reverse complemented prior to calling `find_primer()`.)
+#'
+#' @return named `numeric` of length three: elements `start` and `end` give
+#' the most frequent (best fit) range for the primer in the alignment, and
+#' `score` is the highest pairwise alignment score obtained for the primer
+#' over all sequences.
+#' @export
+find_primer <- function(ungapped, gaps, primer) {
+  aln <- Biostrings::pairwiseAlignment(
+    ungapped,
+    primer,
+    type = "local-global",
+    substitutionMatrix = Biostrings::nucleotideSubstitutionMatrix()
+  )
+  # range of each sequence which was aligned to the primer, in ungapped
+  # coordinates
+  hit_start <- aln@pattern@range@start
+  hit_end <- hit_start + aln@pattern@range@width - 1L
+  # one column per sequence; row 1 is the "start" and row 2 the "end"
+  # returned by gap_fill()
+  result <- mapply(gap_fill, hit_start, hit_end, gaps)
+  start <- result[1, ]
+  end <- result[2, ]
+  bestscore <- max(aln@score)
+  # the consensus location is the most frequent start and the most frequent end
+  beststart <- as.integer(names(which.max(table(start))))
+  bestend <- as.integer(names(which.max(table(end))))
+  startmismatch <- start != beststart
+  endmismatch <- end != bestend
+  if (sum(startmismatch | endmismatch) > 0.1 * length(start)) {
+    warning("more than 10% of sequences in reference alignment have variant\n",
+            "locations for primer ", as.character(primer))
+  }
+  c(start = beststart, end = bestend, score = bestscore)
+}
+
+# Coerce the supported kinds of alignment input to a Stockholm multiple
+# alignment object.
+#
+# aln:  a Stockholm DNA/RNA multiple alignment (returned unchanged), a readable
+#       connection or the name of an existing readable file (read with
+#       inferrnal::read_stockholm_msa()), a character vector of equal-width
+#       aligned sequences, or a DNA/RNA StringSet or MultipleAlignment.
+# name: label for the argument, used only in the error message.
+#
+# Stops with an error for any other kind of input.
+as_StockholmMSA <- function(aln, name = "aln") {
+  if (methods::is(aln, "StockholmDNAMultipleAlignment") ||
+      methods::is(aln, "StockholmRNAMultipleAlignment")) {
+    # already in the target representation
+    aln
+  } else if (methods::is(aln, "connection")) {
+    assertthat::assert_that(summary(aln)[["can read"]] == "yes")
+    inferrnal::read_stockholm_msa(aln)
+  } else if (is.character(aln) &&
+             length(aln) == 1 &&
+             # is.readable() asserts that the path exists, so test existence
+             # first; otherwise a one-sequence alignment given as a string
+             # would raise an error here instead of reaching the next branch
+             file.exists(aln) &&
+             assertthat::is.readable(aln)) {
+    inferrnal::read_stockholm_msa(aln)
+  } else if (is.character(aln)) {
+    # alignment given as character strings: all rows must have equal width
+    assertthat::assert_that(
+      dplyr::n_distinct(nchar(aln)) == 1
+    )
+    # try RNA first; if the sequences are not valid RNA fall back to DNA
+    tryCatch(
+      inferrnal::StockholmRNAMultipleAlignment(aln),
+      error = function(e) inferrnal::StockholmDNAMultipleAlignment(aln)
+    )
+  } else if (methods::is(aln, "DNAStringSet") ||
+             methods::is(aln, "DNAMultipleAlignment")) {
+    inferrnal::StockholmDNAMultipleAlignment(aln)
+  } else if (methods::is(aln, "RNAStringSet") ||
+             methods::is(aln, "RNAMultipleAlignment")) {
+    inferrnal::StockholmRNAMultipleAlignment(aln)
+  } else {
+    stop(
+      "'", name,
+      "' should be a connection, a filename, or a DNA or RNA alignment"
+    )
+  }
+}
+
+# Mark a column range in the RF (reference) line of a Stockholm alignment.
+#
+# aln:        alignment object with `GC` and `unmasked` slots
+# start, end: first and last alignment column of the range to mark
+# mark_char:  single character written at the marked positions
+#
+# If the alignment has no RF line, one is first built from the alignment
+# consensus. Only RF positions which are not "." and which fall inside
+# `start`..`end` are overwritten with `mark_char`. Returns the modified
+# alignment.
+mark_ref_line <- function(aln, start, end, mark_char) {
+  # ensure there is a valid reference line in the alignment
+  if (!"RF" %in% names(aln@GC)) {
+    # make a reference line out of the alignment consensus
+    aln@GC$RF <- chartr(".", "-", aln@unmasked) |>
+      Biostrings::consensusString() |>
+      chartr(old = "-", new = ".")
+  } else if (grepl(mark_char, aln@GC$RF, fixed = TRUE)) {
+    # fixed = TRUE: the mark is a literal character, not a regular
+    # expression (characters such as "{" or "." are regex metacharacters)
+    warning("reference line of supplied alignment already includes character ",
+            shQuote(mark_char), ".")
+  }
+  # find non-gap positions in the reference line
+  refpos <- IRanges::gaps(
+    Biostrings::matchPattern(".", aln@GC$RF),
+    start = 1,
+    end = Biostrings::nchar(aln@GC$RF)
+  )
+
+  # restrict the non-gap positions to the part which overlaps the range
+  mark_refpos <- IRanges::findOverlapPairs(
+    refpos,
+    IRanges::IRanges(start = start, end = end)
+  )
+  mark_refpos <- IRanges::pintersect(mark_refpos)
+
+  # overwrite each overlapping stretch with a run of the mark character of
+  # the same width
+  aln@GC$RF <- Biostrings::replaceAt(
+    aln@GC$RF,
+    mark_refpos,
+    strrep(mark_char, mark_refpos@width)
+  )
+  aln
+}
+
+#' Find the location of an amplicon in an alignment
+#'
+#' @param aln
+#' (`connection`, `character` string giving a file name in Stockholm format,
+#' [`DNAMultipleAlignment`][Biostrings::MultipleAlignment-class],
+#' [`RNAMultipleAlignment`][Biostrings::MultipleAlignment-class],
+#' [`DNAStringSet`][Biostrings::XStringSet-class],
+#' [`RNAStringSet`][Biostrings::XStringSet-class],
+#' [`StockholmMultipleAlignment`][inferrnal::StockholmMultipleAlignment-class],
+#' or `character` vector)
+#' DNA or RNA multiple alignment in which to search for an amplicon.
+#' @param fwd_primer
+#' (`character` string or [`DNAString`][Biostrings::XString-class])
+#' Forward primer sequence to define the target amplicon.
+#' @param rev_primer
+#' (`character` string or [`DNAString`][Biostrings::XString-class])
+#' Reverse primer sequence to define the target amplicon. Should be given from
+#' 5' to 3' in the primer; i.e. the reverse complement of the expected sequence
+#' in the alignment.
+#' @param trim
+#' (one of `"none"`, `"retain"`, or `"remove"`)
+#' Choice of how to trim the alignment: if `"none"` (default) then the
+#' alignment is not trimmed; if `"retain"` then the alignment is trimmed to the
+#' amplicon, including the primer sites; if `"remove"` then the alignment is
+#' trimmed to the amplicon and the primer sites are also removed.
+#' @param mark
+#' (`logical` flag)
+#' If `TRUE` (default) the primer sites are marked in the alignment RF line.
+#' @param fwd_char
+#' (single `character`)
+#' Character to use for marking the forward primer location in the RF line.
+#' @param rev_char
+#' (single `character`)
+#' Character to use for marking the reverse primer location in the RF line.
+#' Must differ from `fwd_char`.
+#' @param outfile
+#' (`character` file name or [`connection`])
+#' If non-`NULL`, an output file or connection to write the result to.
+#'
+#' @return [`StockholmMultipleAlignment`][inferrnal::StockholmMultipleAlignment-class]
+#' object with modified RF line to mark the primer locations, or if `outfile` is
+#' given, `NULL` invisibly.
+#' @export
+find_amplicon <- function(aln, fwd_primer, rev_primer,
+                          trim = c("none", "retain", "remove"),
+                          mark = TRUE, fwd_char = "{",
+                          rev_char = "}", outfile = NULL) {
+  # reduce the vector of choices to one validated string; without this the
+  # scalar comparisons on `trim` below fail when the default is used
+  trim <- match.arg(trim)
+
+  aln <- as_StockholmMSA(aln)
+
+  if (is.character(fwd_primer)) fwd_primer <- Biostrings::DNAString(fwd_primer)
+  assertthat::assert_that(methods::is(fwd_primer, "DNAString"))
+  if (is.character(rev_primer)) rev_primer <- Biostrings::DNAString(rev_primer)
+  assertthat::assert_that(methods::is(rev_primer, "DNAString"))
+  # the reverse primer is supplied 5' to 3', so it is its reverse complement
+  # which is searched for in the alignment
+  rev_primer <- Biostrings::reverseComplement(rev_primer)
+
+  assertthat::assert_that(
+    assertthat::is.string(fwd_char),
+    nchar(fwd_char) == 1L
+  )
+
+  assertthat::assert_that(
+    assertthat::is.string(rev_char),
+    nchar(rev_char) == 1L
+  )
+
+  assertthat::assert_that(fwd_char != rev_char)
+
+  # find gaps in the reference alignment.  both "." and "-" are gaps
+  # (SIMPLIFY = FALSE: always keep a plain list, one element per sequence)
+  gaps <- mapply(
+    Biostrings::union,
+    Biostrings::vmatchPattern(".", aln@unmasked),
+    Biostrings::vmatchPattern("-", aln@unmasked),
+    SIMPLIFY = FALSE
+  )
+
+  # get the ungapped reference sequences: the complement of the gaps within
+  # the alignment width gives the residue ranges of each sequence
+  ungapped <-
+    lapply(gaps, Biostrings::gaps, start = 1, end = ncol(aln)) |>
+    mapply(
+      FUN = function(x, ranges) x[ranges],
+      x = aln@unmasked,
+      SIMPLIFY = FALSE
+    )
+  if (methods::is(aln, "StockholmRNAMultipleAlignment")) {
+    ungapped <- Biostrings::RNAStringSet(ungapped)
+  }
+  # the primers are DNA, so the search is always done on DNA sequences
+  ungapped <- Biostrings::DNAStringSet(ungapped)
+
+  # find the primers in the ungapped sequences, and map positions back into the
+  # alignment
+  result_fwd <- find_primer(ungapped, gaps, fwd_primer)
+  result_rev <- find_primer(ungapped, gaps, rev_primer)
+
+  if (isTRUE(mark)) {
+    # replace the RF line with the new primer line.
+    aln <- mark_ref_line(aln, result_fwd["start"], result_fwd["end"], fwd_char)
+    aln <- mark_ref_line(aln, result_rev["start"], result_rev["end"], rev_char)
+  }
+
+  if (trim == "retain") {
+    # keep everything from the first base of the forward primer to the last
+    # base of the reverse primer
+    truncate_alignment(
+      aln,
+      outfile = outfile,
+      start = result_fwd["start"],
+      stop = result_rev["end"]
+    )
+  } else if (trim == "remove") {
+    # keep only the columns strictly between the two primer sites
+    truncate_alignment(
+      aln,
+      outfile = outfile,
+      start = result_fwd["end"] + 1L,
+      stop = result_rev["start"] - 1L
+    )
+  } else if (!is.null(outfile)) {
+    inferrnal::writeStockholmMultipleAlignment(aln, outfile)
+    invisible(NULL)
+  } else {
+    aln
+  }
+}
+
+# Copy a covariance model (CM) file from `infile` to `outfile`, replacing the
+# per-position RF annotation characters with characters taken from `rf`.
+#
+# infile:  file name of a readable file, or a connection
+# outfile: file name, or a connection
+# rf:      single string; the character at position `pos` of `rf` is written
+#          to the RF field of the model line which maps to alignment
+#          position `pos`
+#
+# Stops if the model reports that it has no alignment mapping ("MAP no").
+# Connections opened here are closed on exit; connections which the caller
+# passed in already open are left open. Returns NULL invisibly.
+#
+# NOTE(review): the field numbers used below reflect the line layout this
+# code assumes for Infernal CM files -- confirm against the format spec.
+modify_cm_rf <- function(infile, outfile, rf) {
+  assertthat::assert_that(assertthat::is.string(rf))
+
+  if (assertthat::is.string(infile)) {
+    assertthat::assert_that(assertthat::is.readable(infile))
+    infile <- file(infile, open = "rt")
+    on.exit(close(infile), add = TRUE)
+  } else {
+    assertthat::assert_that(methods::is(infile, "connection"))
+    if (!isOpen(infile)) {
+      # an unopened connection would be reopened, and so re-read from the
+      # start, on every readLines() call
+      open(infile, "rt")
+      on.exit(close(infile), add = TRUE)
+    }
+  }
+
+  if (assertthat::is.string(outfile)) {
+    outfile <- file(outfile, open = "wt")
+    on.exit(close(outfile), add = TRUE)
+  } else {
+    assertthat::assert_that(methods::is(outfile, "connection"))
+    if (!isOpen(outfile)) {
+      # an unopened connection would be truncated by every writeLines() call
+      open(outfile, "wt")
+      on.exit(close(outfile), add = TRUE)
+    }
+  }
+
+  # put the (possibly modified) fields back between the original runs of
+  # spaces of the line
+  rejoin <- function(spaces, fields) {
+    paste(
+      c(spaces, fields)[order(c(seq_along(spaces), seq_along(fields)))],
+      collapse = ""
+    )
+  }
+
+  # process the file in chunks of 1000 lines
+  while (length(l <- readLines(infile, 1000L)) > 0) {
+    # make sure we have alignment mapping
+    if (any(grepl("^MAP +no", l))) {
+      stop("input CM does not have alignment mapping")
+    }
+    # if the CM didn't have an RF line before, it will when we're done with it.
+    RF_lines <- grep("^RF +no", l)
+    l[RF_lines] <- sub("no", "yes", l[RF_lines], fixed = TRUE)
+
+    # modify RF characters for CM
+    MAT_lines <- grep("\\[ +MAT[PRL] ", l)
+    for (i in MAT_lines) {
+      fields <- strsplit(trimws(l[i]), " +")[[1]]
+      spaces <- strsplit(l[i], "[^ ]+")[[1]]
+      # node type: one of MATP, MATL, MATR
+      type <- fields[2]
+      if (type %in% c("MATL", "MATP")) {
+        pos <- as.integer(fields[5])
+        fields[9] <- substr(rf, pos, pos)
+      }
+      if (type %in% c("MATR", "MATP")) {
+        pos <- as.integer(fields[6])
+        fields[10] <- substr(rf, pos, pos)
+      }
+      l[i] <- rejoin(spaces, fields)
+    }
+
+    # modify RF characters for HMM
+    HMM_lines <- grep(
+      " *[1-9]\\d* +([0-9]\\.[0-9]+ +){4}[1-9][0-9]* +. +. +.",
+      l
+    )
+    for (i in HMM_lines) {
+      fields <- strsplit(trimws(l[i]), " +")[[1]]
+      spaces <- strsplit(l[i], "[^ ]+")[[1]]
+      pos <- as.integer(fields[6])
+      fields[8] <- substr(rf, pos, pos)
+      l[i] <- rejoin(spaces, fields)
+    }
+    writeLines(l, outfile)
+  }
+  invisible(NULL)
+}
+
+# Unimplemented stub: the body is empty, so a call currently returns `NULL`.
+# NOTE(review): presumably meant to extract the amplicon region from `seqs`
+# using the alignment `aln` -- confirm the intended contract before use.
+extract_amplicon <- function(seqs, aln) {
+  
+}
@@ -160,6 +160,7 @@ protect_names <- function(seq) {
     UseMethod("protect_names")
 }
 
+#' @export
 protect_names.ShortRead <- function(seq) {
     seq_id <- as.character(ShortRead::id(seq))
     seq@id <- Biostrings::BStringSet(as.character(seq_along(seq)))
@@ -170,6 +171,7 @@ protect_names.ShortRead <- function(seq) {
     )
 }
 
+#' @export
 protect_names.default <- function(seq) {
     seq_id <- names(seq)
     names(seq) <- as.character(seq_along(seq))
@@ -180,6 +182,7 @@ protect_names.default <- function(seq) {
     )
 }
 
+#' @export
 protect_names.character <- function(seq) {
     if (length(seq) == 1 && file.exists(seq)) {
         seq <- tryCatch(
 
@@ -31,6 +31,6 @@ test_that(
     "truncate_alignment rejects bad connections",
     {
         expect_error(truncate_alignment(3, test_out, 1, 500))
-        expect_error(truncate_alignment(test_in, NULL, 1, 500))
+        expect_error(truncate_alignment(test_in, NA, 1, 500))
     }
 )
Original file line number	Diff line number	Diff line change
`@@ -160,6 +160,7 @@ protect_names <- function(seq) {`
`160`	`160`	`UseMethod("protect_names")`
`161`	`161`	`}`
`162`	`162`
	`163`	`+#' @export`
`163`	`164`	`protect_names.ShortRead <- function(seq) {`
`164`	`165`	`seq_id <- as.character(ShortRead::id(seq))`
`165`	`166`	`seq@id <- Biostrings::BStringSet(as.character(seq_along(seq)))`
`@@ -170,6 +171,7 @@ protect_names.ShortRead <- function(seq) {`
`170`	`171`	`)`
`171`	`172`	`}`
`172`	`173`
	`174`	`+#' @export`
`173`	`175`	`protect_names.default <- function(seq) {`
`174`	`176`	`seq_id <- names(seq)`
`175`	`177`	`names(seq) <- as.character(seq_along(seq))`
`@@ -180,6 +182,7 @@ protect_names.default <- function(seq) {`
`180`	`182`	`)`
`181`	`183`	`}`
`182`	`184`
	`185`	`+#' @export`
`183`	`186`	`protect_names.character <- function(seq) {`
`184`	`187`	`if (length(seq) == 1 && file.exists(seq)) {`
`185`	`188`	`seq <- tryCatch(`
Original file line number	Diff line number	Diff line change
`@@ -31,6 +31,6 @@ test_that(`
`31`	`31`	`"truncate_alignment rejects bad connections",`
`32`	`32`	`{`
`33`	`33`	`expect_error(truncate_alignment(3, test_out, 1, 500))`
`34`		`- expect_error(truncate_alignment(test_in, NULL, 1, 500))`
	`34`	`+ expect_error(truncate_alignment(test_in, NA, 1, 500))`
`35`	`35`	`}`
`36`	`36`	`)`