|
#' Convert range from ungapped sequence to gapped sequence
#'
#' Walks along the gaps, keeping a running total of their widths, and shifts
#' each end of the range by the total width of the gaps which precede it.
#'
#' @param rstart
#' (`integer` scalar)
#' the start point of the range
#' @param rend
#' (`integer` scalar)
#' the end point of the range
#' @param gaps
#' ([`IRanges`][IRanges::IRanges-constructor] object)
#' the locations of gaps in the gapped sequence.
#'
#' @return named `integer` of length two, with elements `"start"` and `"end"`,
#' giving the coordinates in the gapped sequence which correspond to coordinates
#' `"rstart"` and `"rend"` in the ungapped sequence.
#' @export
gap_fill <- function(rstart, rend, gaps) {
  gap_total <- 0L
  pre_insert <- 0L
  post_insert <- 0L
  # NOTE(review): the early exit below assumes `gaps` is sorted by position
  # and non-overlapping -- confirm this holds for every caller.
  for (i in seq_along(gaps)) {
    gap_total <- gap_total + gaps@width[i]
    # number of ungapped positions which lie before the end of this gap
    insert <- gaps@start[i] + gaps@width[i] - 1L - gap_total
    if (insert < rstart) {
      pre_insert <- gap_total
    }
    if (insert >= rend) {
      # this gap lies at or beyond the end of the range; stop scanning
      break
    }
    post_insert <- gap_total
  }
  c(
    start = pre_insert + rstart,
    end = post_insert + rend
  )
}
| 39 | + |
#' Find the best location for a primer sequence in a gappy alignment
#'
#' This is a relatively simple algorithm which aligns the primer sequences to
#' each sequence in the alignment, maps the aligned locations into the alignment,
#' and takes the most frequent location. A warning is issued if more than 10% of
#' the sequences in the alignment have non-consensus primer positions; a
#' frequent cause of this is if some of the sequences are fragmentary and do not
#' actually include the primer site, but in any case it is recommended to
#' manually check results.
#'
#' @param ungapped
#' ([`DNAStringSet`][Biostrings::XStringSet-class] object)
#' The unaligned, gap-free sequences in the alignment.
#' @param gaps
#' (`list` of [`IRanges`][IRanges::IRanges-class] objects)
#' The locations of gaps in each sequence of `ungapped` when aligned.
#' @param primer ([`DNAString`][Biostrings::XString-class] object)
#' The primer sequence to search for. It should be in the orientation which
#' matches the sequences in the alignment. (I.e., reverse primer sequences
#' should be reverse complemented prior to calling `find_primer()`.)
#'
#' @return named `numeric` of length three: `start` and `end` give the most
#' frequent start and end position of the primer in the alignment, and `score`
#' gives the highest pairwise alignment score found among the sequences.
#' @export
find_primer <- function(ungapped, gaps, primer) {
  aln <- Biostrings::pairwiseAlignment(
    ungapped,
    primer,
    type = "local-global",
    substitutionMatrix = Biostrings::nucleotideSubstitutionMatrix()
  )
  # range within each ungapped sequence which aligned to the primer
  hit_start <- aln@pattern@range@start
  hit_end <- hit_start + aln@pattern@range@width - 1L
  # map each range into alignment coordinates; one column per sequence, with
  # the start in the first row and the end in the second
  mapped <- mapply(gap_fill, hit_start, hit_end, gaps)
  start <- mapped[1, ]
  end <- mapped[2, ]
  bestscore <- max(aln@score)
  # the consensus position is the most frequent one across sequences
  beststart <- as.integer(names(which.max(table(start))))
  bestend <- as.integer(names(which.max(table(end))))
  startmismatch <- start != beststart
  endmismatch <- end != bestend
  if (sum(startmismatch | endmismatch) > 0.1 * length(start)) {
    warning("more than 10% of sequences in reference alignment have variant\n",
            "locations for primer ", as.character(primer))
  }
  c(start = beststart, end = bestend, score = bestscore)
}
| 89 | + |
# Coerce `aln` to a Stockholm multiple alignment object.
#
# `aln` may be an existing Stockholm DNA/RNA alignment (returned unchanged), a
# readable connection or the name of a readable file (parsed with
# `inferrnal::read_stockholm_msa()`), a character vector of equal-width aligned
# sequences, or a DNA/RNA `XStringSet` / `MultipleAlignment`.
# `name` is only used to label the argument in the error raised for
# unsupported input.
as_StockholmMSA <- function(aln, name = "aln") {
  if (methods::is(aln, "StockholmDNAMultipleAlignment") ||
      methods::is(aln, "StockholmRNAMultipleAlignment")) {
    # ok, no problem
    aln
  } else if (methods::is(aln, "connection")) {
    assertthat::assert_that(summary(aln)[["can read"]] == "yes")
    inferrnal::read_stockholm_msa(aln)
  } else if (is.character(aln) &&
             length(aln) == 1 &&
             file.exists(aln) &&
             file.access(aln, mode = 4L) == 0L) {
    # Base file checks are used here because `assertthat::is.readable()`
    # signals an error, rather than returning `FALSE`, when the path does not
    # exist; that would stop a single aligned sequence given as a string from
    # ever reaching the next branch.
    inferrnal::read_stockholm_msa(aln)
  } else if (is.character(aln)) {
    # alignment given as character string
    assertthat::assert_that(
      dplyr::n_distinct(nchar(aln)) == 1
    )
    # try RNA first (as before); fall back to DNA if the text is not valid RNA
    tryCatch(
      inferrnal::StockholmRNAMultipleAlignment(aln),
      error = function(e) inferrnal::StockholmDNAMultipleAlignment(aln)
    )
  } else if (methods::is(aln, "DNAStringSet") ||
             methods::is(aln, "DNAMultipleAlignment")) {
    inferrnal::StockholmDNAMultipleAlignment(aln)
  } else if (methods::is(aln, "RNAStringSet") ||
             methods::is(aln, "RNAMultipleAlignment")) {
    inferrnal::StockholmRNAMultipleAlignment(aln)
  } else {
    stop("'", name, "' should be a connection, a filename, or a DNA or RNA alignment")
  }
}
| 121 | + |
# Mark alignment columns `start` to `end` in the RF (reference) line of `aln`
# with `mark_char`, leaving gap (".") positions of the RF line untouched.
# If the alignment has no RF line, one is first built from the alignment
# consensus. Returns the modified alignment.
mark_ref_line <- function(aln, start, end, mark_char) {
  # ensure there is a valid reference line in the alignment
  if (!"RF" %in% names(aln@GC)) {
    # make a reference line out of the alignment consensus
    aln@GC$RF <- chartr(".", "-", aln@unmasked) |>
      Biostrings::consensusString() |>
      chartr(old = "-", new = ".")
  } else if (grepl(mark_char, aln@GC$RF, fixed = TRUE)) {
    # `fixed = TRUE`: the mark is a literal character, not a regular
    # expression (a lone "{" is not a valid pattern)
    warning(
      "reference line of supplied alignment includes character ",
      shQuote(mark_char), "."
    )
  }
  # find non-gap positions in the reference line
  refpos <- IRanges::gaps(
    Biostrings::matchPattern(".", aln@GC$RF),
    start = 1,
    end = Biostrings::nchar(aln@GC$RF)
  )

  # find the non-gap positions which match the primers
  mark_refpos <- IRanges::findOverlapPairs(
    refpos,
    IRanges::IRanges(start = start, end = end)
  )
  mark_refpos <- IRanges::pintersect(mark_refpos)

  # overwrite each overlapping stretch with a run of `mark_char` of equal width
  aln@GC$RF <- Biostrings::replaceAt(
    aln@GC$RF,
    mark_refpos,
    c(strrep(mark_char, mark_refpos@width))
  )
  aln
}
| 156 | + |
#' Find the location of an amplicon in an alignment
#'
#' @param aln
#' (`connection`, `character` string giving a file name in Stockholm format,
#' [`DNAMultipleAlignment`][Biostrings::MultipleAlignment-class],
#' [`RNAMultipleAlignment`][Biostrings::MultipleAlignment-class],
#' [`DNAStringSet`][Biostrings::XStringSet-class],
#' [`RNAStringSet`][Biostrings::XStringSet-class],
#' [`StockholmMultipleAlignment`][inferrnal::StockholmMultipleAlignment-class],
#' or `character` vector)
#' DNA or RNA multiple alignment in which to search for an amplicon.
#' @param fwd_primer
#' (`character` string or [`DNAString`][Biostrings::XString-class])
#' Forward primer sequence to define the target amplicon.
#' @param rev_primer
#' (`character` string or [`DNAString`][Biostrings::XString-class])
#' Reverse primer sequence to define the target amplicon. Should be given from
#' 5' to 3' in the primer; i.e. the reverse complement of the expected sequence
#' in the alignment.
#' @param trim
#' (one of `"none"`, `"retain"`, or `"remove"`)
#' Choice of how to trim the alignment: if `"none"` (the default) then the
#' alignment is not trimmed; if `"retain"` then the alignment is trimmed to the
#' amplicon, including the primer sites; if `"remove"` then the alignment is
#' trimmed to the amplicon and the primer sites are also removed.
#' @param mark
#' (`logical` flag)
#' If `TRUE` (default) the primer sites are marked in the alignment RF line.
#' @param fwd_char
#' (single `character`)
#' Character to use for marking the forward primer location in the RF line.
#' @param rev_char
#' (single `character`)
#' Character to use for marking the reverse primer location in the RF line.
#' @param outfile
#' (`character` file name or [`connection`])
#' If non-`NULL`, an output file or connection to write the result to.
#'
#' @return If `trim` is `"none"`, a
#' [`StockholmMultipleAlignment`][inferrnal::StockholmMultipleAlignment-class]
#' object with modified RF line to mark the primer locations, or if `outfile` is
#' given, `NULL` invisibly. If `trim` is `"retain"` or `"remove"`, the value
#' returned by `truncate_alignment()`.
#' @export
find_amplicon <- function(aln, fwd_primer, rev_primer,
                          trim = c("none", "retain", "remove"),
                          mark = TRUE, fwd_char = "{",
                          rev_char = "}", outfile = NULL) {

  # resolve `trim` to a single choice; without this the default is a length-3
  # vector and the comparisons below cannot be used as `if` conditions
  trim <- match.arg(trim)

  aln <- as_StockholmMSA(aln)

  if (is.character(fwd_primer)) fwd_primer <- Biostrings::DNAString(fwd_primer)
  assertthat::assert_that(methods::is(fwd_primer, "DNAString"))
  if (is.character(rev_primer)) rev_primer <- Biostrings::DNAString(rev_primer)
  assertthat::assert_that(methods::is(rev_primer, "DNAString"))
  # the reverse primer is searched for in the orientation of the alignment
  rev_primer <- Biostrings::reverseComplement(rev_primer)

  assertthat::assert_that(
    assertthat::is.string(fwd_char),
    nchar(fwd_char) == 1L
  )

  assertthat::assert_that(
    assertthat::is.string(rev_char),
    nchar(rev_char) == 1L
  )

  assertthat::assert_that(fwd_char != rev_char)

  # find gaps in the reference alignment. both "." and "-" are gaps
  gaps <-
    mapply(
      Biostrings::union,
      Biostrings::vmatchPattern(".", aln@unmasked),
      Biostrings::vmatchPattern("-", aln@unmasked)
    )

  # get the ungapped reference sequences
  ungapped <-
    lapply(gaps, Biostrings::gaps, start = 1, end = ncol(aln)) |>
    mapply(
      FUN = function(x, ranges) x[ranges],
      x = aln@unmasked
    )
  if (methods::is(aln, "StockholmRNAMultipleAlignment")) {
    ungapped <- Biostrings::RNAStringSet(ungapped)
  }
  # the primers are DNA, so the sequences are searched as DNA as well
  ungapped <- Biostrings::DNAStringSet(ungapped)

  # find the primers in the ungapped sequences, and map positions back into the
  # alignment
  result_fwd <- find_primer(ungapped, gaps, fwd_primer)
  result_rev <- find_primer(ungapped, gaps, rev_primer)

  if (isTRUE(mark)) {
    # replace the RF line with the new primer line.
    aln <- mark_ref_line(aln, result_fwd["start"], result_fwd["end"], fwd_char)
    aln <- mark_ref_line(aln, result_rev["start"], result_rev["end"], rev_char)
  }
  if (trim == "retain") {
    # keep everything from the start of the forward primer site to the end of
    # the reverse primer site
    truncate_alignment(
      aln,
      outfile = outfile,
      start = result_fwd["start"],
      stop = result_rev["end"]
    )
  } else if (trim == "remove") {
    # keep only the columns strictly between the two primer sites
    truncate_alignment(
      aln,
      outfile = outfile,
      start = result_fwd["end"] + 1L,
      stop = result_rev["start"] - 1L
    )
  } else if (!is.null(outfile)) {
    inferrnal::writeStockholmMultipleAlignment(aln, outfile)
    invisible(NULL)
  } else {
    aln
  }
}
| 275 | + |
# Copy a covariance model (CM) file from `infile` to `outfile`, replacing the
# RF annotation of each CM match node and each HMM match line with the
# character of `rf` at the alignment column that node/line maps to.
#
# `infile` and `outfile` may each be a file name or a connection; `rf` is a
# single string holding the new reference line. Connections opened by this
# function are closed again when it exits; connections supplied already open
# are left open. An error is raised if the model has no alignment map
# ("MAP no") or if a mapped column lies outside `rf`. Returns `NULL` invisibly.
modify_cm_rf <- function(infile, outfile, rf) {
  if (!is.character(rf) || length(rf) != 1L || is.na(rf)) {
    stop("'rf' must be a single character string")
  }

  if (is.character(infile) && length(infile) == 1L && !is.na(infile)) {
    if (file.access(infile, mode = 4L) != 0L) {
      stop("input file '", infile, "' is not readable")
    }
    infile <- file(infile, open = "rt")
    on.exit(close(infile), add = TRUE)
  } else if (inherits(infile, "connection")) {
    # an unopened connection would be reopened (and rewound) by every
    # readLines() call, so open it once here
    if (!isOpen(infile)) {
      open(infile, open = "rt")
      on.exit(close(infile), add = TRUE)
    }
  } else {
    stop("'infile' must be a file name or a connection")
  }

  if (is.character(outfile) && length(outfile) == 1L && !is.na(outfile)) {
    outfile <- file(outfile, open = "wt")
    on.exit(close(outfile), add = TRUE)
  } else if (inherits(outfile, "connection")) {
    # likewise, an unopened connection would be truncated by every writeLines()
    if (!isOpen(outfile)) {
      open(outfile, open = "wt")
      on.exit(close(outfile), add = TRUE)
    }
  } else {
    stop("'outfile' must be a file name or a connection")
  }

  rf_width <- nchar(rf)

  # character of `rf` at alignment column `pos`
  rf_at <- function(pos) {
    pos <- suppressWarnings(as.integer(pos))
    if (is.na(pos) || pos < 1L || pos > rf_width) {
      stop("alignment column ", pos, " is outside the supplied RF line")
    }
    substr(rf, pos, pos)
  }

  # re-assemble a line from its (modified) fields, keeping the original runs
  # of spaces: the stable ordering interleaves space run 1, field 1, space
  # run 2, field 2, ...
  rebuild_line <- function(line, fields) {
    spaces <- strsplit(line, "[^ ]+")[[1]]
    paste(
      c(spaces, fields)[order(c(seq_along(spaces), seq_along(fields)))],
      collapse = ""
    )
  }

  repeat {
    l <- readLines(infile, n = 1000L)
    if (length(l) == 0L) break

    # make sure we have alignment mapping
    if (any(grepl("^MAP +no", l))) {
      stop("input CM does not have alignment mapping")
    }
    # if the CM didn't have an RF line before, it will when we're done with it.
    RF_lines <- which(grepl("^RF +no", l))
    l[RF_lines] <- sub("no", "yes", l[RF_lines], fixed = TRUE)

    # modify RF characters for CM
    MAT_lines <- which(grepl("\\[ +MAT[PRL] ", l))
    for (i in MAT_lines) {
      fields <- strsplit(trimws(l[i]), " +")[[1]]
      type <- fields[2]
      # field 5 is used as the left alignment column and field 9 as the left
      # RF character; fields 6 and 10 are the right-hand equivalents
      if (type %in% c("MATL", "MATP")) {
        fields[9] <- rf_at(fields[5])
      }
      if (type %in% c("MATR", "MATP")) {
        fields[10] <- rf_at(fields[6])
      }
      l[i] <- rebuild_line(l[i], fields)
    }

    # modify RF characters for HMM
    HMM_lines <- which(grepl(" *[1-9]\\d* +([0-9]\\.[0-9]+ +){4}[1-9][0-9]* +. +. +.", l))
    for (i in HMM_lines) {
      fields <- strsplit(trimws(l[i]), " +")[[1]]
      # field 6 is used as the alignment column and field 8 as the RF character
      fields[8] <- rf_at(fields[6])
      l[i] <- rebuild_line(l[i], fields)
    }
    writeLines(l, outfile)
  }
  invisible(NULL)
}
| 332 | + |
# Placeholder: the body is not implemented yet, so a call ignores both
# arguments and evaluates to `NULL`.
extract_amplicon <- function(seqs, aln) {
  NULL
}
0 commit comments