Skip to content

Commit

Permalink
feat: handle reverse-complemented UMIs in CopyUmiFromReadName (#958)
Browse files Browse the repository at this point in the history
Co-authored-by: jdidion <github@didion.net>
Co-authored-by: Nils Homer <nh13@users.noreply.github.com>
  • Loading branch information
3 people authored Jul 15, 2024
1 parent fc8e1c6 commit 4b862fc
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 25 deletions.
30 changes: 23 additions & 7 deletions src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala
Original file line number Diff line number Diff line change
Expand Up @@ -36,17 +36,26 @@ import com.fulcrumgenomics.util.{Io, ProgressLogger}
"""
|Copies the UMI at the end of the BAM's read name to the RX tag.
|
|The read name is split on `:` characters with the last field is assumed to be the UMI sequence. The UMI
|The read name is split on `:` characters with the last field assumed to be the UMI sequence. The UMI
|will be copied to the `RX` tag as per the SAM specification. If any read does not have a UMI composed of
|valid bases (ACGTN), the program will report the error and fail.
|
|If a read name contains multiple UMIs they may be delimited by either hyphens (`-`) or pluses (`+`). The
|resulting UMI in the `RX` tag will always be hyphen delimited.
|If a read name contains multiple UMIs they may be delimited (typically by a hyphen (`-`) or plus (`+`)).
|The `--umi-delimiter` option specifies the delimiter on which to split. The resulting UMI in the `RX` tag
|will always be hyphen delimited.
|
|Some tools (e.g. BCL Convert) may reverse-complement UMIs on R2 and add an 'r' prefix to indicate that the sequence
|has been reverse-complemented. By default, the 'r' prefix is removed and the sequence is reverse-complemented
|back to the forward orientation. The `--override-reverse-complement-umis` disables the latter behavior, such that
|the 'r' prefix is removed but the UMI sequence is left as reverse-complemented.
""")
class CopyUmiFromReadName
( @arg(flag='i', doc="The input BAM file") input: PathToBam,
@arg(flag='o', doc="The output BAM file") output: PathToBam,
@arg(doc="Remove the UMI from the read name") removeUmi: Boolean = false
( @arg(flag='i', doc="The input BAM file.") input: PathToBam,
@arg(flag='o', doc="The output BAM file.") output: PathToBam,
@arg(doc="Remove the UMI from the read name.") removeUmi: Boolean = false,
@arg(doc="Delimiter between the read name and UMI.") fieldDelimiter: Char = ':',
@arg(doc="Delimiter between UMI sequences.") umiDelimiter: Char = '+',
@arg(doc="Do not reverse-complement UMIs prefixed with 'r'.") overrideReverseComplementUmis: Boolean = false,
) extends FgBioTool with LazyLogging {

Io.assertReadable(input)
Expand All @@ -58,7 +67,14 @@ class CopyUmiFromReadName
val progress = new ProgressLogger(logger)
source.foreach { rec =>
progress.record(rec)
writer += Umis.copyUmiFromReadName(rec=rec, removeUmi=removeUmi)

writer += Umis.copyUmiFromReadName(
rec = rec,
removeUmi = removeUmi,
fieldDelimiter = fieldDelimiter,
umiDelimiter = umiDelimiter,
reverseComplementPrefixedUmis = !overrideReverseComplementUmis,
)
}
progress.logLast()
source.safelyClose()
Expand Down
62 changes: 49 additions & 13 deletions src/main/scala/com/fulcrumgenomics/umi/Umis.scala
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@
package com.fulcrumgenomics.umi

import com.fulcrumgenomics.bam.api.SamRecord
import com.fulcrumgenomics.util.Sequences

object Umis {
val RevcompPrefix: String = "r"

/** Copies the UMI sequence from the read name.
*
Expand All @@ -38,17 +40,29 @@ object Umis {
*
* @param rec the record to modify
* @param removeUmi true to remove the UMI from the read name, otherwise only copy the UMI to the tag
* @param delimiter the delimiter of fields within the read name
* @param fieldDelimiter the delimiter of fields within the read name
* @param umiDelimiter the delimiter between sequences in the UMI string
* @param reverseComplementPrefixedUmis whether to reverse-compliment UMIs prefixed with 'r'
* @return the modified record
*/
def copyUmiFromReadName(rec: SamRecord, removeUmi: Boolean = false, delimiter: Char = ':'): SamRecord = {
def copyUmiFromReadName(rec: SamRecord,
removeUmi: Boolean = false,
fieldDelimiter: Char = ':',
umiDelimiter: Char = '+',
reverseComplementPrefixedUmis: Boolean = true): SamRecord = {
// Extract and set the UMI
val umi = extractUmisFromReadName(rec.name, delimiter, strict=false)
val umi = extractUmisFromReadName(
name = rec.name,
fieldDelimiter = fieldDelimiter,
strict = false,
umiDelimiter = umiDelimiter,
reverseComplementPrefixedUmis = reverseComplementPrefixedUmis,
)
require(umi.nonEmpty, f"No valid UMI found in: ${rec.name}")
umi.foreach(u => rec(ConsensusTags.UmiBases) = u)

// Remove the UMI from the read name if requested
if (removeUmi) rec.name = rec.name.substring(0, rec.name.lastIndexOf(delimiter))
if (removeUmi) rec.name = rec.name.substring(0, rec.name.lastIndexOf(fieldDelimiter))

rec
}
Expand All @@ -59,29 +73,51 @@ object Umis {
*
* See https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm
* The UMI field is optional, so read names may or may not contain it. Illumina also specifies that the UMI
* field may contain multiple UMIs, in which case they will delimit them with `+` characters. Pluses will be
* translated to hyphens before returning.
* field may contain multiple UMIs, in which case they will delimit them with `umiDelimiter` characters, which
* will be translated to hyphens before returning.
*
* If `strict` is true the name _must_ contain either 7 or 8 colon-separated segments,
with the UMI being the last in the case of 8 and `None` in the case of 7.
*
* If `strict` is false the last segment is returned so long as it appears to be a valid UMI.
*/
def extractUmisFromReadName(name: String, delimiter: Char = ':', strict: Boolean): Option[String] = {
def extractUmisFromReadName(name: String,
fieldDelimiter: Char = ':',
strict: Boolean,
umiDelimiter: Char = '+',
reverseComplementPrefixedUmis: Boolean = true): Option[String] = {
// If strict, check that the read name actually has eight parts, which is expected
val rawUmi = if (strict) {
val colons = name.count(_ == delimiter)
val colons = name.count(_ == fieldDelimiter)
if (colons == 6) None
else if (colons == 7) Some(name.substring(name.lastIndexOf(delimiter) + 1, name.length))
else if (colons == 7) Some(name.substring(name.lastIndexOf(fieldDelimiter) + 1, name.length))
else throw new IllegalArgumentException(s"Trying to extract UMI from read with ${colons + 1} parts (7-8 expected): ${name}")
} else {
val idx = name.lastIndexOf(delimiter)
require(idx != -1, s"Read did not have multiple '${delimiter}'-separated fields: ${name}")
val idx = name.lastIndexOf(fieldDelimiter)
require(idx != -1, s"Read did not have multiple '${fieldDelimiter}'-separated fields: ${name}")
Some(name.substring(idx + 1, name.length))
}

val umi = rawUmi.map(raw => (if (raw.indexOf('+') > 0) raw.replace('+', '-') else raw).toUpperCase)
val valid = umi.forall(u => u.forall(isValidUmiCharacter))
// Remove 'r' prefixes, optionally reverse-complementing the prefixed UMIs if
// reverseComplementPrefixedUmis = true, replace the delimiter (if any) with '-',
// and make sure the sequence is upper-case.
var umi = rawUmi.map(raw =>
(raw.indexOf(RevcompPrefix) >= 0, raw.indexOf(umiDelimiter) > 0) match {
case (true, true) if reverseComplementPrefixedUmis =>
raw.split(umiDelimiter).map(seq =>
if (seq.startsWith(RevcompPrefix)) Sequences.revcomp(seq.stripPrefix(RevcompPrefix))
else seq.stripPrefix(RevcompPrefix)
).mkString("-").toUpperCase
case (true, false) if reverseComplementPrefixedUmis =>
Sequences.revcomp(raw.stripPrefix(RevcompPrefix)).toUpperCase
case (true, true) => raw.replace(RevcompPrefix, "").replace(umiDelimiter, '-').toUpperCase
case (true, false) => raw.replace(RevcompPrefix, "").toUpperCase
case (false, true) => raw.replace(umiDelimiter, '-').toUpperCase
case (false, false) => raw.toUpperCase
}
)

val valid = umi.forall(u => u.forall(isValidUmiCharacter))

if (strict && !valid) throw new IllegalArgumentException(s"Invalid UMI '${umi.get}' extracted from name '${name}")
else if (!valid) None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,21 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues {
private case class Result(name: String, umi: String)

/** Runs CopyUmiFromReadName using the given read names returning the output read names and UMIs. */
private def run(names: Iterable[String], removeUmi: Boolean): IndexedSeq[Result] = {
private def run(names: Iterable[String],
removeUmi: Boolean,
overrideReverseComplementUmis: Boolean = false): IndexedSeq[Result] = {
// build the reads
val builder = new SamBuilder()
names.foreach { name => builder.addFrag(name=name, unmapped=true) }

// run the tool
val out = makeTempFile("test.", ".bam")
val tool = new CopyUmiFromReadName(input=builder.toTempFile(), output=out, removeUmi=removeUmi)
val tool = new CopyUmiFromReadName(
input = builder.toTempFile(),
output = out,
removeUmi = removeUmi,
overrideReverseComplementUmis = overrideReverseComplementUmis
)
executeFgbioTool(tool)

// slurp the results
Expand Down Expand Up @@ -69,4 +76,26 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues {
results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah")
results.map(_.umi) should contain theSameElementsInOrderAs Seq("AAAA", "CCCC", "GGGG", "AAAA-CCCC")
}

it should "remove a reverse-complement prefix to the UMI" in {
val names = Seq("1:rAAAA", "1:2:rCCCC", "1:2:3:rGGGG", "blah:rAAAA+CCCC")
val results = run(
names = names,
removeUmi = true,
overrideReverseComplementUmis = true
)
results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah")
results.map(_.umi) should contain theSameElementsInOrderAs Seq("AAAA", "CCCC", "GGGG", "AAAA-CCCC")
}

it should "remove a reverse-complement prefix to the UMI and reverse-complement the UMI" in {
val names = Seq("1:rAAAA", "1:2:rCCCC", "1:2:3:rGGGG", "blah:rAAAA+CCCC")
val results = run(
names = names,
removeUmi = true,
overrideReverseComplementUmis = false
)
results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah")
results.map(_.umi) should contain theSameElementsInOrderAs Seq("TTTT", "GGGG", "CCCC", "TTTT-CCCC")
}
}
6 changes: 3 additions & 3 deletions src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,9 @@ class UmisTest extends UnitSpec with OptionValues {
}

it should "split on a different name delimiter if specified" in {
copyUmiFromReadName(rec=rec("UMI-A"), delimiter='-').nameAndUmi shouldBe ("UMI-A", "A")
copyUmiFromReadName(rec=rec("UMI-C-A"), delimiter='-').nameAndUmi shouldBe ("UMI-C-A", "A")
copyUmiFromReadName(rec=rec("UMI-C-ACC+GGT"), delimiter='-').nameAndUmi shouldBe ("UMI-C-ACC+GGT", "ACC-GGT")
copyUmiFromReadName(rec=rec("UMI-A"), fieldDelimiter='-').nameAndUmi shouldBe ("UMI-A", "A")
copyUmiFromReadName(rec=rec("UMI-C-A"), fieldDelimiter='-').nameAndUmi shouldBe ("UMI-C-A", "A")
copyUmiFromReadName(rec=rec("UMI-C-ACC+GGT"), fieldDelimiter='-').nameAndUmi shouldBe ("UMI-C-ACC+GGT", "ACC-GGT")
}

it should "change the UMI delimiter from + to -" in {
Expand Down

0 comments on commit 4b862fc

Please sign in to comment.