diff --git a/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala b/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala index 3f658d013..deb38f3a6 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala @@ -36,17 +36,26 @@ import com.fulcrumgenomics.util.{Io, ProgressLogger} """ |Copies the UMI at the end of the BAM's read name to the RX tag. | - |The read name is split on `:` characters with the last field is assumed to be the UMI sequence. The UMI + |The read name is split on `:` characters with the last field assumed to be the UMI sequence. The UMI |will be copied to the `RX` tag as per the SAM specification. If any read does not have a UMI composed of |valid bases (ACGTN), the program will report the error and fail. | - |If a read name contains multiple UMIs they may be delimited by either hyphens (`-`) or pluses (`+`). The - |resulting UMI in the `RX` tag will always be hyphen delimited. + |If a read name contains multiple UMIs they may be delimited (typically by a hyphen (`-`) or plus (`+`)). + |The `--umi-delimiter` option specifies the delimiter on which to split. The resulting UMI in the `RX` tag + |will always be hyphen delimited. + | + |Some tools (e.g. BCL Convert) may reverse-complement UMIs on R2 and add an 'r' prefix to indicate that the sequence + |has been reverse-complemented. By default, the 'r' prefix is removed and the sequence is reverse-complemented + |back to the forward orientation. The `--override-reverse-complement-umis` disables the latter behavior, such that + |the 'r' prefix is removed but the UMI sequence is left as reverse-complemented. """) class CopyUmiFromReadName -( @arg(flag='i', doc="The input BAM file") input: PathToBam, - @arg(flag='o', doc="The output BAM file") output: PathToBam, - @arg(doc="Remove the UMI from the read name") removeUmi: Boolean = false +( @arg(flag='i', doc="The input BAM file.") input: PathToBam, + @arg(flag='o', doc="The output BAM file.") output: PathToBam, + @arg(doc="Remove the UMI from the read name.") removeUmi: Boolean = false, + @arg(doc="Delimiter between the read name and UMI.") fieldDelimiter: Char = ':', + @arg(doc="Delimiter between UMI sequences.") umiDelimiter: Char = '+', + @arg(doc="Do not reverse-complement UMIs prefixed with 'r'.") overrideReverseComplementUmis: Boolean = false, ) extends FgBioTool with LazyLogging { Io.assertReadable(input) @@ -58,7 +67,14 @@ class CopyUmiFromReadName val progress = new ProgressLogger(logger) source.foreach { rec => progress.record(rec) - writer += Umis.copyUmiFromReadName(rec=rec, removeUmi=removeUmi) + + writer += Umis.copyUmiFromReadName( + rec = rec, + removeUmi = removeUmi, + fieldDelimiter = fieldDelimiter, + umiDelimiter = umiDelimiter, + reverseComplementPrefixedUmis = !overrideReverseComplementUmis, + ) } progress.logLast() source.safelyClose() diff --git a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala index 2fbd4cfe9..08369a682 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala @@ -26,8 +26,10 @@ package com.fulcrumgenomics.umi import com.fulcrumgenomics.bam.api.SamRecord +import com.fulcrumgenomics.util.Sequences object Umis { + val RevcompPrefix: String = "r" /** Copies the UMI sequence from the read name. * @@ -38,17 +40,29 @@ object Umis { * * @param rec the record to modify * @param removeUmi true to remove the UMI from the read name, otherwise only copy the UMI to the tag - * @param delimiter the delimiter of fields within the read name + * @param fieldDelimiter the delimiter of fields within the read name + * @param umiDelimiter the delimiter between sequences in the UMI string + * @param reverseComplementPrefixedUmis whether to reverse-compliment UMIs prefixed with 'r' * @return the modified record */ - def copyUmiFromReadName(rec: SamRecord, removeUmi: Boolean = false, delimiter: Char = ':'): SamRecord = { + def copyUmiFromReadName(rec: SamRecord, + removeUmi: Boolean = false, + fieldDelimiter: Char = ':', + umiDelimiter: Char = '+', + reverseComplementPrefixedUmis: Boolean = true): SamRecord = { // Extract and set the UMI - val umi = extractUmisFromReadName(rec.name, delimiter, strict=false) + val umi = extractUmisFromReadName( + name = rec.name, + fieldDelimiter = fieldDelimiter, + strict = false, + umiDelimiter = umiDelimiter, + reverseComplementPrefixedUmis = reverseComplementPrefixedUmis, + ) require(umi.nonEmpty, f"No valid UMI found in: ${rec.name}") umi.foreach(u => rec(ConsensusTags.UmiBases) = u) // Remove the UMI from the read name if requested - if (removeUmi) rec.name = rec.name.substring(0, rec.name.lastIndexOf(delimiter)) + if (removeUmi) rec.name = rec.name.substring(0, rec.name.lastIndexOf(fieldDelimiter)) rec } @@ -59,29 +73,51 @@ object Umis { * * See https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm * The UMI field is optional, so read names may or may not contain it. Illumina also specifies that the UMI - * field may contain multiple UMIs, in which case they will delimit them with `+` characters. Pluses will be - * translated to hyphens before returning. + * field may contain multiple UMIs, in which case they will delimit them with `umiDelimiter` characters, which + * will be translated to hyphens before returning. * * If `strict` is true the name _must_ contain either 7 or 8 colon-separated segments, with the UMI being the last in the case of 8 and `None` in the case of 7. * * If `strict` is false the last segment is returned so long as it appears to be a valid UMI. */ - def extractUmisFromReadName(name: String, delimiter: Char = ':', strict: Boolean): Option[String] = { + def extractUmisFromReadName(name: String, + fieldDelimiter: Char = ':', + strict: Boolean, + umiDelimiter: Char = '+', + reverseComplementPrefixedUmis: Boolean = true): Option[String] = { // If strict, check that the read name actually has eight parts, which is expected val rawUmi = if (strict) { - val colons = name.count(_ == delimiter) + val colons = name.count(_ == fieldDelimiter) if (colons == 6) None - else if (colons == 7) Some(name.substring(name.lastIndexOf(delimiter) + 1, name.length)) + else if (colons == 7) Some(name.substring(name.lastIndexOf(fieldDelimiter) + 1, name.length)) else throw new IllegalArgumentException(s"Trying to extract UMI from read with ${colons + 1} parts (7-8 expected): ${name}") } else { - val idx = name.lastIndexOf(delimiter) - require(idx != -1, s"Read did not have multiple '${delimiter}'-separated fields: ${name}") + val idx = name.lastIndexOf(fieldDelimiter) + require(idx != -1, s"Read did not have multiple '${fieldDelimiter}'-separated fields: ${name}") Some(name.substring(idx + 1, name.length)) } - val umi = rawUmi.map(raw => (if (raw.indexOf('+') > 0) raw.replace('+', '-') else raw).toUpperCase) - val valid = umi.forall(u => u.forall(isValidUmiCharacter)) + // Remove 'r' prefixes, optionally reverse-complementing the prefixed UMIs if + // reverseComplementPrefixedUmis = true, replace the delimiter (if any) with '-', + // and make sure the sequence is upper-case. + var umi = rawUmi.map(raw => + (raw.indexOf(RevcompPrefix) >= 0, raw.indexOf(umiDelimiter) > 0) match { + case (true, true) if reverseComplementPrefixedUmis => + raw.split(umiDelimiter).map(seq => + if (seq.startsWith(RevcompPrefix)) Sequences.revcomp(seq.stripPrefix(RevcompPrefix)) + else seq.stripPrefix(RevcompPrefix) + ).mkString("-").toUpperCase + case (true, false) if reverseComplementPrefixedUmis => + Sequences.revcomp(raw.stripPrefix(RevcompPrefix)).toUpperCase + case (true, true) => raw.replace(RevcompPrefix, "").replace(umiDelimiter, '-').toUpperCase + case (true, false) => raw.replace(RevcompPrefix, "").toUpperCase + case (false, true) => raw.replace(umiDelimiter, '-').toUpperCase + case (false, false) => raw.toUpperCase + } + ) + + val valid = umi.forall(u => u.forall(isValidUmiCharacter)) if (strict && !valid) throw new IllegalArgumentException(s"Invalid UMI '${umi.get}' extracted from name '${name}") else if (!valid) None diff --git a/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala b/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala index 0f17dc0a8..2e4eb14de 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala @@ -33,14 +33,21 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues { private case class Result(name: String, umi: String) /** Runs CopyUmiFromReadName using the given read names returning the output read names and UMIs. */ - private def run(names: Iterable[String], removeUmi: Boolean): IndexedSeq[Result] = { + private def run(names: Iterable[String], + removeUmi: Boolean, + overrideReverseComplementUmis: Boolean = false): IndexedSeq[Result] = { // build the reads val builder = new SamBuilder() names.foreach { name => builder.addFrag(name=name, unmapped=true) } // run the tool val out = makeTempFile("test.", ".bam") - val tool = new CopyUmiFromReadName(input=builder.toTempFile(), output=out, removeUmi=removeUmi) + val tool = new CopyUmiFromReadName( + input = builder.toTempFile(), + output = out, + removeUmi = removeUmi, + overrideReverseComplementUmis = overrideReverseComplementUmis + ) executeFgbioTool(tool) // slurp the results @@ -69,4 +76,26 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues { results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah") results.map(_.umi) should contain theSameElementsInOrderAs Seq("AAAA", "CCCC", "GGGG", "AAAA-CCCC") } + + it should "remove a reverse-complement prefix to the UMI" in { + val names = Seq("1:rAAAA", "1:2:rCCCC", "1:2:3:rGGGG", "blah:rAAAA+CCCC") + val results = run( + names = names, + removeUmi = true, + overrideReverseComplementUmis = true + ) + results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah") + results.map(_.umi) should contain theSameElementsInOrderAs Seq("AAAA", "CCCC", "GGGG", "AAAA-CCCC") + } + + it should "remove a reverse-complement prefix to the UMI and reverse-complement the UMI" in { + val names = Seq("1:rAAAA", "1:2:rCCCC", "1:2:3:rGGGG", "blah:rAAAA+CCCC") + val results = run( + names = names, + removeUmi = true, + overrideReverseComplementUmis = false + ) + results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah") + results.map(_.umi) should contain theSameElementsInOrderAs Seq("TTTT", "GGGG", "CCCC", "TTTT-CCCC") + } } diff --git a/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala b/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala index 96298b246..3d8c010b3 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala @@ -92,9 +92,9 @@ class UmisTest extends UnitSpec with OptionValues { } it should "split on a different name delimiter if specified" in { - copyUmiFromReadName(rec=rec("UMI-A"), delimiter='-').nameAndUmi shouldBe ("UMI-A", "A") - copyUmiFromReadName(rec=rec("UMI-C-A"), delimiter='-').nameAndUmi shouldBe ("UMI-C-A", "A") - copyUmiFromReadName(rec=rec("UMI-C-ACC+GGT"), delimiter='-').nameAndUmi shouldBe ("UMI-C-ACC+GGT", "ACC-GGT") + copyUmiFromReadName(rec=rec("UMI-A"), fieldDelimiter='-').nameAndUmi shouldBe ("UMI-A", "A") + copyUmiFromReadName(rec=rec("UMI-C-A"), fieldDelimiter='-').nameAndUmi shouldBe ("UMI-C-A", "A") + copyUmiFromReadName(rec=rec("UMI-C-ACC+GGT"), fieldDelimiter='-').nameAndUmi shouldBe ("UMI-C-ACC+GGT", "ACC-GGT") } it should "change the UMI delimiter from + to -" in {