diff --git a/src/main/scala/com/fulcrumgenomics/alignment/Alignment.scala b/src/main/scala/com/fulcrumgenomics/alignment/Alignment.scala index 201951ef4..83b172d15 100644 --- a/src/main/scala/com/fulcrumgenomics/alignment/Alignment.scala +++ b/src/main/scala/com/fulcrumgenomics/alignment/Alignment.scala @@ -208,7 +208,7 @@ case class Cigar(elems: IndexedSeq[CigarElem]) extends Iterable[CigarElem] { def trailingSoftClippedBases: Int = stats.trailingSoftClippedBases /** Returns the number of bases that are hard-clipped at the start of the sequence. */ - def leadingHardClippedBases = this.headOption.map { elem => + def leadingHardClippedBases: Int = this.headOption.map { elem => if (elem.operator == CigarOperator.H) elem.length else 0 }.getOrElse(0) @@ -216,13 +216,23 @@ case class Cigar(elems: IndexedSeq[CigarElem]) extends Iterable[CigarElem] { def leadingClippedBases: Int = leadingHardClippedBases + leadingSoftClippedBases /** Returns the number of bases that are hard-clipped at the end of the sequence. */ - def trailingHardClippedBases = this.lastOption.map { elem => + def trailingHardClippedBases: Int = this.lastOption.map { elem => if (elem.operator == CigarOperator.H) elem.length else 0 }.getOrElse(0) /** Returns the number of bases that are clipped at the end of the sequence. */ def trailingClippedBases: Int = trailingHardClippedBases + trailingSoftClippedBases + /** Returns the alignment start position adjusted for leading soft and hard clipped bases. */ + def unclippedStart(start: Int): Int = { + start - this.iterator.takeWhile(_.operator.isClipping).map(_.length).sum + } + + /** Returns the alignment end position adjusted for trailing soft and hard clipped bases. */ + def unclippedEnd(end: Int): Int = { + end + this.reverseIterator.takeWhile(_.operator.isClipping).map(_.length).sum + } + /** Truncates the cigar based on either query or target length cutoff. */ private def truncate(len: Int, shouldCount: CigarElem => Boolean): Cigar = { var pos = 1 diff --git a/src/main/scala/com/fulcrumgenomics/bam/Bams.scala b/src/main/scala/com/fulcrumgenomics/bam/Bams.scala index 623b97d76..3546353ec 100644 --- a/src/main/scala/com/fulcrumgenomics/bam/Bams.scala +++ b/src/main/scala/com/fulcrumgenomics/bam/Bams.scala @@ -25,6 +25,7 @@ package com.fulcrumgenomics.bam import com.fulcrumgenomics.FgBioDef._ +import com.fulcrumgenomics.alignment.Cigar import com.fulcrumgenomics.bam.api.SamOrder.Queryname import com.fulcrumgenomics.bam.api._ import com.fulcrumgenomics.commons.collection.{BetterBufferedIterator, SelfClosingIterator} @@ -41,6 +42,56 @@ import htsjdk.samtools.util.{CloserUtil, CoordMath, Murmur3, SequenceUtil} import java.io.Closeable import scala.math.{max, min} + +/** Class to store information about an alignment, as described in the SAM SA tag. */ +private[bam] case class AlignmentInfo(refIndex: Int, start: Int, positiveStrand: Boolean, cigar: Option[Cigar], mapq: Int, nm: Int) { + def negativeStrand: Boolean = !positiveStrand + def refName(header: SAMFileHeader): String = header.getSequence(refIndex).getSequenceName + def mapped: Boolean = cigar.isDefined + def unmapped: Boolean = !mapped + private def _cigar: Cigar = cigar.getOrElse { + throw new IllegalStateException(s"Cannot get cigar for AlignmentInfo: ${this}") + } + def end: Int = start + _cigar.lengthOnTarget - 1 + def unclippedStart: Int = _cigar.unclippedStart(start) + def unclippedEnd: Int = _cigar.unclippedEnd(end) + /** Returns a formatted alignment as per the SA tag: `(rname ,pos ,strand ,CIGAR ,mapQ ,NM ;)+` */ + def toSA(header: SAMFileHeader): String = { + val strand = if (positiveStrand) '+' else '-' + val cigar = this.cigar.getOrElse("*") + f"${refName(header)},${start},${strand},${cigar},${mapq},${nm}" + } +} + +private[bam] object AlignmentInfo { + def apply(rec: SamRecord, mate: Boolean = false): AlignmentInfo = { + if (mate) { + val mateRefIndex = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex + val mateCigar = if (rec.unpaired || rec.mateUnmapped) None else Some(rec.mateCigar.getOrElse { + throw new IllegalStateException(s"Mate CIGAR (Tag 'MC') not found for $rec, consider using SetMateInformation.") + }) + // NB: mateCigar has already checked for the existence of the MC tag, so using .get here is fine + val mateStart = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateStart + val mateStrand = if (rec.unpaired || rec.mateUnmapped) true else rec.matePositiveStrand + AlignmentInfo(mateRefIndex, mateStart, mateStrand, mateCigar, rec.mapq, 0) + } else { + val refIndex = if (rec.unmapped) Int.MaxValue else rec.refIndex + val positiveStrand = rec.positiveStrand + val start = if (rec.unmapped) Int.MaxValue else rec.start + AlignmentInfo(refIndex, start, positiveStrand, Some(rec.cigar), rec.mapq, rec.getOrElse(SAMTag.NM.name(), 0)) + } + } + + def apply(sa: String, header: SAMFileHeader): AlignmentInfo = { + val parts = sa.split(",") + require(parts.length == 6, f"Could not parse SA tag: ${sa}") + val refIndex = header.getSequenceIndex(parts(0)) + AlignmentInfo(refIndex, parts(1).toInt, parts(2) == "+", Some(Cigar(parts(3))), parts(4).toInt, parts(5).toInt) + } + /** Returns a formatted alignment as per the SA tag: `(rname ,pos ,strand ,CIGAR ,mapQ ,NM ;)+` */ + def toSA(rec: SamRecord): String = AlignmentInfo(rec).toSA(rec.header) +} + /** * Class that represents all reads from a template within a BAM file. */ @@ -105,16 +156,27 @@ case class Template(r1: Option[SamRecord], Template(x1, x2) } - /** Fixes mate information and sets mate cigar and mate score on all primary and supplementary (but not secondary) records. */ + + /** Fixes mate information and sets mate cigar and mate score on all primary, secondary, and supplementary records. */ def fixMateInfo(): Unit = { // Developer note: the mate score ("ms") tag is used by samtools markdup - for (primary <- r1; supp <- r2Supplementals) { - SamPairUtil.setMateInformationOnSupplementalAlignment(supp.asSam, primary.asSam, true) - primary.get[Int]("AS").foreach(supp("ms") = _) + // Set all mate info on BOTH secondary and supplementary records, not just supplementary records. We also need to + // add the "pa" and "pm" tags with information about the primary alignments. Finally, we need the MQ tag! + val r1NonPrimary = r1Supplementals ++ r1Secondaries + val r2NonPrimary = r2Supplementals ++ r2Secondaries + for (primary <- r1; nonPrimary <- r2NonPrimary) { + SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true) + primary.get[Int]("AS").foreach(nonPrimary("ms") = _) + nonPrimary(SAMTag.MQ.name()) = primary.mapq + nonPrimary(Template.MatePrimarySamTag) = AlignmentInfo.toSA(primary) + r2.foreach(r => nonPrimary(Template.ReadPrimarySamTag) = AlignmentInfo.toSA(r)) } - for (primary <- r2; supp <- r1Supplementals) { - SamPairUtil.setMateInformationOnSupplementalAlignment(supp.asSam, primary.asSam, true) - primary.get[Int]("AS").foreach(supp("ms") = _) + for (primary <- r2; nonPrimary <- r1NonPrimary) { + SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true) + primary.get[Int]("AS").foreach(nonPrimary("ms") = _) + nonPrimary(SAMTag.MQ.name()) = primary.mapq + nonPrimary(Template.MatePrimarySamTag) = AlignmentInfo.toSA(primary) + r1.foreach(r => nonPrimary(Template.ReadPrimarySamTag) = AlignmentInfo.toSA(r)) } for (first <- r1; second <- r2) { SamPairUtil.setMateInfo(first.asSam, second.asSam, true) @@ -128,6 +190,10 @@ case class Template(r1: Option[SamRecord], } object Template { + /** The local SAM tag to store the alignment information of the primary alignment (in the same format as the SA tag) */ + val ReadPrimarySamTag: String = "rp" + /** The local SAM tag to store the alignment information of the mate's primary alignment (in the same format as the SA tag) */ + val MatePrimarySamTag: String = "mp" /** * Generates a Template for the next template in the buffered iterator. Assumes that the * iterator is queryname sorted or grouped. diff --git a/src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala b/src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala index 6aff8d844..9414e6f12 100644 --- a/src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala +++ b/src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala @@ -24,6 +24,7 @@ package com.fulcrumgenomics.bam.api +import com.fulcrumgenomics.bam.{Bams, AlignmentInfo, Template} import com.fulcrumgenomics.umi.ConsensusTags import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder} import htsjdk.samtools.util.Murmur3 @@ -175,24 +176,38 @@ object SamOrder { override val groupOrder: GroupOrder = GroupOrder.query override val subSort: Option[String] = Some("template-coordinate") override val sortkey: SamRecord => A = rec => { - val readChrom = if (rec.unmapped) Int.MaxValue else rec.refIndex - val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex - val readNeg = rec.negativeStrand - val mateNeg = if (rec.paired) rec.mateNegativeStrand else false - val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) rec.unclippedEnd else rec.unclippedStart - val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) SAMUtils.getMateUnclippedEnd(rec.asSam) else SAMUtils.getMateUnclippedStart(rec.asSam) - val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown") - val mid = rec.get[String](ConsensusTags.MolecularId).map { m => + // For non-secondary/non-supplementary alignments, or unpaired or mate unmapped reads, use the info in the record. + // Otherwise, use the info in the pa/pm tags. + val primary = if (!rec.secondary && !rec.supplementary) AlignmentInfo(rec) else { + val readPrimaryTag = rec.get[String](Template.ReadPrimarySamTag).getOrElse { + throw new IllegalStateException( + s"Missing '${Template.ReadPrimarySamTag}' tag required for TemplateCoordinate; try using ZipperBams or SetMateInformation" + ) + } + AlignmentInfo(readPrimaryTag, rec.header) + } + val mate = if ((!rec.secondary && !rec.supplementary) || rec.unpaired || rec.mateUnmapped) AlignmentInfo(rec, mate=true) else { + val matePrimaryTag = rec.get[String](Template.MatePrimarySamTag).getOrElse { + throw new IllegalStateException( + s"Missing '${Template.MatePrimarySamTag}' tag required for TemplateCoordinate; try using ZipperBams or SetMateInformation" + ) + } + AlignmentInfo(matePrimaryTag, rec.header) + } + val readPos = if (primary.unmapped) Int.MaxValue else if (primary.negativeStrand) primary.unclippedEnd else primary.unclippedStart + val matePos = if (mate.unmapped) Int.MaxValue else if (mate.negativeStrand) mate.unclippedEnd else mate.unclippedStart + val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown") + val mid = rec.get[String](ConsensusTags.MolecularId).map { m => val index: Int = m.lastIndexOf('/') if (index >= 0) m.substring(0, index) else m }.getOrElse("") - if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) || - (readChrom == mateChrom && readPos == matePos && !readNeg)) { - TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false) + if (primary.refIndex < mate.refIndex || (primary.refIndex == mate.refIndex && readPos < matePos) || + (primary.refIndex == mate.refIndex && readPos == matePos && primary.positiveStrand)) { + TemplateCoordinateKey(primary.refIndex, mate.refIndex, readPos, matePos, primary.negativeStrand, mate.negativeStrand, lib, mid, rec.name, false) } else { - TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true) + TemplateCoordinateKey(mate.refIndex, primary.refIndex, matePos, readPos, mate.negativeStrand, primary.negativeStrand, lib, mid, rec.name, true) } } } diff --git a/src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala b/src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala index cf05b54c6..0002c54bc 100644 --- a/src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala +++ b/src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala @@ -76,7 +76,7 @@ class TransientAttrs(private val rec: SamRecord) { /** * A trait that fgbio uses as a replacement for [[SAMRecord]]. The trait is self-typed as a - * [[SamRecordIntermediate]] which is a sub-class of SAMRecord. It is done this wasy so that + * [[SamRecordIntermediate]] which is a sub-class of SAMRecord. It is done this ways so that * a) we can access superclass methods via [[SamRecordIntermediate]] but that self-typing here * instead of extending hides the [[SAMRecord]] API from users of the class. The result is * always a [[SAMRecord]] but isn't seen as such without casting. diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala index faa563844..dabd3b73a 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala @@ -734,7 +734,7 @@ class GroupReadsByUmi // Then output the records in the right order (assigned tag, read name, r1, r2) templatesByMi.keys.toSeq.sortBy(id => (id.length, id)).foreach(tag => { - templatesByMi(tag).sortBy(t => t.name).flatMap(t => t.primaryReads).foreach(rec => { + templatesByMi(tag).sortBy(t => t.name).flatMap(t => t.allReads).foreach(rec => { out += rec }) }) diff --git a/src/test/scala/com/fulcrumgenomics/alignment/AlignmentTest.scala b/src/test/scala/com/fulcrumgenomics/alignment/AlignmentTest.scala index 63c809fc3..a64cafc95 100644 --- a/src/test/scala/com/fulcrumgenomics/alignment/AlignmentTest.scala +++ b/src/test/scala/com/fulcrumgenomics/alignment/AlignmentTest.scala @@ -121,6 +121,39 @@ class AlignmentTest extends UnitSpec { Cigar("10M2D20M").reverse.toString shouldBe "20M2D10M" } + private case class ClippingTestCase + (cigar: String, clippedBases: Int, + leadingSoftClippedBases: Int, trailingSoftClippedBases: Int, + leadingHardClippedBases: Int, trailingHardClippedBases: Int, + leadingClippedBases: Int, trailingClippedBases: Int, + unclippedStart: Int, unclippedEnd: Int) + + Seq( + ClippingTestCase("50M", 0, 0, 0, 0, 0, 0, 0, 0, 0), // no clipping + ClippingTestCase("20S50M", 20, 20, 0, 0, 0, 20, 0, -20, 0), // leading soft clipping + ClippingTestCase("50M10S", 10, 0, 10, 0, 0, 0, 10, 0, 10), // trailing soft clipping + ClippingTestCase("20S50M10S", 30, 20, 10, 0, 0, 20, 10, -20, 10), // leading and trailing soft clipping + ClippingTestCase("20H50M", 20, 0, 0, 20, 0, 20, 0, -20, 0), // leading hard clipping + ClippingTestCase("50M10H", 10, 0, 0, 0, 10, 0, 10, 0, 10), // trailing hard clipping + ClippingTestCase("20H50M10H", 30, 0, 0, 20, 10, 20, 10, -20, 10), // leading and trailing hard clipping + ClippingTestCase("15H5S50M", 20, 5, 0, 15, 0, 20, 0, -20, 0), // leading hard-the-soft clipping + ClippingTestCase("50M5S15H", 20, 0, 5, 0, 15, 0, 20, 0, 20), // trailing hard-the-soft clipping + ClippingTestCase("20H15S50M10S5H", 50, 15, 10, 20, 5, 35, 15, -35, 15), // leading and trailing hard-the-soft clipping + ).foreach { testCase => + "Cigar clipping methods" should s"account for leading and trailing clipping in ${testCase.cigar}" in { + val cigar = Cigar(testCase.cigar) + cigar.clippedBases shouldBe testCase.clippedBases + cigar.leadingSoftClippedBases shouldBe testCase.leadingSoftClippedBases + cigar.trailingSoftClippedBases shouldBe testCase.trailingSoftClippedBases + cigar.leadingHardClippedBases shouldBe testCase.leadingHardClippedBases + cigar.trailingHardClippedBases shouldBe testCase.trailingHardClippedBases + cigar.leadingClippedBases shouldBe testCase.leadingClippedBases + cigar.trailingClippedBases shouldBe testCase.trailingClippedBases + cigar.unclippedStart(100) shouldBe testCase.unclippedStart + 100 + cigar.unclippedEnd(200) shouldBe testCase.unclippedEnd + 200 + } + } + ///////////////////////////////////////////////////////////////////////////// // Tests for the Alignment class ///////////////////////////////////////////////////////////////////////////// diff --git a/src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala b/src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala index 2170ec351..b5eda09e2 100644 --- a/src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala +++ b/src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala @@ -25,11 +25,13 @@ package com.fulcrumgenomics.bam.api import java.util.Random - import com.fulcrumgenomics.FgBioDef._ +import com.fulcrumgenomics.bam.Bams +import com.fulcrumgenomics.bam.Bams.{MaxInMemory, templateIterator} import com.fulcrumgenomics.commons.util.SimpleCounter import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus} import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} +import com.fulcrumgenomics.util.Io import htsjdk.samtools.{SAMFileHeader, SAMRecordCoordinateComparator, SAMRecordQueryNameComparator} import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder} import htsjdk.samtools.util.Murmur3 @@ -217,4 +219,30 @@ class SamOrderTest extends UnitSpec { actual should contain theSameElementsInOrderAs expected } + + it should "handle supplementary alignments" in { + val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.Queryname)) + val exp = ListBuffer[SamRecord]() + // primary read pairs for q1, that map to different contigs + exp ++= builder.addPair("q1", contig=1, contig2=Some(2), start1=66, start2=47, cigar1="60M40S", cigar2="55M45S", strand2=Plus) + // supplementary R2 (ignore R1), which maps to the same chromosome as the primary R1 + val Seq(_, s2) = builder.addPair("q1", contig=1, contig2=Some(1), start1=66, start2=66, cigar1="60M40S", strand2=Minus) + s2.supplementary = true + s2.properlyPaired = true + exp += s2 + // primary read pairs for q2, that map to different contigs, but earlier that q1 + exp ++= builder.addPair("q2", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S") + + // Fix the mate information. Note: sorting here to get a template-iterator will write the records to disk first, + // so we cannot use the records in builder/exp. + val records = Bams.templateIterator(iterator=exp.iterator, header=builder.header, maxInMemory=MaxInMemory, tmpDir=Io.tmpDir).flatMap { template => + template.fixMateInfo() + template.allReads + }.toList + + val expected = List("q2/1", "q2/2", "q1/1", "q1/2", "q1/2:sup") + val actual = records.sortBy(r => SamOrder.TemplateCoordinate.sortkey(r)).map(_.id) + + actual should contain theSameElementsInOrderAs expected + } } diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala index 2fb0526bf..fbf668aa0 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala @@ -26,14 +26,15 @@ */ package com.fulcrumgenomics.umi -import com.fulcrumgenomics.bam.Template -import com.fulcrumgenomics.bam.api.SamOrder +import com.fulcrumgenomics.bam.{Bams, Template} +import com.fulcrumgenomics.bam.api.{SamOrder, SamWriter} import com.fulcrumgenomics.bam.api.SamOrder.TemplateCoordinate import com.fulcrumgenomics.cmdline.FgBioMain.FailureException import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus} import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} import com.fulcrumgenomics.umi.GroupReadsByUmi._ import com.fulcrumgenomics.util.Metric +import com.fulcrumgenomics.util.Io import org.scalatest.{OptionValues, PrivateMethodTester} import java.nio.file.Files @@ -344,6 +345,46 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT recs.filter(_.name.equals("a04")).forall(_.duplicate == true) shouldBe true } + it should "mark duplicates on supplementary reads" in { + val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate)) + // primary read pairs for q1, that map to different contigs + builder.addPair("q1", contig = 1, contig2 = Some(2), start1 = 66, start2 = 47, cigar1 = "60M40S", cigar2 = "55M45S", strand2 = Plus, attrs = Map("RX" -> "ACT")) + // supplementary R2, which maps to the same chromosome as the primary R1 + val Seq(s1, s2) = builder.addPair("q1", contig = 1, contig2 = Some(1), start1 = 66, start2 = 66, cigar1 = "60M40S", strand2 = Minus, attrs = Map("RX" -> "ACT")) + s1.supplementary = true + s2.supplementary = true + s2.properlyPaired = true + // primary read pairs for q2, that map to different contigs, but earlier that q1 + builder.addPair("q2", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S", attrs = Map("RX" -> "ACT")) + + // primary read pairs for q3, that are duplicates of q2 + builder.addPair("q3", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S", attrs = Map("RX" -> "ACT")) + + // Fix the mate information and write the input. Note: sorting here to get a template-iterator will write the + // records to disk first, so we cannot use the records in builder and therefore must write to the input file + // directly. + val in = Files.createTempFile("raw_reads", ".sam") + val writer = SamWriter(in, builder.header, sort = builder.sort) + Bams.templateIterator(iterator = builder.iterator, header = builder.header, maxInMemory = Bams.MaxInMemory, tmpDir = Io.tmpDir).foreach { template => + template.fixMateInfo() + writer ++= template.allReads + } + writer.close() + + val out = Files.createTempFile("umi_grouped.", ".sam") + val hist = Files.createTempFile("umi_grouped.", ".histogram.txt") + val gr = new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), strategy = Strategy.Adjacency, edits = 1, markDuplicates = true) + + gr.markDuplicates shouldBe true + gr.execute() + + val recs = readBamRecs(out) + recs.length shouldBe 8 + recs.filter(_.name.equals("q1")).forall(_.duplicate == false) shouldBe true + recs.filter(_.name.equals("q2")).forall(_.duplicate == false) shouldBe true + recs.filter(_.name.equals("q3")).forall(_.duplicate == true) shouldBe true + } + it should "correctly group reads with the paired assigner when the two UMIs are the same" in { val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.Coordinate)) builder.addPair(name="a01", start1=100, start2=300, strand1=Plus, strand2=Minus, attrs=Map("RX" -> "ACT-ACT"))