broadinstitute · jamesemery · Feb 11, 2019 · Nov 30, 2018 · Nov 30, 2018 · Dec 5, 2018
diff --git a/...org/broadinstitute/hellbender/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java b/...org/broadinstitute/hellbender/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java
@@ -6,9 +6,12 @@
 import htsjdk.samtools.CigarOperator;
 import htsjdk.samtools.SAMFileHeader;
 import htsjdk.samtools.util.Locatable;
+import htsjdk.samtools.util.Tuple;
 import htsjdk.variant.variantcontext.*;
 import htsjdk.variant.vcf.VCFHeaderLine;
 import htsjdk.variant.vcf.VCFSimpleHeaderLine;
+import org.apache.commons.lang3.tuple.Pair;
+import org.broadinstitute.hellbender.engine.AlignmentContext;
 import org.broadinstitute.hellbender.engine.AssemblyRegion;
 import org.broadinstitute.hellbender.tools.walkers.genotyper.PloidyModel;
 import org.broadinstitute.hellbender.tools.walkers.variantutils.PosteriorProbabilitiesUtils;
@@ -485,21 +488,19 @@ boolean isReadInformativeAboutIndelsOfSize(final GATKRead read,
         // We are safe to use the faster no-copy versions of getBases and getBaseQualities here,
         // since we're not modifying the returned arrays in any way. This makes a small difference
         // in the HaplotypeCaller profile, since this method is a major hotspot.
-        final byte[] readBases = AlignmentUtils.getBasesAlignedOneToOne(read);  //calls getBasesNoCopy if CIGAR is all match
-        final byte[] readQuals = AlignmentUtils.getBaseQualsAlignedOneToOne(read);
+        final Pair<byte[], byte[]> readBasesAndBaseQualities = AlignmentUtils.getBasesAndBaseQualitiesAlignedOneToOne(read);  //calls getBasesNoCopy if CIGAR is all match
 
-
-        final int baselineMMSum = sumMismatchingQualities(readBases, readQuals, readStart, refBases, refStart, Integer.MAX_VALUE);
+        final int baselineMMSum = sumMismatchingQualities(readBasesAndBaseQualities.getLeft(), readBasesAndBaseQualities.getRight(), readStart, refBases, refStart, Integer.MAX_VALUE);
 
         // consider each indel size up to max in term, checking if an indel that deletes either the ref bases (deletion
         // or read bases (insertion) would fit as well as the origin baseline sum of mismatching quality scores
         for ( int indelSize = 1; indelSize <= maxIndelSize; indelSize++ ) {
             // check insertions:
-            if (sumMismatchingQualities(readBases, readQuals, readStart + indelSize, refBases, refStart, baselineMMSum) <= baselineMMSum) {
+            if (sumMismatchingQualities(readBasesAndBaseQualities.getLeft(), readBasesAndBaseQualities.getRight(), readStart + indelSize, refBases, refStart, baselineMMSum) <= baselineMMSum) {
                 return false;
             }
             // check deletions:
-            if (sumMismatchingQualities(readBases, readQuals, readStart, refBases, refStart + indelSize, baselineMMSum) <= baselineMMSum) {
+            if (sumMismatchingQualities(readBasesAndBaseQualities.getLeft(), readBasesAndBaseQualities.getRight(), readStart, refBases, refStart + indelSize, baselineMMSum) <= baselineMMSum) {
                 return false;
             }
         }
@@ -548,8 +549,8 @@ int calcNIndelInformativeReads(final ReadPileup pileup, final int pileupOffsetIn
     protected int getCigarModifiedOffset (final PileupElement p){
         final GATKRead read = p.getRead();
         int offset = (p.getCurrentCigarElement().getOperator().consumesReferenceBases() || p.getCurrentCigarElement().getOperator() == CigarOperator.S)? p.getOffsetInCurrentCigar() : 0;
-        for (int i = 0; i < p.getCurrentCigarOffset(); i++ ) {
-            CigarElement elem = read.getCigarElement(i);
+        for (int i = 0; i < p.getCurrentCigarOffset(); i++) {
+            final CigarElement elem = read.getCigarElement(i);
             if (elem.getOperator().consumesReferenceBases() || elem.getOperator() == CigarOperator.S) {
                 offset += elem.getLength();
             }

diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/AlignmentUtils.java b/src/main/java/org/broadinstitute/hellbender/utils/read/AlignmentUtils.java
@@ -3,18 +3,19 @@
 import htsjdk.samtools.Cigar;
 import htsjdk.samtools.CigarElement;
 import htsjdk.samtools.CigarOperator;
+import htsjdk.samtools.util.Tuple;
+import org.apache.commons.lang3.tuple.ImmutablePair;
+import org.apache.commons.lang3.tuple.Pair;
 import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy;
 import org.broadinstitute.hellbender.exceptions.GATKException;
 import org.broadinstitute.hellbender.utils.BaseUtils;
-import org.broadinstitute.hellbender.utils.Nucleotide;
 import org.broadinstitute.hellbender.utils.Utils;
 import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
 import org.broadinstitute.hellbender.utils.pileup.PileupElement;
 import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner;
 import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignment;
 
 import java.util.*;
-import java.util.function.Function;
 
 
 public final class AlignmentUtils {
@@ -200,48 +201,72 @@ public static byte[] getBasesCoveringRefInterval(final int refStart, final int r
         return Arrays.copyOfRange(bases, basesStart, basesStop + 1);
     }
 
-    public static byte[] getBasesAlignedOneToOne(final GATKRead read) {
-        return getSequenceAlignedOneToOne(read, r -> r.getBasesNoCopy(), GAP_CHARACTER);
-    }
-
-    public static byte[] getBaseQualsAlignedOneToOne(final GATKRead read) {
-        return getSequenceAlignedOneToOne(read, r -> r.getBaseQualitiesNoCopy(), (byte)0);
+    /**
+     * Returns the "IGV View" of all the bases and base qualities in a read aligned to the reference according to the cigar, dropping any bases
+     * that might be in the read but aren't in the reference. Any bases that appear in the reference but not the read
+     * will be filled in with GAP_CHARACTER values for the read bases and 0's for base qualities to indicate that they don't exist.
+     *
+     * If the cigar for input read is all matches to the reference then this method will return references to the original
+     * read base/base quality byte arrays in the underlying SamRecord in order to save on array allocation/copying performance effects.
+     *
+     * @param read a read to return aligned to the reference
+     * @return A Pair of byte arrays where the left array corresponds to the bases aligned to the reference and right
+     *         array corresponds to the baseQualities aligned to the reference.
+     */
+    public static Pair<byte[], byte[]> getBasesAndBaseQualitiesAlignedOneToOne(final GATKRead read) {
+        return getBasesAndBaseQualitiesAlignedOneToOne(read, GAP_CHARACTER, (byte)0);
     }
 
-    public static byte[] getSequenceAlignedOneToOne(final GATKRead read, final Function<GATKRead, byte[]> bytesProvider, final byte padWith) {
+    private static Pair<byte[], byte[]> getBasesAndBaseQualitiesAlignedOneToOne(final GATKRead read, final byte basePadCharacter, final byte qualityPadCharacter) {
         Utils.nonNull(read);
-        Utils.nonNull(bytesProvider);
-        final Cigar cigar = read.getCigar();
-        final byte[] sequence = bytesProvider.apply(read);
-
-        if (!cigar.containsOperator(CigarOperator.DELETION) && !cigar.containsOperator(CigarOperator.INSERTION)) {
-            return sequence;
+        // As this code is performance sensitive in the HaplotypeCaller, we elect to use the noCopy versions of these getters.
+        // We can do this because we don't mutate base or quality arrays in this method or in its accessors
+        final byte[] bases = read.getBasesNoCopy();
+        final byte[] baseQualities = read.getBaseQualitiesNoCopy();
+        final int numCigarElements = read.numCigarElements();
+        boolean sawIndel = false;
+
+        // Check if the cigar contains indels
+        // Note that we don't call ContainsOperator() here twice to avoid the performance hit of building stream iterators twice
+        for (int i = 0; i < numCigarElements; i++) {
+            final CigarOperator e = read.getCigarElement(i).getOperator();
+            if (e == CigarOperator.INSERTION || e == CigarOperator.DELETION) {
+                sawIndel = true;
+                break;
+            }
+        }
+        if (!sawIndel) {
+            return new ImmutablePair<>(bases, baseQualities);
         }
         else {
-            final byte[] paddedBases = new byte[CigarUtils.countRefBasesIncludingSoftClips(read, 0, cigar.numCigarElements())];
+            final int numberRefBasesIncludingSoftclips = CigarUtils.countRefBasesIncludingSoftClips(read, 0, numCigarElements);
+            final byte[] paddedBases = new byte[numberRefBasesIncludingSoftclips];
+            final byte[] paddedBaseQualities = new byte[numberRefBasesIncludingSoftclips];
             int literalPos = 0;
             int paddedPos = 0;
-            for ( int i = 0; i < cigar.numCigarElements(); i++ ) {
-                final CigarElement ce = cigar.getCigarElement(i);
+            for ( int i = 0; i < numCigarElements; i++ ) {
+                final CigarElement ce = read.getCigarElement(i);
                 final CigarOperator co = ce.getOperator();
                 if (co.consumesReadBases()) {
                     if (!co.consumesReferenceBases()) {
                         literalPos += ce.getLength();  //skip inserted bases
                     }
                     else {
-                        System.arraycopy(sequence, literalPos, paddedBases, paddedPos, ce.getLength());
+                        System.arraycopy(bases, literalPos, paddedBases, paddedPos, ce.getLength());
+                        System.arraycopy(baseQualities, literalPos, paddedBaseQualities, paddedPos, ce.getLength());
                         literalPos += ce.getLength();
                         paddedPos += ce.getLength();
                     }
                 }
                 else if (co.consumesReferenceBases()) {
                     for ( int j = 0; j < ce.getLength(); j++ ) {  //pad deleted bases
-                        paddedBases[paddedPos] = padWith;
+                        paddedBases[paddedPos] = basePadCharacter;
+                        paddedBaseQualities[paddedPos] = qualityPadCharacter;
                         paddedPos++;
                     }
                 }
             }
-            return paddedBases;
+            return new ImmutablePair<>(paddedBases, paddedBaseQualities);
         }
     }
 

diff --git a/src/test/java/org/broadinstitute/hellbender/utils/read/AlignmentUtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/read/AlignmentUtilsUnitTest.java
@@ -2,10 +2,13 @@
 
 import htsjdk.samtools.*;
 import org.apache.commons.lang3.ArrayUtils;
+import org.apache.commons.lang3.tuple.Pair;
 import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
 import org.broadinstitute.hellbender.utils.Utils;
 import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
 import org.broadinstitute.hellbender.utils.pileup.PileupElement;
+import org.broadinstitute.hellbender.utils.pileup.ReadPileup;
 import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner;
 import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanJavaAligner;
 import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignment;
@@ -776,6 +779,50 @@ else if ( middleOp == 'I' && i == startOfIndelBases - 1 )
         }
     }
 
+    ///////////////////////////////////////////////////////////////////
+    // Test AlignmentUtils.getBasesAndBaseQualitiesAlignedOneToOne() //
+    ///////////////////////////////////////////////////////////////////
+
+    @DataProvider
+    public Object[][] makeGetBasesAndBaseQualitiesAlignedOneToOneTest() {
+        final String readBases = "ATCGATCG";
+        List<Object[]> tests = new ArrayList<>();
+
+        { // very basic testing
+            final String cigar1 = "8M";
+            final String cigar2 = "4M4D4M";
+            final String cigar3 = "2M3I3M";
+            final String cigar4 = "2I6M";
+            final String cigar5 = "6M2I";
+            final String cigar6 = "1D8M1D";
+            final String cigar7 = "2M1I1D2M2I1M2D";
+
+            tests.add(new Object[]{readBases, cigar1, "ATCGATCG", new byte[]{10, 10, 10, 10, 10, 10, 10, 10}});
+            tests.add(new Object[]{readBases, cigar2, "ATCG----ATCG", new byte[]{10, 10, 10, 10,0,0,0,0, 10, 10, 10, 10}});
+            tests.add(new Object[]{readBases, cigar3, "ATTCG", new byte[]{10, 10, 10, 10, 10}});
+            tests.add(new Object[]{readBases, cigar4, "CGATCG", new byte[]{10, 10, 10, 10, 10, 10}});
+            tests.add(new Object[]{readBases, cigar5, "ATCGAT", new byte[]{10, 10, 10, 10, 10, 10}});
+            tests.add(new Object[]{readBases, cigar6, "-ATCGATCG-", new byte[]{0, 10, 10, 10, 10, 10, 10, 10, 10, 0}});
+            tests.add(new Object[]{readBases, cigar7, "AT-GAG--", new byte[]{10, 10, 0, 10, 10, 10, 0, 0}});
+        }
+
+        return tests.toArray(new Object[][]{});
+    }
+
+
+    @Test(dataProvider = "makeGetBasesAndBaseQualitiesAlignedOneToOneTest")
+    public void testGetBasesAndBaseQualitiesAlignedOneToOne(final String readBases, final String cigar, final String expectedBases, final byte[] expectedQuals ) {
+        final byte qual = (byte)10;
+        final byte[] quals = Utils.dupBytes(qual, readBases.length());
+
+        final GATKRead read = ArtificialReadUtils.createArtificialRead(readBases.getBytes(), quals, cigar);
+
+        Pair<byte[], byte[]> actual = AlignmentUtils.getBasesAndBaseQualitiesAlignedOneToOne(read);
+
+        Assert.assertEquals(new String(actual.getLeft()), expectedBases);
+        Assert.assertEquals(actual.getRight(), expectedQuals);
+    }
+
     //////////////////////////////////////////
     // Test AlignmentUtils.leftAlignIndel() //
     //////////////////////////////////////////