Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduced some of the repeated steps in ReferenceConfidenceModel.calcNIndelinformativeReads #5469

Merged
merged 8 commits into from
Feb 11, 2019
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,12 @@
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.util.Locatable;
import htsjdk.samtools.util.Tuple;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any advantage to using Tuple over Pair?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Huh, I'm not sure what Pair implementation you are talking about, we have one in gatk that is specific to MarkDuplicates that should probably be renamed to be less confusing anyway...

I think the different tuple implementations are mostly interchangeable but this one happens to live in htsjdk so that is generally a dependency plus.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was talking about org.apache.commons.lang3.tuple.Pair. I've been using it a lot in Funcotator.

import htsjdk.variant.variantcontext.*;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFSimpleHeaderLine;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.engine.AlignmentContext;
import org.broadinstitute.hellbender.engine.AssemblyRegion;
import org.broadinstitute.hellbender.tools.walkers.genotyper.PloidyModel;
import org.broadinstitute.hellbender.tools.walkers.variantutils.PosteriorProbabilitiesUtils;
Expand Down Expand Up @@ -485,21 +488,19 @@ boolean isReadInformativeAboutIndelsOfSize(final GATKRead read,
// We are safe to use the faster no-copy versions of getBases and getBaseQualities here,
// since we're not modifying the returned arrays in any way. This makes a small difference
// in the HaplotypeCaller profile, since this method is a major hotspot.
final byte[] readBases = AlignmentUtils.getBasesAlignedOneToOne(read); //calls getBasesNoCopy if CIGAR is all match
final byte[] readQuals = AlignmentUtils.getBaseQualsAlignedOneToOne(read);
final Pair<byte[], byte[]> readBasesAndBaseQualities = AlignmentUtils.getBasesAndBaseQualitiesAlignedOneToOne(read); //calls getBasesNoCopy if CIGAR is all match


final int baselineMMSum = sumMismatchingQualities(readBases, readQuals, readStart, refBases, refStart, Integer.MAX_VALUE);
final int baselineMMSum = sumMismatchingQualities(readBasesAndBaseQualities.getLeft(), readBasesAndBaseQualities.getRight(), readStart, refBases, refStart, Integer.MAX_VALUE);

// consider each indel size up to max in term, checking if an indel that deletes either the ref bases (deletion
// or read bases (insertion) would fit as well as the origin baseline sum of mismatching quality scores
for ( int indelSize = 1; indelSize <= maxIndelSize; indelSize++ ) {
// check insertions:
if (sumMismatchingQualities(readBases, readQuals, readStart + indelSize, refBases, refStart, baselineMMSum) <= baselineMMSum) {
if (sumMismatchingQualities(readBasesAndBaseQualities.getLeft(), readBasesAndBaseQualities.getRight(), readStart + indelSize, refBases, refStart, baselineMMSum) <= baselineMMSum) {
return false;
}
// check deletions:
if (sumMismatchingQualities(readBases, readQuals, readStart, refBases, refStart + indelSize, baselineMMSum) <= baselineMMSum) {
if (sumMismatchingQualities(readBasesAndBaseQualities.getLeft(), readBasesAndBaseQualities.getRight(), readStart, refBases, refStart + indelSize, baselineMMSum) <= baselineMMSum) {
return false;
}
}
Expand Down Expand Up @@ -548,8 +549,8 @@ int calcNIndelInformativeReads(final ReadPileup pileup, final int pileupOffsetIn
protected int getCigarModifiedOffset (final PileupElement p){
final GATKRead read = p.getRead();
int offset = (p.getCurrentCigarElement().getOperator().consumesReferenceBases() || p.getCurrentCigarElement().getOperator() == CigarOperator.S)? p.getOffsetInCurrentCigar() : 0;
for (int i = 0; i < p.getCurrentCigarOffset(); i++ ) {
CigarElement elem = read.getCigarElement(i);
for (int i = 0; i < p.getCurrentCigarOffset(); i++) {
final CigarElement elem = read.getCigarElement(i);
if (elem.getOperator().consumesReferenceBases() || elem.getOperator() == CigarOperator.S) {
offset += elem.getLength();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,19 @@
import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.util.Tuple;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.BaseUtils;
import org.broadinstitute.hellbender.utils.Nucleotide;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignment;

import java.util.*;
import java.util.function.Function;


public final class AlignmentUtils {
Expand Down Expand Up @@ -200,48 +201,72 @@ public static byte[] getBasesCoveringRefInterval(final int refStart, final int r
return Arrays.copyOfRange(bases, basesStart, basesStop + 1);
}

public static byte[] getBasesAlignedOneToOne(final GATKRead read) {
return getSequenceAlignedOneToOne(read, r -> r.getBasesNoCopy(), GAP_CHARACTER);
}

public static byte[] getBaseQualsAlignedOneToOne(final GATKRead read) {
return getSequenceAlignedOneToOne(read, r -> r.getBaseQualitiesNoCopy(), (byte)0);
/**
* Returns the "IGV View" of all the bases and base qualities in a read aligned to the reference according to the cigar, dropping any bases
* that might be in the read but aren't in the reference. Any bases that appear in the reference but not the read
* will be filled in with GAP_CHARACTER values for the read bases and 0's for base qualities to indicate that they don't exist.
*
* If the cigar for input read is all matches to the reference then this method will return references to the original
* read base/base quality byte arrays in the underlying SamRecord in order to save on array allocation/copying performance effects.
*
* @param read a read to return aligned to the reference
* @return A Pair of byte arrays where the left array corresponds to the bases aligned to the reference and right
* array corresponds to the baseQualities aligned to the reference.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Description of return value is out of date -- it returns a Pair, not a Tuple.

*/
public static Pair<byte[], byte[]> getBasesAndBaseQualitiesAlignedOneToOne(final GATKRead read) {
return getBasesAndBaseQualitiesAlignedOneToOne(read, GAP_CHARACTER, (byte)0);
}

public static byte[] getSequenceAlignedOneToOne(final GATKRead read, final Function<GATKRead, byte[]> bytesProvider, final byte padWith) {
private static Pair<byte[], byte[]> getBasesAndBaseQualitiesAlignedOneToOne(final GATKRead read, final byte basePadCharacter, final byte qualityPadCharacter) {
Utils.nonNull(read);
Utils.nonNull(bytesProvider);
final Cigar cigar = read.getCigar();
final byte[] sequence = bytesProvider.apply(read);

if (!cigar.containsOperator(CigarOperator.DELETION) && !cigar.containsOperator(CigarOperator.INSERTION)) {
return sequence;
// As this code is performance sensitive in the HaplotypeCaller, we elect to use the noCopy versions of these getters.
// We can do this because we don't mutate base or quality arrays in this method or in its accessors
final byte[] bases = read.getBasesNoCopy();
final byte[] baseQualities = read.getBaseQualitiesNoCopy();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a brief comment explaining why it's safe to use the no-copy version of these accessors.

final int numCigarElements = read.numCigarElements();
boolean sawIndel = false;

// Check if the cigar contains indels
// Note that we don't call ContainsOperator() here twice to avoid the performance hit of building stream iterators twice
for (int i = 0; i < numCigarElements; i++) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Make a note explaining why you're not just calling containsOperator() twice here (otherwise someone may come along in a few years and change it back!)

final CigarOperator e = read.getCigarElement(i).getOperator();
if (e == CigarOperator.INSERTION || e == CigarOperator.DELETION) {
sawIndel = true;
break;
}
}
if (!sawIndel) {
return new ImmutablePair<>(bases, baseQualities);
}
else {
final byte[] paddedBases = new byte[CigarUtils.countRefBasesIncludingSoftClips(read, 0, cigar.numCigarElements())];
final int numberRefBasesIncludingSoftclips = CigarUtils.countRefBasesIncludingSoftClips(read, 0, numCigarElements);
final byte[] paddedBases = new byte[numberRefBasesIncludingSoftclips];
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

numberRefBasesIncludingSoftclips should be final (looks like you missed this comment from last time?)

final byte[] paddedBaseQualities = new byte[numberRefBasesIncludingSoftclips];
int literalPos = 0;
int paddedPos = 0;
for ( int i = 0; i < cigar.numCigarElements(); i++ ) {
final CigarElement ce = cigar.getCigarElement(i);
for ( int i = 0; i < numCigarElements; i++ ) {
final CigarElement ce = read.getCigarElement(i);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use the already-initialized numCigarElements in the for loop condition here instead of read.numCigarElements() (another comment not addressed from last time)

final CigarOperator co = ce.getOperator();
if (co.consumesReadBases()) {
if (!co.consumesReferenceBases()) {
literalPos += ce.getLength(); //skip inserted bases
}
else {
System.arraycopy(sequence, literalPos, paddedBases, paddedPos, ce.getLength());
System.arraycopy(bases, literalPos, paddedBases, paddedPos, ce.getLength());
System.arraycopy(baseQualities, literalPos, paddedBaseQualities, paddedPos, ce.getLength());
literalPos += ce.getLength();
paddedPos += ce.getLength();
}
}
else if (co.consumesReferenceBases()) {
for ( int j = 0; j < ce.getLength(); j++ ) { //pad deleted bases
paddedBases[paddedPos] = padWith;
paddedBases[paddedPos] = basePadCharacter;
paddedBaseQualities[paddedPos] = qualityPadCharacter;
paddedPos++;
}
}
}
return paddedBases;
return new ImmutablePair<>(paddedBases, paddedBaseQualities);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,13 @@

import htsjdk.samtools.*;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.utils.pileup.ReadPileup;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanJavaAligner;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignment;
Expand Down Expand Up @@ -776,6 +779,50 @@ else if ( middleOp == 'I' && i == startOfIndelBases - 1 )
}
}

///////////////////////////////////////////////////////////////////
// Test AlignmentUtils.getBasesAndBaseQualitiesAlignedOneToOne() //
///////////////////////////////////////////////////////////////////

@DataProvider
public Object[][] makeGetBasesAndBaseQualitiesAlignedOneToOneTest() {
final String readBases = "ATCGATCG";
List<Object[]> tests = new ArrayList<>();

{ // very basic testing
final String cigar1 = "8M";
final String cigar2 = "4M4D4M";
final String cigar3 = "2M3I3M";
final String cigar4 = "2I6M";
final String cigar5 = "6M2I";
final String cigar6 = "1D8M1D";
final String cigar7 = "2M1I1D2M2I1M2D";

tests.add(new Object[]{readBases, cigar1, "ATCGATCG", new byte[]{10, 10, 10, 10, 10, 10, 10, 10}});
tests.add(new Object[]{readBases, cigar2, "ATCG----ATCG", new byte[]{10, 10, 10, 10,0,0,0,0, 10, 10, 10, 10}});
tests.add(new Object[]{readBases, cigar3, "ATTCG", new byte[]{10, 10, 10, 10, 10}});
tests.add(new Object[]{readBases, cigar4, "CGATCG", new byte[]{10, 10, 10, 10, 10, 10}});
tests.add(new Object[]{readBases, cigar5, "ATCGAT", new byte[]{10, 10, 10, 10, 10, 10}});
tests.add(new Object[]{readBases, cigar6, "-ATCGATCG-", new byte[]{0, 10, 10, 10, 10, 10, 10, 10, 10, 0}});
tests.add(new Object[]{readBases, cigar7, "AT-GAG--", new byte[]{10, 10, 0, 10, 10, 10, 0, 0}});
}

return tests.toArray(new Object[][]{});
}


@Test(dataProvider = "makeGetBasesAndBaseQualitiesAlignedOneToOneTest")
public void testGetBasesAndBaseQualitiesAlignedOneToOne(final String readBases, final String cigar, final String expectedBases, final byte[] expectedQuals ) {
final byte qual = (byte)10;
final byte[] quals = Utils.dupBytes(qual, readBases.length());

final GATKRead read = ArtificialReadUtils.createArtificialRead(readBases.getBytes(), quals, cigar);

Pair<byte[], byte[]> actual = AlignmentUtils.getBasesAndBaseQualitiesAlignedOneToOne(read);

Assert.assertEquals(new String(actual.getLeft()), expectedBases);
Assert.assertEquals(actual.getRight(), expectedQuals);
}

//////////////////////////////////////////
// Test AlignmentUtils.leftAlignIndel() //
//////////////////////////////////////////
Expand Down