Skip to content

Commit

Permalink
update CramCompressionRecord.isPlaced() to make it APDelta-aware (#1284)
Browse files Browse the repository at this point in the history
* update isPlaced() to make it APDelta-aware
* refactor SliceTests
* intializeAlignmentBoundaries and comments
* refactor testBuildStates
* Add test_getAlignmentSpan and improve values of test_getAlignmentEnd
* combinatoric placedTests
* better unmapped consistency for SliceTests
* assertSpanAndEnd
* alignment span and end tests
* Note bug 1301 in comments
  • Loading branch information
jmthibault79 authored Feb 21, 2019
1 parent efe4abf commit 061e217
Show file tree
Hide file tree
Showing 8 changed files with 436 additions and 259 deletions.
4 changes: 2 additions & 2 deletions src/main/java/htsjdk/samtools/CRAMContainerStreamWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -379,8 +379,8 @@ protected void flushContainer() throws IllegalArgumentException, IllegalAccessEx
while (last.next != null) last = last.next;

if (cramRecord.isFirstSegment() && last.isLastSegment()) {

final int templateLength = CramNormalizer.computeInsertSize(cramRecord, last);
final boolean coordinateSorted = (samFileHeader.getSortOrder() == SAMFileHeader.SortOrder.coordinate);
final int templateLength = CramNormalizer.computeInsertSize(cramRecord, last, coordinateSorted);

if (cramRecord.templateSize == templateLength) {
last = cramRecord.next;
Expand Down
16 changes: 10 additions & 6 deletions src/main/java/htsjdk/samtools/cram/build/CramNormalizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ public void normalize(final ArrayList<CramCompressionRecord> records,
for (final CramCompressionRecord record : records) {
if (record.previous != null) continue;
if (record.next == null) continue;
restoreMateInfo(record);
final boolean usePositionDeltaEncoding = header.getSortOrder() == SAMFileHeader.SortOrder.coordinate;
restoreMateInfo(record, usePositionDeltaEncoding);
}
}

Expand Down Expand Up @@ -137,7 +138,8 @@ public void normalize(final ArrayList<CramCompressionRecord> records,
restoreQualityScores(defaultQualityScore, records);
}

private static void restoreMateInfo(final CramCompressionRecord record) {
private static void restoreMateInfo(final CramCompressionRecord record,
final boolean usePositionDeltaEncoding) {
if (record.next == null) {

return;
Expand All @@ -155,7 +157,7 @@ private static void restoreMateInfo(final CramCompressionRecord record) {
// record.setFirstSegment(true);
// last.setLastSegment(true);

final int templateLength = computeInsertSize(record, last);
final int templateLength = computeInsertSize(record, last, usePositionDeltaEncoding);
record.templateSize = templateLength;
last.templateSize = -templateLength;
}
Expand Down Expand Up @@ -326,19 +328,21 @@ private static byte getByteOrDefault(final byte[] array, final int pos,
*
* @param firstEnd first mate of the pair
* @param secondEnd second mate of the pair
* @param usePositionDeltaEncoding do these records delta-encode their alignment starts?
* @return template length
*/
public static int computeInsertSize(final CramCompressionRecord firstEnd,
final CramCompressionRecord secondEnd) {
final CramCompressionRecord secondEnd,
final boolean usePositionDeltaEncoding) {
if (firstEnd.isSegmentUnmapped() || secondEnd.isSegmentUnmapped()) {
return 0;
}
if (firstEnd.sequenceId != secondEnd.sequenceId) {
return 0;
}

final int firstEnd5PrimePosition = firstEnd.isNegativeStrand() ? firstEnd.getAlignmentEnd() : firstEnd.alignmentStart;
final int secondEnd5PrimePosition = secondEnd.isNegativeStrand() ? secondEnd.getAlignmentEnd() : secondEnd.alignmentStart;
final int firstEnd5PrimePosition = firstEnd.isNegativeStrand() ? firstEnd.getAlignmentEnd(usePositionDeltaEncoding) : firstEnd.alignmentStart;
final int secondEnd5PrimePosition = secondEnd.isNegativeStrand() ? secondEnd.getAlignmentEnd(usePositionDeltaEncoding) : secondEnd.alignmentStart;

final int adjustment = (secondEnd5PrimePosition >= firstEnd5PrimePosition) ? +1 : -1;
return secondEnd5PrimePosition - firstEnd5PrimePosition + adjustment;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import htsjdk.samtools.cram.encoding.readfeatures.Insertion;
import htsjdk.samtools.cram.encoding.readfeatures.ReadFeature;
import htsjdk.samtools.cram.encoding.readfeatures.SoftClip;
import htsjdk.samtools.util.Log;

import java.util.Arrays;
import java.util.Collection;
Expand All @@ -49,12 +50,17 @@ public class CramCompressionRecord {
private static final int HAS_MATE_DOWNSTREAM_FLAG = 0x4;
private static final int UNKNOWN_BASES = 0x8;

private static final int UNINITIALIZED_END = -1;
private static final int UNINITIALIZED_SPAN = -1;

private static final Log log = Log.getInstance(CramCompressionRecord.class);

// sequential index of the record in a stream:
public int index = 0;
public int alignmentStart;
public int alignmentDelta;
private int alignmentEnd = -1;
private int alignmentSpan = -1;
private int alignmentEnd = UNINITIALIZED_END;
private int alignmentSpan = UNINITIALIZED_SPAN;

public int readLength;

Expand Down Expand Up @@ -152,20 +158,43 @@ public String toString() {
return stringBuilder.toString();
}

public int getAlignmentSpan() {
if (alignmentSpan < 0) calculateAlignmentBoundaries();
/**
* Check whether alignmentSpan has been initialized, and do so if it has not
* @param usePositionDeltaEncoding is this record's position delta-encoded? used to call isPlaced()
* @return the initialized alignmentSpan
*/
public int getAlignmentSpan(final boolean usePositionDeltaEncoding) {
if (alignmentSpan == UNINITIALIZED_SPAN) {
intializeAlignmentBoundaries(usePositionDeltaEncoding);
}
return alignmentSpan;
}

void calculateAlignmentBoundaries() {
if (isSegmentUnmapped()) {
alignmentSpan = 0;
alignmentEnd = SAMRecord.NO_ALIGNMENT_START;
} else if (readFeatures == null || readFeatures.isEmpty()) {
alignmentSpan = readLength;
alignmentEnd = alignmentStart + alignmentSpan - 1;
} else {
alignmentSpan = readLength;
/**
* Check whether alignmentEnd has been initialized, and do so if it has not
* @param usePositionDeltaEncoding is this record's position delta-encoded? used to call isPlaced()
* @return the initialized alignmentEnd
*/
public int getAlignmentEnd(final boolean usePositionDeltaEncoding) {
if (alignmentEnd == UNINITIALIZED_END) {
intializeAlignmentBoundaries(usePositionDeltaEncoding);
}
return alignmentEnd;
}

// https://github.com/samtools/htsjdk/issues/1301
// does not update alignmentSpan/alignmentEnd when the record changes

private void intializeAlignmentBoundaries(final boolean usePositionDeltaEncoding) {
if (!isPlaced(usePositionDeltaEncoding)) {
alignmentSpan = Slice.NO_ALIGNMENT_SPAN;
alignmentEnd = Slice.NO_ALIGNMENT_END;
return;
}

alignmentSpan = readLength;

if (readFeatures != null) {
for (final ReadFeature readFeature : readFeatures) {
switch (readFeature.getOperator()) {
case InsertBase.operator:
Expand All @@ -185,13 +214,9 @@ void calculateAlignmentBoundaries() {
break;
}
}
alignmentEnd = alignmentStart + alignmentSpan - 1;
}
}

public int getAlignmentEnd() {
if (alignmentEnd < 0) calculateAlignmentBoundaries();
return alignmentEnd;
alignmentEnd = alignmentStart + alignmentSpan - 1;
}

public boolean isMultiFragment() {
Expand All @@ -207,7 +232,7 @@ public void setMultiFragment(final boolean multiFragment) {
* Unmapped records may be stored in the same {@link Slice}s and {@link Container}s as mapped
* records if they are placed.
*
* @see #isPlaced()
* @see #isPlaced(boolean)
* @return true if the record is unmapped
*/
public boolean isSegmentUnmapped() {
Expand All @@ -219,16 +244,42 @@ public void setSegmentUnmapped(final boolean segmentUnmapped) {
}

/**
* Does this record have a valid placement/alignment location? This is independent of mapping status.
* It must have both a valid reference sequence ID and alignment start position to qualify.
* Unplaced reads may only be stored in Unmapped or Multiple Reference {@link Slice}s and {@link Container}s.
* Does this record have a valid placement/alignment location?
*
* It must have a valid reference sequence ID to be considered placed.
* In the case of absolute (non-delta) position encoding, it must also have a
* valid alignment start position, so we need to know if it is delta-encoded.
*
* Normally we expect to see that the unmapped flag is set for unplaced reads,
* so we log a WARNING here if the read is unplaced yet somehow mapped.
*
* @see #isSegmentUnmapped()
* @param usePositionDeltaEncoding is this read's position delta-encoded? if not, check alignment Start.
* @return true if the record is placed
*/
public boolean isPlaced() {
return sequenceId != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX &&
alignmentStart != SAMRecord.NO_ALIGNMENT_START;
public boolean isPlaced(final boolean usePositionDeltaEncoding) {
boolean placed = isPlacedInternal(!usePositionDeltaEncoding);

if (!placed && !isSegmentUnmapped()) {
final String warning = String.format(
"Cram Compression Record [%s] does not have the unmapped flag set, " +
"but also does not have a valid placement on a reference sequence.",
this.toString());
log.warn(warning);
}

return placed;
}

// check placement without regard to mapping; helper method for isPlaced()
private boolean isPlacedInternal(final boolean useAbsolutePositionEncoding) {
// placement requires a valid sequence ID
if (sequenceId == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
return false;
}

// if an absolute alignment start coordinate is required but we have none, it's unplaced
return ! (useAbsolutePositionEncoding && alignmentStart == SAMRecord.NO_ALIGNMENT_START);
}

public boolean isFirstSegment() {
Expand Down
9 changes: 5 additions & 4 deletions src/main/java/htsjdk/samtools/cram/structure/Slice.java
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ public class Slice {
public static final int MULTI_REFERENCE = -2;
public static final int NO_ALIGNMENT_START = -1;
public static final int NO_ALIGNMENT_SPAN = 0;
public static final int NO_ALIGNMENT_END = SAMRecord.NO_ALIGNMENT_START; // 0
private static final Log log = Log.getInstance(Slice.class);

// as defined in the specs:
Expand Down Expand Up @@ -387,7 +388,7 @@ public static Slice buildSlice(final List<CramCompressionRecord> records,
// Unmapped: all records are unmapped and unplaced

final CramCompressionRecord firstRecord = records.get(0);
slice.sequenceId = firstRecord.isPlaced() ?
slice.sequenceId = firstRecord.isPlaced(header.APDelta) ?
firstRecord.sequenceId :
SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;

Expand All @@ -397,19 +398,19 @@ public static Slice buildSlice(final List<CramCompressionRecord> records,
hasher.add(record);

// flip an unmapped slice to multi-ref if the record is placed
if (slice.isUnmapped() && record.isPlaced()) {
if (slice.isUnmapped() && record.isPlaced(header.APDelta)) {
slice.sequenceId = MULTI_REFERENCE;
}
else if (slice.isMappedSingleRef()) {
// flip a single-ref slice to multi-ref if the record is unplaced or on a different ref
if (!record.isPlaced() || slice.sequenceId != record.sequenceId) {
if (!record.isPlaced(header.APDelta) || slice.sequenceId != record.sequenceId) {
slice.sequenceId = MULTI_REFERENCE;
}
else {
// calculate single ref slice alignment

minAlStart = Math.min(record.alignmentStart, minAlStart);
maxAlEnd = Math.max(record.getAlignmentEnd(), maxAlEnd);
maxAlEnd = Math.max(record.getAlignmentEnd(header.APDelta), maxAlEnd);
}
}
}
Expand Down
25 changes: 25 additions & 0 deletions src/test/java/htsjdk/samtools/cram/build/ContainerFactoryTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,21 @@ static List<CramCompressionRecord> getSingleRefRecords(final int recordCount) {
return records;
}

static List<CramCompressionRecord> getUnmappedRecords() {
final List<CramCompressionRecord> records = new ArrayList<>();
for (int i = 0; i < 10; i++) {
final CramCompressionRecord record = createMappedRecord(i);
record.setSegmentUnmapped(true);
record.alignmentStart = SAMRecord.NO_ALIGNMENT_START;
record.sequenceId = SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
records.add(record);
}
return records;
}

// these two sets of records are "half" unplaced: they have either a valid reference index or start position,
// but not both. We treat these weird edge cases as unplaced.

static List<CramCompressionRecord> getUnmappedNoRefRecords() {
final List<CramCompressionRecord> records = new ArrayList<>();
for (int i = 0; i < 10; i++) {
Expand Down Expand Up @@ -150,6 +165,16 @@ public void testMapped() {
assertContainerState(container, recordCount, sequenceId, alignmentStart, alignmentSpan);
}

@Test
public void testUnmapped() {
final ContainerFactory factory = new ContainerFactory(getSAMFileHeaderForTests(), 10);
final Container container = factory.buildContainer(getUnmappedRecords());
assertContainerState(container, 10, SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, Slice.NO_ALIGNMENT_START, Slice.NO_ALIGNMENT_SPAN);
}

// these two sets of records are "half" unplaced: they have either a valid reference index or start position,
// but not both. We treat these weird edge cases as unplaced.

@Test
public void testUnmappedNoReferenceId() {
final ContainerFactory factory = new ContainerFactory(getSAMFileHeaderForTests(), 10);
Expand Down
10 changes: 10 additions & 0 deletions src/test/java/htsjdk/samtools/cram/build/ContainerParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,16 @@ private Object[][] refTestData() {
add(0);
}}
},
{
ContainerFactoryTest.getUnmappedRecords(),
new HashSet<Integer>() {{
add(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX);
}}
},

// these two sets of records are "half" unplaced: they have either a valid reference index or start position,
// but not both. We treat these weird edge cases as unplaced.

{
ContainerFactoryTest.getUnmappedNoRefRecords(),
new HashSet<Integer>() {{
Expand Down
Loading

0 comments on commit 061e217

Please sign in to comment.