-
Notifications
You must be signed in to change notification settings - Fork 244
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Only compute BIN on BAM write and on index building #1258
Changes from all commits
6e3d454
0c6ab4e
ef54a3c
336f4d2
2f49d75
31f093a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -107,19 +107,22 @@ public void encode(final SAMRecord alignment) { | |||||
// Compute block size, as it is the first element of the file representation of SAMRecord | ||||||
final int readLength = alignment.getReadLength(); | ||||||
|
||||||
// if cigar is too long, put into CG tag and replace with sentinel value | ||||||
if (alignment.getCigarLength() > BAMRecord.MAX_CIGAR_OPERATORS) { | ||||||
// If cigar is too long, put into CG tag and replace with sentinel value. | ||||||
// Using alignment.getCigarLength() here causes problems, so access the cigar instead | ||||||
final Cigar cigarToWrite; | ||||||
final boolean cigarSwitcharoo = alignment.getCigar().numCigarElements() > BAMRecord.MAX_CIGAR_OPERATORS; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is the change from There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The short answer is that some of the tests fail if I leave it as There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That scares the hell out of me... those seem like they should be identical.. |
||||||
|
||||||
if (cigarSwitcharoo) { | ||||||
final int[] cigarEncoding = BinaryCigarCodec.encode(alignment.getCigar()); | ||||||
alignment.setCigar(makeSentinelCigar(alignment.getCigar())); | ||||||
alignment.setAttribute(CG.name(), cigarEncoding); | ||||||
cigarToWrite = makeSentinelCigar(alignment.getCigar()); | ||||||
} | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. format
|
||||||
else { | ||||||
cigarToWrite = alignment.getCigar(); | ||||||
} | ||||||
|
||||||
// do not combine with previous call to alignment.getCigarLength() as cigar may change in-between | ||||||
final int cigarLength = alignment.getCigarLength(); | ||||||
|
||||||
int blockSize = BAMFileConstants.FIXED_BLOCK_SIZE + alignment.getReadNameLength() + 1 + // null terminated | ||||||
cigarLength * BAMRecord.CIGAR_SIZE_MULTIPLIER + | ||||||
cigarToWrite.numCigarElements() * BAMRecord.CIGAR_SIZE_MULTIPLIER + | ||||||
(readLength + 1) / 2 + // 2 bases per byte, round up | ||||||
readLength; | ||||||
|
||||||
|
@@ -139,8 +142,9 @@ public void encode(final SAMRecord alignment) { | |||||
// the actual cigar. | ||||||
int indexBin = 0; | ||||||
if (alignment.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START) { | ||||||
warnIfReferenceIsTooLargeForBinField(alignment); | ||||||
indexBin = alignment.computeIndexingBinIfAbsent(alignment); | ||||||
if (!warnIfReferenceIsTooLargeForBinField(alignment)) { | ||||||
indexBin = alignment.computeIndexingBin(); | ||||||
} | ||||||
} | ||||||
|
||||||
// Blurt out the elements | ||||||
|
@@ -151,7 +155,7 @@ public void encode(final SAMRecord alignment) { | |||||
this.binaryCodec.writeUByte((short) (alignment.getReadNameLength() + 1)); | ||||||
this.binaryCodec.writeUByte((short) alignment.getMappingQuality()); | ||||||
this.binaryCodec.writeUShort(indexBin); | ||||||
this.binaryCodec.writeUShort(cigarLength); | ||||||
this.binaryCodec.writeUShort(cigarToWrite.numCigarElements()); | ||||||
this.binaryCodec.writeUShort(alignment.getFlags()); | ||||||
this.binaryCodec.writeInt(alignment.getReadLength()); | ||||||
this.binaryCodec.writeInt(alignment.getMateReferenceIndex()); | ||||||
|
@@ -170,7 +174,7 @@ public void encode(final SAMRecord alignment) { | |||||
"; quals length: " + alignment.getBaseQualities().length); | ||||||
} | ||||||
this.binaryCodec.writeString(alignment.getReadName(), false, true); | ||||||
final int[] binaryCigar = BinaryCigarCodec.encode(alignment.getCigar()); | ||||||
final int[] binaryCigar = BinaryCigarCodec.encode(cigarToWrite); | ||||||
for (final int cigarElement : binaryCigar) { | ||||||
// Assumption that this will fit into an integer, despite the fact | ||||||
// that it is spec'ed as a uint. | ||||||
|
@@ -194,6 +198,10 @@ public void encode(final SAMRecord alignment) { | |||||
attribute = attribute.getNext(); | ||||||
} | ||||||
} | ||||||
|
||||||
if (cigarSwitcharoo) { | ||||||
alignment.setAttribute(CG.name(), null); | ||||||
} | ||||||
} | ||||||
|
||||||
/** | ||||||
|
@@ -223,15 +231,21 @@ public static Cigar makeSentinelCigar(final Cigar cigar) { | |||||
new CigarElement(cigar.getReferenceLength(), CigarOperator.N))); | ||||||
} | ||||||
|
||||||
private void warnIfReferenceIsTooLargeForBinField(final SAMRecord rec) { | ||||||
/** Emits a warning the first time a reference too large for binning indexing is encountered. | ||||||
* | ||||||
* @param rec the SAMRecord to examine | ||||||
* @return true if the sequence is too large, false otherwise | ||||||
*/ | ||||||
private boolean warnIfReferenceIsTooLargeForBinField(final SAMRecord rec) { | ||||||
final SAMSequenceRecord sequence = rec.getHeader() != null ? rec.getHeader().getSequence(rec.getReferenceName()) : null; | ||||||
if (!isReferenceSizeWarningShowed | ||||||
&& sequence != null | ||||||
&& SAMUtils.isReferenceSequenceCompatibleWithBAI(sequence) | ||||||
&& rec.getValidationStringency() != ValidationStringency.SILENT) { | ||||||
LOG.warn("Reference length is too large for BAM bin field. Values in the bin field could be incorrect."); | ||||||
final boolean tooLarge = sequence != null && SAMUtils.isReferenceSequenceIncompatibleWithBAI(sequence); | ||||||
if (!isReferenceSizeWarningShowed && tooLarge && rec.getValidationStringency() != ValidationStringency.SILENT) { | ||||||
LOG.warn("Reference length is too large for BAM bin field."); | ||||||
LOG.warn("Reads on references longer than " + GenomicIndexUtil.BIN_GENOMIC_SPAN + "bp will have bin set to 0."); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. missing space in warning
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is intentional. It's normal to write |
||||||
isReferenceSizeWarningShowed = true; | ||||||
} | ||||||
|
||||||
return tooLarge; | ||||||
} | ||||||
|
||||||
/** | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -188,7 +188,6 @@ public class SAMRecord implements Cloneable, Locatable, Serializable { | |
private SAMBinaryTagAndValue mAttributes = null; | ||
protected Integer mReferenceIndex = null; | ||
protected Integer mMateReferenceIndex = null; | ||
private Integer mIndexingBin = null; | ||
|
||
/** | ||
* Some attributes (e.g. CIGAR) are not decoded immediately. Use this to decide how to validate when decoded. | ||
|
@@ -595,8 +594,6 @@ public void setAlignmentStart(final int value) { | |
mAlignmentStart = value; | ||
// Clear cached alignment end | ||
mAlignmentEnd = NO_ALIGNMENT_START; | ||
// Change to alignmentStart could change indexing bin | ||
setIndexingBin(null); | ||
} | ||
|
||
/** | ||
|
@@ -806,8 +803,6 @@ public void setCigarString(final String value) { | |
mAlignmentBlocks = null; | ||
// Clear cached alignment end | ||
mAlignmentEnd = NO_ALIGNMENT_START; | ||
// Change to cigar could change alignmentEnd, and thus indexing bin | ||
setIndexingBin(null); | ||
} | ||
|
||
/** | ||
|
@@ -844,8 +839,6 @@ public int getCigarLength() { | |
*/ | ||
public void setCigar(final Cigar cigar) { | ||
initializeCigar(cigar); | ||
// Change to cigar could change alignmentEnd, and thus indexing bin | ||
setIndexingBin(null); | ||
} | ||
|
||
/** | ||
|
@@ -886,8 +879,6 @@ public int getFlags() { | |
|
||
public void setFlags(final int value) { | ||
mFlags = value; | ||
// Could imply change to readUnmapped flag, which could change indexing bin | ||
setIndexingBin(null); | ||
} | ||
|
||
/** | ||
|
@@ -1043,8 +1034,6 @@ public void setReadUmappedFlag(final boolean flag) { | |
*/ | ||
public void setReadUnmappedFlag(final boolean flag) { | ||
setFlag(flag, SAMFlag.READ_UNMAPPED.flag); | ||
// Change to readUnmapped could change indexing bin | ||
setIndexingBin(null); | ||
} | ||
|
||
/** | ||
|
@@ -1564,52 +1553,34 @@ public List<SAMTagAndValue> getAttributes() { | |
return ret; | ||
} | ||
|
||
Integer getIndexingBin() { | ||
return mIndexingBin; | ||
} | ||
|
||
/** | ||
* Used internally when writing BAMRecords. | ||
* @param mIndexingBin c.f. http://samtools.sourceforge.net/SAM1.pdf | ||
* @deprecated Use computeIndexingBin() if accessible or GenomicIndexUtil.regionToBin() otherwise. | ||
*/ | ||
void setIndexingBin(final Integer mIndexingBin) { | ||
this.mIndexingBin = mIndexingBin; | ||
} | ||
@Deprecated() public int computeIndexingBinIfAbsent(final SAMRecord alignment) { return alignment.computeIndexingBin(); } | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. format across lines |
||
|
||
/** | ||
* Does not change state of this. | ||
* Computes the BAI indexing bin for the record. Invokes getAlignmentEnd() among other methods, which may | ||
* cause the record to deserialize/parse the cigar is necessary. | ||
* | ||
* @return indexing bin based on alignment start & end. | ||
*/ | ||
int computeIndexingBin() { | ||
// regionToBin has zero-based, half-open API | ||
final int alignmentStart = getAlignmentStart()-1; | ||
final int alignmentStart = getAlignmentStart() - 1; // BIN uses 0-based half-open | ||
int alignmentEnd = getAlignmentEnd(); | ||
yfarjoun marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if (alignmentEnd <= 0) { | ||
// If alignment end cannot be determined (e.g. because this read is not really aligned), | ||
// then treat this as a one base alignment for indexing purposes. | ||
alignmentEnd = alignmentStart + 1; | ||
} | ||
return GenomicIndexUtil.regionToBin(alignmentStart, alignmentEnd); | ||
} | ||
|
||
|
||
/** | ||
* Gets indexing bin if it is present otherwise compute it. | ||
* @param alignment to compute indexing bin for. | ||
* @return indexing bin. | ||
*/ | ||
public int computeIndexingBinIfAbsent(final SAMRecord alignment) { | ||
yfarjoun marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if (alignment.getIndexingBin() != null) { | ||
return alignment.getIndexingBin(); | ||
} else { | ||
return getUshort(alignment.computeIndexingBin()); | ||
if (alignmentStart > GenomicIndexUtil.BIN_GENOMIC_SPAN || alignmentEnd > GenomicIndexUtil.BIN_GENOMIC_SPAN) { | ||
throw new IllegalStateException("Read position too high for BAI bin indexing."); | ||
} | ||
} | ||
|
||
private int getUshort(int value) { | ||
return value & (int) BinaryCodec.MAX_USHORT; | ||
return GenomicIndexUtil.regionToBin(alignmentStart, alignmentEnd) & (int) BinaryCodec.MAX_USHORT; | ||
} | ||
|
||
|
||
/** | ||
* @return the SAMFileHeader for this record. If the header is null, the following SAMRecord methods may throw | ||
* exceptions: | ||
|
@@ -1854,8 +1825,6 @@ public boolean equals(final Object o) { | |
if (mInferredInsertSize != samRecord.mInferredInsertSize) return false; | ||
if (mMappingQuality != samRecord.mMappingQuality) return false; | ||
if (mMateAlignmentStart != samRecord.mMateAlignmentStart) return false; | ||
if (mIndexingBin != null ? !mIndexingBin.equals(samRecord.mIndexingBin) : samRecord.mIndexingBin != null) | ||
return false; | ||
if (mMateReferenceIndex != null ? !mMateReferenceIndex.equals(samRecord.mMateReferenceIndex) : samRecord.mMateReferenceIndex != null) | ||
return false; | ||
if (mReferenceIndex != null ? !mReferenceIndex.equals(samRecord.mReferenceIndex) : samRecord.mReferenceIndex != null) | ||
|
@@ -1896,7 +1865,6 @@ public int hashCode() { | |
result = 31 * result + (mAttributes != null ? mAttributes.hashCode() : 0); | ||
result = 31 * result + (mReferenceIndex != null ? mReferenceIndex.hashCode() : 0); | ||
result = 31 * result + (mMateReferenceIndex != null ? mMateReferenceIndex.hashCode() : 0); | ||
result = 31 * result + (mIndexingBin != null ? mIndexingBin.hashCode() : 0); | ||
return result; | ||
} | ||
|
||
|
@@ -2112,27 +2080,6 @@ public List<SAMValidationError> isValid(final boolean firstOnly) { | |
if (firstOnly) return ret; | ||
} | ||
|
||
if (this.getAlignmentStart() != NO_ALIGNMENT_START) { | ||
final SAMSequenceRecord sequence = getHeader() != null ? getHeader().getSequence(getReferenceName()) : null; | ||
if (sequence != null && SAMUtils.isReferenceSequenceCompatibleWithBAI(sequence)) { | ||
if (getValidationStringency() != ValidationStringency.SILENT) { | ||
if (getReferenceName() != null) { | ||
LOG.warn(String.format("%s reference length is too large for BAM bin field. %s record bin field value is incorrect.", | ||
getReferenceName(), getReadName())); | ||
} else { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think keeping the reference length is too long for bam warning is a good idea. It's still problem you would want to know about if you're validating your bam I think. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is actually my primary reason for the PR in the first place! With this check here, unless you set validation stringency to silent, you get a warning logged per record when reading. So I'm getting millions of warnings! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh.. of course.. that makes sense. Maybe there's a way we can have ValidateSam warn exactly once if it encounters reads like that but not have it happen in the general case of just processing a sam file with validation on. |
||
LOG.warn(String.format("Reference length is too large for BAM bin field. %s record bin field value is incorrect.", | ||
getReadName())); | ||
} | ||
} | ||
} else if (isIndexingBinNotEqualsComputedBin()) { | ||
if (ret == null) ret = new ArrayList<>(); | ||
ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_INDEXING_BIN, | ||
"bin field of BAM record does not equal value computed based on alignment start and end, and length of sequence to which read is aligned", | ||
getReadName())); | ||
if (firstOnly) return ret; | ||
} | ||
} | ||
|
||
if (getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME) && | ||
getMateAlignmentStart() != SAMRecord.NO_ALIGNMENT_START) { | ||
if (ret == null) ret = new ArrayList<>(); | ||
|
@@ -2165,10 +2112,6 @@ public List<SAMValidationError> isValid(final boolean firstOnly) { | |
return ret; | ||
} | ||
|
||
private boolean isIndexingBinNotEqualsComputedBin() { | ||
return this.getIndexingBin() != null && this.computeIndexingBin() != this.getIndexingBin(); | ||
} | ||
|
||
/** | ||
* Gets the source of this SAM record -- both the reader that retrieved the record and the position on disk from | ||
* whence it came. | ||
|
@@ -2282,7 +2225,6 @@ public SAMRecord deepCopy() { | |
if (null != attributes) { | ||
newSAM.setAttributes(attributes.deepCopy()); | ||
} | ||
newSAM.setIndexingBin(getIndexingBin()); | ||
|
||
return newSAM; | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1250,11 +1250,19 @@ public static List<SAMRecord> getOtherCanonicalAlignments(final SAMRecord record | |
return alignments; | ||
} | ||
|
||
/** | ||
* @deprecated because the method does the exact opposite of what it says. Use the correctly named | ||
* isReferenceSequenceIncompatibleWithBAI() instead. | ||
*/ | ||
@Deprecated public static boolean isReferenceSequenceCompatibleWithBAI(final SAMSequenceRecord sequence) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. oh god, that's horrible There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Indeed. I was really confused initially. |
||
return isReferenceSequenceIncompatibleWithBAI(sequence); | ||
} | ||
|
||
/** | ||
* Checks if reference sequence is compatible with BAI indexing format. | ||
* @param sequence reference sequence. | ||
*/ | ||
public static boolean isReferenceSequenceCompatibleWithBAI(final SAMSequenceRecord sequence) { | ||
yfarjoun marked this conversation as resolved.
Show resolved
Hide resolved
|
||
public static boolean isReferenceSequenceIncompatibleWithBAI(final SAMSequenceRecord sequence) { | ||
return sequence.getSequenceLength() > GenomicIndexUtil.BIN_GENOMIC_SPAN; | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
format over lines, e.g.