-
Notifications
You must be signed in to change notification settings - Fork 245
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Only compute BIN on BAM write and on index building #1258
Changes from 2 commits
6e3d454
0c6ab4e
ef54a3c
336f4d2
2f49d75
31f093a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -108,18 +108,20 @@ public void encode(final SAMRecord alignment) { | |||||
final int readLength = alignment.getReadLength(); | ||||||
|
||||||
// if cigar is too long, put into CG tag and replace with sentinel value | ||||||
if (alignment.getCigarLength() > BAMRecord.MAX_CIGAR_OPERATORS) { | ||||||
final Cigar cigarToWrite; | ||||||
final boolean cigarSwitcharoo = alignment.getCigar().numCigarElements() > BAMRecord.MAX_CIGAR_OPERATORS; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is the change from There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The short answer is that some of the tests fail if I leave it as There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That scares the hell out of me... those seem like they should be identical.. |
||||||
|
||||||
if (cigarSwitcharoo) { | ||||||
final int[] cigarEncoding = BinaryCigarCodec.encode(alignment.getCigar()); | ||||||
alignment.setCigar(makeSentinelCigar(alignment.getCigar())); | ||||||
alignment.setAttribute(CG.name(), cigarEncoding); | ||||||
cigarToWrite = makeSentinelCigar(alignment.getCigar()); | ||||||
} | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. format
|
||||||
else { | ||||||
cigarToWrite = alignment.getCigar(); | ||||||
} | ||||||
|
||||||
// do not combine with previous call to alignment.getCigarLength() as cigar may change in-between | ||||||
final int cigarLength = alignment.getCigarLength(); | ||||||
|
||||||
int blockSize = BAMFileConstants.FIXED_BLOCK_SIZE + alignment.getReadNameLength() + 1 + // null terminated | ||||||
cigarLength * BAMRecord.CIGAR_SIZE_MULTIPLIER + | ||||||
cigarToWrite.numCigarElements() * BAMRecord.CIGAR_SIZE_MULTIPLIER + | ||||||
(readLength + 1) / 2 + // 2 bases per byte, round up | ||||||
readLength; | ||||||
|
||||||
|
@@ -139,8 +141,9 @@ public void encode(final SAMRecord alignment) { | |||||
// the actual cigar. | ||||||
int indexBin = 0; | ||||||
if (alignment.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START) { | ||||||
warnIfReferenceIsTooLargeForBinField(alignment); | ||||||
indexBin = alignment.computeIndexingBinIfAbsent(alignment); | ||||||
if (!warnIfReferenceIsTooLargeForBinField(alignment)) { | ||||||
indexBin = alignment.computeIndexingBin(); | ||||||
} | ||||||
} | ||||||
|
||||||
// Blurt out the elements | ||||||
|
@@ -151,7 +154,7 @@ public void encode(final SAMRecord alignment) { | |||||
this.binaryCodec.writeUByte((short) (alignment.getReadNameLength() + 1)); | ||||||
this.binaryCodec.writeUByte((short) alignment.getMappingQuality()); | ||||||
this.binaryCodec.writeUShort(indexBin); | ||||||
this.binaryCodec.writeUShort(cigarLength); | ||||||
this.binaryCodec.writeUShort(cigarToWrite.numCigarElements()); | ||||||
this.binaryCodec.writeUShort(alignment.getFlags()); | ||||||
this.binaryCodec.writeInt(alignment.getReadLength()); | ||||||
this.binaryCodec.writeInt(alignment.getMateReferenceIndex()); | ||||||
|
@@ -170,7 +173,7 @@ public void encode(final SAMRecord alignment) { | |||||
"; quals length: " + alignment.getBaseQualities().length); | ||||||
} | ||||||
this.binaryCodec.writeString(alignment.getReadName(), false, true); | ||||||
final int[] binaryCigar = BinaryCigarCodec.encode(alignment.getCigar()); | ||||||
final int[] binaryCigar = BinaryCigarCodec.encode(cigarToWrite); | ||||||
for (final int cigarElement : binaryCigar) { | ||||||
// Assumption that this will fit into an integer, despite the fact | ||||||
// that it is spec'ed as a uint. | ||||||
|
@@ -194,6 +197,10 @@ public void encode(final SAMRecord alignment) { | |||||
attribute = attribute.getNext(); | ||||||
} | ||||||
} | ||||||
|
||||||
if (cigarSwitcharoo) { | ||||||
alignment.setAttribute(CG.name(), null); | ||||||
} | ||||||
} | ||||||
|
||||||
/** | ||||||
|
@@ -223,15 +230,21 @@ public static Cigar makeSentinelCigar(final Cigar cigar) { | |||||
new CigarElement(cigar.getReferenceLength(), CigarOperator.N))); | ||||||
} | ||||||
|
||||||
private void warnIfReferenceIsTooLargeForBinField(final SAMRecord rec) { | ||||||
/** Emits a warning the first time a reference too large for binning indexing is encountered. | ||||||
* | ||||||
* @param rec the SAMRecord to examine | ||||||
* @return true if the sequence is too large, false otherwise | ||||||
*/ | ||||||
private boolean warnIfReferenceIsTooLargeForBinField(final SAMRecord rec) { | ||||||
final SAMSequenceRecord sequence = rec.getHeader() != null ? rec.getHeader().getSequence(rec.getReferenceName()) : null; | ||||||
if (!isReferenceSizeWarningShowed | ||||||
&& sequence != null | ||||||
&& SAMUtils.isReferenceSequenceCompatibleWithBAI(sequence) | ||||||
&& rec.getValidationStringency() != ValidationStringency.SILENT) { | ||||||
LOG.warn("Reference length is too large for BAM bin field. Values in the bin field could be incorrect."); | ||||||
final boolean tooLarge = sequence != null && SAMUtils.isReferenceSequenceIncompatibleWithBAI(sequence); | ||||||
if (!isReferenceSizeWarningShowed & tooLarge && rec.getValidationStringency() != ValidationStringency.SILENT) { | ||||||
tfenne marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
LOG.warn("Reference length is too large for BAM bin field."); | ||||||
LOG.warn("Reads on references longer than " + GenomicIndexUtil.BIN_GENOMIC_SPAN + "bp will have bin set to 0."); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. missing space in warning
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is intentional. It's normal to write |
||||||
isReferenceSizeWarningShowed = true; | ||||||
} | ||||||
|
||||||
return tooLarge; | ||||||
} | ||||||
|
||||||
/** | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -188,7 +188,6 @@ public class SAMRecord implements Cloneable, Locatable, Serializable { | |
private SAMBinaryTagAndValue mAttributes = null; | ||
protected Integer mReferenceIndex = null; | ||
protected Integer mMateReferenceIndex = null; | ||
private Integer mIndexingBin = null; | ||
|
||
/** | ||
* Some attributes (e.g. CIGAR) are not decoded immediately. Use this to decide how to validate when decoded. | ||
|
@@ -595,8 +594,6 @@ public void setAlignmentStart(final int value) { | |
mAlignmentStart = value; | ||
// Clear cached alignment end | ||
mAlignmentEnd = NO_ALIGNMENT_START; | ||
// Change to alignmentStart could change indexing bin | ||
setIndexingBin(null); | ||
} | ||
|
||
/** | ||
|
@@ -806,8 +803,6 @@ public void setCigarString(final String value) { | |
mAlignmentBlocks = null; | ||
// Clear cached alignment end | ||
mAlignmentEnd = NO_ALIGNMENT_START; | ||
// Change to cigar could change alignmentEnd, and thus indexing bin | ||
setIndexingBin(null); | ||
} | ||
|
||
/** | ||
|
@@ -844,8 +839,6 @@ public int getCigarLength() { | |
*/ | ||
public void setCigar(final Cigar cigar) { | ||
initializeCigar(cigar); | ||
// Change to cigar could change alignmentEnd, and thus indexing bin | ||
setIndexingBin(null); | ||
} | ||
|
||
/** | ||
|
@@ -886,8 +879,6 @@ public int getFlags() { | |
|
||
public void setFlags(final int value) { | ||
mFlags = value; | ||
// Could imply change to readUnmapped flag, which could change indexing bin | ||
setIndexingBin(null); | ||
} | ||
|
||
/** | ||
|
@@ -1043,8 +1034,6 @@ public void setReadUmappedFlag(final boolean flag) { | |
*/ | ||
public void setReadUnmappedFlag(final boolean flag) { | ||
setFlag(flag, SAMFlag.READ_UNMAPPED.flag); | ||
// Change to readUnmapped could change indexing bin | ||
setIndexingBin(null); | ||
} | ||
|
||
/** | ||
|
@@ -1564,23 +1553,15 @@ public List<SAMTagAndValue> getAttributes() { | |
return ret; | ||
} | ||
|
||
Integer getIndexingBin() { | ||
return mIndexingBin; | ||
} | ||
|
||
/** | ||
* Used internally when writing BAMRecords. | ||
* @param mIndexingBin c.f. http://samtools.sourceforge.net/SAM1.pdf | ||
*/ | ||
void setIndexingBin(final Integer mIndexingBin) { | ||
this.mIndexingBin = mIndexingBin; | ||
} | ||
|
||
/** | ||
* Does not change state of this. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. update comment to reflect the fact that there is a possible side-effect due to the need to parse the cigar. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
* @return indexing bin based on alignment start & end. | ||
*/ | ||
int computeIndexingBin() { | ||
if (getAlignmentStart() > GenomicIndexUtil.BIN_GENOMIC_SPAN || getAlignmentEnd() > GenomicIndexUtil.BIN_GENOMIC_SPAN) { | ||
throw new IllegalStateException("Read position too high for BAI bin indexing."); | ||
} | ||
|
||
// regionToBin has zero-based, half-open API | ||
final int alignmentStart = getAlignmentStart()-1; | ||
int alignmentEnd = getAlignmentEnd(); | ||
yfarjoun marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
@@ -1589,26 +1570,10 @@ int computeIndexingBin() { | |
// then treat this as a one base alignment for indexing purposes. | ||
alignmentEnd = alignmentStart + 1; | ||
} | ||
return GenomicIndexUtil.regionToBin(alignmentStart, alignmentEnd); | ||
} | ||
|
||
|
||
/** | ||
* Gets indexing bin if it is present otherwise compute it. | ||
* @param alignment to compute indexing bin for. | ||
* @return indexing bin. | ||
*/ | ||
public int computeIndexingBinIfAbsent(final SAMRecord alignment) { | ||
yfarjoun marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if (alignment.getIndexingBin() != null) { | ||
return alignment.getIndexingBin(); | ||
} else { | ||
return getUshort(alignment.computeIndexingBin()); | ||
} | ||
return GenomicIndexUtil.regionToBin(alignmentStart, alignmentEnd) & (int) BinaryCodec.MAX_USHORT; | ||
} | ||
|
||
private int getUshort(int value) { | ||
return value & (int) BinaryCodec.MAX_USHORT; | ||
} | ||
|
||
/** | ||
* @return the SAMFileHeader for this record. If the header is null, the following SAMRecord methods may throw | ||
|
@@ -1854,8 +1819,6 @@ public boolean equals(final Object o) { | |
if (mInferredInsertSize != samRecord.mInferredInsertSize) return false; | ||
if (mMappingQuality != samRecord.mMappingQuality) return false; | ||
if (mMateAlignmentStart != samRecord.mMateAlignmentStart) return false; | ||
if (mIndexingBin != null ? !mIndexingBin.equals(samRecord.mIndexingBin) : samRecord.mIndexingBin != null) | ||
return false; | ||
if (mMateReferenceIndex != null ? !mMateReferenceIndex.equals(samRecord.mMateReferenceIndex) : samRecord.mMateReferenceIndex != null) | ||
return false; | ||
if (mReferenceIndex != null ? !mReferenceIndex.equals(samRecord.mReferenceIndex) : samRecord.mReferenceIndex != null) | ||
|
@@ -1896,7 +1859,6 @@ public int hashCode() { | |
result = 31 * result + (mAttributes != null ? mAttributes.hashCode() : 0); | ||
result = 31 * result + (mReferenceIndex != null ? mReferenceIndex.hashCode() : 0); | ||
result = 31 * result + (mMateReferenceIndex != null ? mMateReferenceIndex.hashCode() : 0); | ||
result = 31 * result + (mIndexingBin != null ? mIndexingBin.hashCode() : 0); | ||
return result; | ||
} | ||
|
||
|
@@ -2112,27 +2074,6 @@ public List<SAMValidationError> isValid(final boolean firstOnly) { | |
if (firstOnly) return ret; | ||
} | ||
|
||
if (this.getAlignmentStart() != NO_ALIGNMENT_START) { | ||
final SAMSequenceRecord sequence = getHeader() != null ? getHeader().getSequence(getReferenceName()) : null; | ||
if (sequence != null && SAMUtils.isReferenceSequenceCompatibleWithBAI(sequence)) { | ||
if (getValidationStringency() != ValidationStringency.SILENT) { | ||
if (getReferenceName() != null) { | ||
LOG.warn(String.format("%s reference length is too large for BAM bin field. %s record bin field value is incorrect.", | ||
getReferenceName(), getReadName())); | ||
} else { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think keeping the reference length is too long for bam warning is a good idea. It's still problem you would want to know about if you're validating your bam I think. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is actually my primary reason for the PR in the first place! With this check here, unless you set validation stringency to silent, you get a warning logged per record when reading. So I'm getting millions of warnings! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh.. of course.. that makes sense. Maybe there's a way we can have ValidateSam warn exactly once if it encounters reads like that but not have it happen in the general case of just processing a sam file with validation on. |
||
LOG.warn(String.format("Reference length is too large for BAM bin field. %s record bin field value is incorrect.", | ||
getReadName())); | ||
} | ||
} | ||
} else if (isIndexingBinNotEqualsComputedBin()) { | ||
if (ret == null) ret = new ArrayList<>(); | ||
ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_INDEXING_BIN, | ||
"bin field of BAM record does not equal value computed based on alignment start and end, and length of sequence to which read is aligned", | ||
getReadName())); | ||
if (firstOnly) return ret; | ||
} | ||
} | ||
|
||
if (getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME) && | ||
getMateAlignmentStart() != SAMRecord.NO_ALIGNMENT_START) { | ||
if (ret == null) ret = new ArrayList<>(); | ||
|
@@ -2165,10 +2106,6 @@ public List<SAMValidationError> isValid(final boolean firstOnly) { | |
return ret; | ||
} | ||
|
||
private boolean isIndexingBinNotEqualsComputedBin() { | ||
return this.getIndexingBin() != null && this.computeIndexingBin() != this.getIndexingBin(); | ||
} | ||
|
||
/** | ||
* Gets the source of this SAM record -- both the reader that retrieved the record and the position on disk from | ||
* whence it came. | ||
|
@@ -2282,7 +2219,6 @@ public SAMRecord deepCopy() { | |
if (null != attributes) { | ||
newSAM.setAttributes(attributes.deepCopy()); | ||
} | ||
newSAM.setIndexingBin(getIndexingBin()); | ||
|
||
return newSAM; | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -258,7 +258,6 @@ public void testCRAMThroughBAMRoundTrip(final File originalCRAMFile, final File | |
// compare to originals | ||
int i = 0; | ||
for (SAMRecord rec : bamRecords) { | ||
rec.setIndexingBin(null); | ||
Assert.assertTrue(rec.equals(originalCRAMRecords.get(i++))); | ||
} | ||
Assert.assertEquals(i, originalCRAMRecords.size()); | ||
|
@@ -294,9 +293,6 @@ public void testBAMThroughCRAMRoundTrip() throws IOException, NoSuchAlgorithmExc | |
// retrieve all records from the bam and reset the indexing bins to keep comparisons with | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. update the comment? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've just removed the comment as all that line now does is load the records from a file and that didn't seem comment-worthy. |
||
// cram records from failing | ||
List<SAMRecord> originalBAMRecords = getSAMRecordsFromFile(originalBAMInputFile, referenceFile); | ||
for (int i = 0; i < originalBAMRecords.size(); i++) { | ||
originalBAMRecords.get(i).setIndexingBin(null); | ||
} | ||
|
||
// write the BAM records to a temporary CRAM | ||
final File tempCRAMFile = File.createTempFile("testBAMThroughCRAMRoundTrip", CramIO.CRAM_FILE_EXTENSION); | ||
|
@@ -327,9 +323,6 @@ public void testBAMThroughCRAMRoundTripViaPath() throws IOException, NoSuchAlgor | |
// retrieve all records from the bam and reset the indexing bins to keep comparisons with | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. update comment There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ditto |
||
// cram records from failing | ||
List<SAMRecord> originalBAMRecords = getSAMRecordsFromFile(originalBAMInputFile, referenceFile); | ||
for (int i = 0; i < originalBAMRecords.size(); i++) { | ||
originalBAMRecords.get(i).setIndexingBin(null); | ||
} | ||
|
||
// write the BAM records to a temporary CRAM | ||
try (FileSystem jimfs = Jimfs.newFileSystem(Configuration.unix())) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
format over lines, e.g.