Skip to content

Commit

Permalink
Fix restoration of read base feature code. #1379 (#1590)
Browse files Browse the repository at this point in the history
* Fix #1379
* Fix decoding of CRAM Bases read features.
* Tighten up read feature handling.
  • Loading branch information
cmnbroad authored Feb 7, 2022
1 parent c0b61a2 commit b5af659
Show file tree
Hide file tree
Showing 5 changed files with 2,668 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.cram.CRAMException;
import htsjdk.samtools.cram.build.Utils;
import htsjdk.samtools.cram.encoding.readfeatures.*;
import htsjdk.samtools.util.SequenceUtil;
Expand Down Expand Up @@ -437,30 +438,45 @@ public static byte[] restoreReadBases(
final InsertBase insert = (InsertBase) variation;
bases[posInRead++ - 1] = insert.getBase();
break;
case Bases.operator:
final Bases readBases = (Bases) variation;
for (byte b : readBases.getBases()) {
bases[posInRead++ - 1] = b;
}
break;
case RefSkip.operator:
posInSeq += ((RefSkip) variation).getLength();
break;
case Bases.operator:
case ReadBase.operator:
break; // defer until after the reference bases are retrieved
case Scores.operator:
case BaseQualityScore.operator:
break; // handled by resolveQualityScores
case Padding.operator:
case HardClip.operator:
break; // handled by getCigarForReadFeatures
default: throw new CRAMException(String.format("Unrecognized read feature code: %c", variation.getOperator()));
}
}

for (; posInRead <= readLength
&& alignmentStart + posInSeq - zeroBasedReferenceOffset < referenceBases.length; posInRead++, posInSeq++) {
bases[posInRead - 1] = referenceBases[alignmentStart + posInSeq - zeroBasedReferenceOffset];
if (referenceBases != null) {
for (; posInRead <= readLength
&& alignmentStart + posInSeq - zeroBasedReferenceOffset < referenceBases.length; posInRead++, posInSeq++) {
bases[posInRead - 1] = referenceBases[alignmentStart + posInSeq - zeroBasedReferenceOffset];
}
}

// ReadBase overwrites bases:
// ReadBase and Bases feature codes overwrite bases:
for (final ReadFeature variation : variations) {
switch (variation.getOperator()) {
case ReadBase.operator:
final ReadBase readBase = (ReadBase) variation;
bases[variation.getPosition() - 1] = readBase.getBase();
break;
case Bases.operator:
final Bases basesOp = (Bases) variation;
System.arraycopy(
basesOp.getBases(),
0,
bases,
variation.getPosition() - 1,
basesOp.getBases().length);
break;
default:
break;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,17 @@
import htsjdk.HtsjdkTest;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.cram.structure.*;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.io.File;
import java.io.IOException;

public class CRAMRecordReadFeaturesTest extends HtsjdkTest {

@DataProvider(name = "cigarTest")
Expand Down Expand Up @@ -35,4 +41,50 @@ public void testCigarFidelity(
Assert.assertEquals(cigar.toString(), expectedCigarString);
}

/**
 * Supplies the inputs for the read feature round-trip test. Each row holds, in order:
 * the path of the CRAM file under test, the path of the truth SAM file, and the path
 * of the reference (which may be {@code null}).
 */
@DataProvider(name="readFeatureTestData")
private Object[][] getReadFeatureTestData() {
    final String testDir = "src/test/resources/htsjdk/samtools/cram/";

    // CRAM taken from the hts-specs CRAM test files; it has reads with ReadBase ('B') and
    // Bases ('b') feature codes.
    final Object[] htsSpecsMappedCase = {
            testDir + "0503_mapped.cram",
            testDir + "0503_mapped.sam",
            testDir + "ce.fa"
    };

    // CRAM provided as part of https://github.com/samtools/htsjdk/issues/1379; it does not use
    // reference-based compression (so no reference is required) and uses Bases ('b') and
    // SoftClip ('S') feature codes. The SAM file was created from the CRAM via samtools.
    final Object[] referenceNotRequiredCase = {
            testDir + "referenceNotRequired.cram",
            testDir + "referenceNotRequired.sam",
            null
    };

    return new Object[][]{
            htsSpecsMappedCase,
            referenceNotRequiredCase
    };
}

/**
 * Ensures read features are handled correctly on read by comparing the SAMRecords created
 * when reading a CRAM with the SAMRecords from the corresponding truth SAM
 * (see https://github.com/samtools/htsjdk/issues/1379).
 *
 * @param cramFileName path of the CRAM file under test
 * @param samFileName path of the truth SAM file
 * @param referenceFileName path of the reference, or {@code null} when none is supplied
 * @throws IOException if closing either reader fails
 */
@Test(dataProvider = "readFeatureTestData")
private void readFeatureTest(
        final String cramFileName,
        final String samFileName,
        final String referenceFileName
) throws IOException {
    final File testCRAM = new File(cramFileName);
    final File testSAM = new File(samFileName);
    final File referenceFile = referenceFileName == null ? null : new File(referenceFileName);

    // The iterators are declared as resources so they are closed (before their readers) even
    // when an assertion fails part-way through the comparison.
    try (final SamReader cramReader = SamReaderFactory.make().referenceSequence(referenceFile).open(testCRAM);
         final SamReader samReader = SamReaderFactory.make().referenceSequence(referenceFile).open(testSAM);
         final SAMRecordIterator cramIterator = cramReader.iterator();
         final SAMRecordIterator samIterator = samReader.iterator()) {

        int recordCount = 0;
        while (samIterator.hasNext() && cramIterator.hasNext()) {
            final SAMRecord samRecord = samIterator.next();
            final SAMRecord cramRecord = cramIterator.next();
            recordCount++;

            // TestNG's argument order is (actual, expected): the CRAM-derived record is the value
            // under test and the SAM record is the truth.
            Assert.assertEquals(cramRecord.getReadBases(), samRecord.getReadBases(),
                    "read bases differ at record " + recordCount);
            Assert.assertEquals(cramRecord.getBaseQualities(), samRecord.getBaseQualities(),
                    "base qualities differ at record " + recordCount);
            Assert.assertEquals(cramRecord.getCigarString(), samRecord.getCigarString(),
                    "cigar differs at record " + recordCount);
        }

        // Both inputs must be exhausted at the same time, i.e. contain the same number of records.
        Assert.assertEquals(cramIterator.hasNext(), samIterator.hasNext(),
                "CRAM and SAM record counts differ after " + recordCount + " records");
        // Guard against the comparison passing vacuously because nothing was read.
        Assert.assertTrue(recordCount > 0, "no records were compared for " + cramFileName);
    }
}

}
Binary file not shown.
4 changes: 4 additions & 0 deletions src/test/resources/htsjdk/samtools/cram/0503_mapped.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
@SQ SN:CHROMOSOME_I LN:1009800 M5:8ede36131e0dbf3417807e48f77f3ebd UR:/nfs/users/nfs_j/jkb/work/samtools_master/hts-specs/test/CRAM/passed/ce.fa
@PG ID:samtools PN:samtools VN:1.14 CL:samtools view -h -T src/test/resources/htsjdk/samtools/cram/ce.fa 0503_mapped.cram
match 99 CHROMOSOME_I 1000 40 100M = 1200 300 RTTTTTCGGGTTTTTTGAAATGAATATCGTAGCTACAGAAACGGTTGTGCACTCATCTGAAAGTTTGTTTTTCTTGTTTTCTTGCACTTTGTGCAGAATR #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC MD:Z:0A98T0 NM:i:2
match 147 CHROMOSOME_I 1200 40 100M = 1000 -300 YYGTTTTAGAAAAATTATTTTTAAGAATTTTTCATTTTAGGAATATTGTTATTTCAGAAAATAGCTAAATGTGATTTCTGTAATTTTGCCTGCCAAAGYY #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC MD:Z:0T0T0T94T0T0C0 NM:i:6
Loading

0 comments on commit b5af659

Please sign in to comment.