Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix restoration of read base feature code. #1590

Merged
merged 2 commits into from
Feb 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.cram.CRAMException;
import htsjdk.samtools.cram.build.Utils;
import htsjdk.samtools.cram.encoding.readfeatures.*;
import htsjdk.samtools.util.SequenceUtil;
Expand Down Expand Up @@ -437,30 +438,45 @@ public static byte[] restoreReadBases(
final InsertBase insert = (InsertBase) variation;
bases[posInRead++ - 1] = insert.getBase();
break;
case Bases.operator:
final Bases readBases = (Bases) variation;
for (byte b : readBases.getBases()) {
bases[posInRead++ - 1] = b;
}
break;
case RefSkip.operator:
posInSeq += ((RefSkip) variation).getLength();
break;
case Bases.operator:
case ReadBase.operator:
break; // defer until after the reference bases are retrieved
case Scores.operator:
case BaseQualityScore.operator:
break; // handled by resolveQualityScores
case Padding.operator:
case HardClip.operator:
break; // handled by getCigarForReadFeatures
default: throw new CRAMException(String.format("Unrecognized read feature code: %c", variation.getOperator()));
}
}

for (; posInRead <= readLength
&& alignmentStart + posInSeq - zeroBasedReferenceOffset < referenceBases.length; posInRead++, posInSeq++) {
bases[posInRead - 1] = referenceBases[alignmentStart + posInSeq - zeroBasedReferenceOffset];
if (referenceBases != null) {
for (; posInRead <= readLength
&& alignmentStart + posInSeq - zeroBasedReferenceOffset < referenceBases.length; posInRead++, posInSeq++) {
bases[posInRead - 1] = referenceBases[alignmentStart + posInSeq - zeroBasedReferenceOffset];
}
}

// ReadBase overwrites bases:
// ReadBase and Bases feature codes overwrite bases:
for (final ReadFeature variation : variations) {
switch (variation.getOperator()) {
case ReadBase.operator:
final ReadBase readBase = (ReadBase) variation;
bases[variation.getPosition() - 1] = readBase.getBase();
break;
case Bases.operator:
final Bases basesOp = (Bases) variation;
System.arraycopy(
basesOp.getBases(),
0,
bases,
variation.getPosition() - 1,
basesOp.getBases().length);
break;
default:
break;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,17 @@
import htsjdk.HtsjdkTest;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.cram.structure.*;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.io.File;
import java.io.IOException;

public class CRAMRecordReadFeaturesTest extends HtsjdkTest {

@DataProvider(name = "cigarTest")
Expand Down Expand Up @@ -35,4 +41,50 @@ public void testCigarFidelity(
Assert.assertEquals(cigar.toString(), expectedCigarString);
}

/**
 * Supplies the file triples consumed by the "readFeatureTestData" data provider.
 *
 * @return rows of {cram path, truth sam path, reference path}; the reference entry may be null
 */
@DataProvider(name="readFeatureTestData")
private Object[][] getReadFeatureTestData() {
    final String resourceRoot = "src/test/resources/htsjdk/samtools/cram/";

    // CRAM taken from the hts-specs CRAM test files; its reads carry ReadBase ('B') and
    // Bases ('b') feature codes.
    final Object[] mappedWithReference = {
            resourceRoot + "0503_mapped.cram",
            resourceRoot + "0503_mapped.sam",
            resourceRoot + "ce.fa"
    };

    // CRAM provided as part of https://github.com/samtools/htsjdk/issues/1379; it does not use
    // reference-based compression (so no reference is required) and uses Bases ('b') and
    // SoftClip ('S') feature codes. The truth SAM was created from the CRAM via samtools.
    final Object[] referenceNotRequired = {
            resourceRoot + "referenceNotRequired.cram",
            resourceRoot + "referenceNotRequired.sam",
            null
    };

    return new Object[][]{mappedWithReference, referenceNotRequired};
}

/**
 * Checks that the records decoded from a CRAM file match, record by record, those read from a
 * truth SAM file (see https://github.com/samtools/htsjdk/issues/1379). Read bases, base
 * qualities and the CIGAR string are compared, and both files must contain the same number of
 * records.
 *
 * @param cramFileName path of the CRAM file under test
 * @param samFileName path of the SAM file holding the expected records
 * @param referenceFileName path of the reference file, or null when no reference is supplied
 * @throws IOException declared for closing the readers and iterators
 */
@Test(dataProvider = "readFeatureTestData")
private void readFeatureTest(
        final String cramFileName,
        final String samFileName,
        final String referenceFileName
) throws IOException {
    final File testCRAM = new File(cramFileName);
    final File testSAM = new File(samFileName);
    final File referenceFile = referenceFileName == null ? null : new File(referenceFileName);

    // The iterators are opened in the resource header so they are closed before their readers.
    try (final SamReader cramReader = SamReaderFactory.make().referenceSequence(referenceFile).open(testCRAM);
         final SamReader samReader = SamReaderFactory.make().referenceSequence(referenceFile).open(testSAM);
         final SAMRecordIterator cramIterator = cramReader.iterator();
         final SAMRecordIterator samIterator = samReader.iterator()) {

        while (samIterator.hasNext() && cramIterator.hasNext()) {
            final SAMRecord samRecord = samIterator.next();
            final SAMRecord cramRecord = cramIterator.next();

            // TestNG's assertEquals takes (actual, expected): the CRAM-decoded value is the one
            // under test and the SAM value is the truth, so failures report them the right way round.
            Assert.assertEquals(cramRecord.getReadBases(), samRecord.getReadBases());
            Assert.assertEquals(cramRecord.getBaseQualities(), samRecord.getBaseQualities());
            Assert.assertEquals(cramRecord.getCigarString(), samRecord.getCigarString());
        }
        // Either both iterators are exhausted, or one file has extra records and the test fails.
        Assert.assertEquals(cramIterator.hasNext(), samIterator.hasNext());
    }
}

}
Binary file not shown.
4 changes: 4 additions & 0 deletions src/test/resources/htsjdk/samtools/cram/0503_mapped.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
@SQ SN:CHROMOSOME_I LN:1009800 M5:8ede36131e0dbf3417807e48f77f3ebd UR:/nfs/users/nfs_j/jkb/work/samtools_master/hts-specs/test/CRAM/passed/ce.fa
@PG ID:samtools PN:samtools VN:1.14 CL:samtools view -h -T src/test/resources/htsjdk/samtools/cram/ce.fa 0503_mapped.cram
match 99 CHROMOSOME_I 1000 40 100M = 1200 300 RTTTTTCGGGTTTTTTGAAATGAATATCGTAGCTACAGAAACGGTTGTGCACTCATCTGAAAGTTTGTTTTTCTTGTTTTCTTGCACTTTGTGCAGAATR #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC MD:Z:0A98T0 NM:i:2
match 147 CHROMOSOME_I 1200 40 100M = 1000 -300 YYGTTTTAGAAAAATTATTTTTAAGAATTTTTCATTTTAGGAATATTGTTATTTCAGAAAATAGCTAAATGTGATTTCTGTAATTTTGCCTGCCAAAGYY #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC MD:Z:0T0T0T94T0T0C0 NM:i:6
Loading