Skip to content

Commit

Permalink
Update name separator handling.
Browse files Browse the repository at this point in the history
  • Loading branch information
cmnbroad committed Jan 7, 2025
1 parent 91c1cce commit 2b4ffd8
Show file tree
Hide file tree
Showing 4 changed files with 97 additions and 72 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,22 @@
// so we don't have to repeatedly interconvert them when fetching from this list

public class NameTokenisationDecode {
//TODO: lift these values to a common location since they're used by the encoder, the decoder, and the tests
// use a single byte to separate the names in the decoded buffer; this particular byte is chosen because the
// calling code in the CRAM reader for read names already assumes the decompressed data will be a block of
// BYTE_ARRAY_STOP '\0' separated names
//TODO: lift this value to a common location since it's used by the encoder, the decoder, and the tests
// use a single byte to separate the names in the buffer used for encoding/decoding; this particular byte is
// chosen because the calling code in the CRAM reader for read names already assumes the decompressed
// data will be a block of BYTE_ARRAY_STOP '\0' separated names
public final static byte NAME_SEPARATOR = 0;
public final static CharSequence LOCAL_NAME_SEPARATOR_CHARSEQUENCE = new String(new byte[] {NAME_SEPARATOR});

public static final int DEFAULT_POSITION_ALLOCATION = 30;

/**
* Return is a byte[] containing the read names, each separated by a NAME_SEPARATOR byte, including a terminating
* separator.
* Uncompress the compressed name data in the input buffer. Returns a byte[] containing the read names,
* each separated by the byte value specified by nameSeparator, including a terminating separator.
* @param inBuffer the buffer to uncompress
* @param nameSeparator the name separator byte to use in the output buffer
* @return the uncompressed read names
*/
//TODO: the caller needs to be able to specify the stop/separator byte via a parameter
public byte[] uncompress(final ByteBuffer inBuffer) {
public byte[] uncompress(final ByteBuffer inBuffer, final byte nameSeparator) {
inBuffer.order(ByteOrder.LITTLE_ENDIAN);
final int uncompressedLength = inBuffer.getInt();

Expand All @@ -47,7 +48,7 @@ public byte[] uncompress(final ByteBuffer inBuffer) {
final ByteBuffer decodedNames = CompressionUtils.allocateByteBuffer(uncompressedLength);
for (int i = 0; i < numNames; i++) {
decodedNames.put(decodeSingleName(tokenStreams, decodedNameTokens, i));
decodedNames.put((byte) NAME_SEPARATOR);
decodedNames.put(nameSeparator);
}
return decodedNames.array();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,6 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;

//TODO: it's super wasteful (but simpler) to always store the accumulated tokens as Strings, since this results
// in lots of String <-> int interconversions
//TODO: enforce a maximum of 128 tokens

/**
* A very naive implementation of a name tokenization encoder.
*
Expand All @@ -46,21 +42,28 @@ public class NameTokenisationEncode {
private int maxStringValueLength; // longest *String* value for any token

/**
* Input buffer format is the read names, separated by the NAME_SEPARATOR byte, including a terminal separator.
* Compress the input buffer of read names.
* @param inBuffer formatted as read names separated by the byte specified by the nameSeparator parameter
* @param useArith true if the arithmetic coder should be used
* @param nameSeparator the byte used to separate the read names in inBuffer
* @return the compressed buffer
*/
//TODO: the caller needs to be able to specify the stop byte via parameter
public ByteBuffer compress(final ByteBuffer inBuffer, final boolean useArith) {
public ByteBuffer compress(final ByteBuffer inBuffer, final boolean useArith, final byte nameSeparator) {
// strictly speaking, keeping this list isn't necessary, but since the first thing that we need to write
// to the output stream is the number of names, we have to scan the entire input anyway to count them,
// so just extract them while we're scanning
final List<String> namesToEncode = extractInputNames(inBuffer, CRAMEncodingStrategy.DEFAULT_READS_PER_SLICE);
final List<String> namesToEncode = extractInputNames(
inBuffer,
CRAMEncodingStrategy.DEFAULT_READS_PER_SLICE,
nameSeparator);
final int numNames = namesToEncode.size();
// compensate for the separator at the end of the last name that is not present in the local implementation
final int uncompressedDataSize = Integer.max(0, inBuffer.limit());

//TODO: guess max size -> str.length*2 + 10000 (from htscodecs javascript code)
// what if this is exceeded ?
final ByteBuffer outBuffer = CompressionUtils.allocateByteBuffer((inBuffer.limit()*2)+10000);
// pre-allocate the output buffer; we don't know how big it will be. instead of implementing a wrapper around
// the ByteBuffer to allow for dynamic resizing, over-allocate; if the writer ever exceeds this, writing will
// fail with an exception, but it would indicate a serious error somewhere in the writer
final int outputLen = (inBuffer.limit() * 2) + 10000; // include a constant in case input is empty
final ByteBuffer outBuffer = CompressionUtils.allocateByteBuffer(outputLen);
outBuffer.putInt(uncompressedDataSize);
outBuffer.putInt(numNames);
outBuffer.put((byte)(useArith == true ? 1 : 0));
Expand Down Expand Up @@ -91,7 +94,9 @@ public ByteBuffer compress(final ByteBuffer inBuffer, final boolean useArith) {
serializeTokenStreams(streamsForPosition, outBuffer, useArith);
}

outBuffer.flip(); // set the limit to current position, and reset position to '0'
// set the limit to current position (important because we initially dramatically over-allocated the buffer,
// so make sure the caller doesn't go past the actual limit), and reset position to '0'
outBuffer.flip();
return outBuffer;
}

Expand Down Expand Up @@ -180,12 +185,14 @@ private List<EncodeToken> tokeniseName(
}

// extract the individual names from the input buffer and return in a list
// TODO: this needs a parameter to specify the separator
private static List<String> extractInputNames(final ByteBuffer inBuffer, final int preAllocationSize) {
private static List<String> extractInputNames(
final ByteBuffer inBuffer,
final int preAllocationSize,
final byte nameSeparator) {
final List<String> names = new ArrayList(preAllocationSize);
for (int lastPosition = inBuffer.position(); inBuffer.hasRemaining();) {
final byte currentByte = inBuffer.get();
if (currentByte == NameTokenisationDecode.NAME_SEPARATOR) {
if (currentByte == nameSeparator) {
final int length = inBuffer.position() - lastPosition;
final byte[] bytes = new byte[length];
inBuffer.position(lastPosition);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ public class NameTokenizationInteropTest extends HtsjdkTest {

@DataProvider(name = "allNameTokInteropTests")
public Object[][] getAllNameTokenizationInteropTests() throws IOException {
// compressed path (htslib interop preCompressed file), raw (unCompressed) path, useArith (used for round tripping only)
// raw (unCompressed) path, useArith
final List<Object[]> testCases = new ArrayList<>();
for (final Path preCompressedInteropPath : getPreCompressedInteropNameTokTestPaths()) {
for (boolean useArith: new boolean[]{true, false}) {
Expand All @@ -51,16 +51,22 @@ public void testNameTokRoundTrip(
try (final InputStream unCompressedInteropStream = Files.newInputStream(unCompressedInteropPath)) {
// convert the uncompressed data from htslib to the unCompressed format used to pass data in/out of the htsjdk name tok codec
final ByteBuffer unCompressedInteropBytes = convertHTSLIBToHTSJDKStreamFormat(
ByteBuffer.wrap(IOUtils.toByteArray(unCompressedInteropStream))
ByteBuffer.wrap(IOUtils.toByteArray(unCompressedInteropStream)),
NameTokenisationDecode.NAME_SEPARATOR
);

// Use htsjdk to compress the uncompressed data with the provided useArith flag
final NameTokenisationEncode nameEncoder = new NameTokenisationEncode();
final ByteBuffer compressedHtsjdkBytes = nameEncoder.compress(unCompressedInteropBytes, useArith);
final ByteBuffer compressedHtsjdkBytes = nameEncoder.compress(
unCompressedInteropBytes,
useArith,
NameTokenisationDecode.NAME_SEPARATOR);

// Now use htsjdk to uncompress the data we just compressed
final NameTokenisationDecode nameDecoder = new NameTokenisationDecode();
final ByteBuffer unCompressedHtsjdkBytes = ByteBuffer.wrap(nameDecoder.uncompress(compressedHtsjdkBytes));
final ByteBuffer unCompressedHtsjdkBytes = ByteBuffer.wrap(nameDecoder.uncompress(
compressedHtsjdkBytes,
NameTokenisationDecode.NAME_SEPARATOR));

// compare to the original (ByteBuffers have to have identical positions in order to be equal (!),
// so rewind both buffers before comparing)
Expand Down Expand Up @@ -93,12 +99,15 @@ public void testNameTokUnCompress(
final ByteBuffer preCompressedInteropBytes = ByteBuffer.wrap(IOUtils.toByteArray(preCompressedInteropStream));
// convert the uncompressed data from htslib to the unCompressed format used to pass data in/out of the htsjdk name tok codec
final ByteBuffer uncompressedInteropBytes = convertHTSLIBToHTSJDKStreamFormat(
ByteBuffer.wrap(IOUtils.toByteArray(unCompressedInteropStream))
ByteBuffer.wrap(IOUtils.toByteArray(unCompressedInteropStream)),
NameTokenisationDecode.NAME_SEPARATOR
);

// Use htsjdk to uncompress the precompressed file from htscodecs repo
final NameTokenisationDecode nameTokenisationDecode = new NameTokenisationDecode();
final ByteBuffer uncompressedHtsjdkBytes = ByteBuffer.wrap(nameTokenisationDecode.uncompress(preCompressedInteropBytes));
final ByteBuffer uncompressedHtsjdkBytes = ByteBuffer.wrap(
nameTokenisationDecode.uncompress(preCompressedInteropBytes, NameTokenisationDecode.NAME_SEPARATOR)
);

// Compare the htsjdk uncompressed bytes with the original input file from htscodecs repo
Assert.assertEquals(uncompressedHtsjdkBytes, uncompressedInteropBytes);
Expand Down Expand Up @@ -135,11 +144,11 @@ private static String getUnCompressedFileNameFromCompressedFileName(final String
}

// translate an htslib interop stream into the stream format used by the htsjdk name tokenization codec
private ByteBuffer convertHTSLIBToHTSJDKStreamFormat(final ByteBuffer htslibBuffer) {
private ByteBuffer convertHTSLIBToHTSJDKStreamFormat(final ByteBuffer htslibBuffer, final byte newSeparator) {
final ByteBuffer translatedBuffer = ByteBuffer.allocate(htslibBuffer.limit());
for (int i = 0; i < htslibBuffer.limit(); i++) {
if (htslibBuffer.get(i) == HTSLIB_NAME_SEPARATOR) {
translatedBuffer.put(i, NameTokenisationDecode.NAME_SEPARATOR);
translatedBuffer.put(i, newSeparator);
} else {
translatedBuffer.put(i, htslibBuffer.get(i));
}
Expand Down
Loading

0 comments on commit 2b4ffd8

Please sign in to comment.