Skip to content

Commit

Permalink
[performance] Set an empty window for deflate blocks at gzip stream starts
Browse files Browse the repository at this point in the history

In particular, this optimization also works for non-BGZF files with very
small gzip stream sizes. In subsequent performance optimizations, the
chunk splitting should prioritize splitting after footers if possible.
  • Loading branch information
mxmlnkn committed May 20, 2024
1 parent bf58572 commit cb93d82
Show file tree
Hide file tree
Showing 3 changed files with 148 additions and 11 deletions.
2 changes: 1 addition & 1 deletion src/rapidgzip/chunkdecoding/Bzip2Chunk.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ class Bzip2Chunk
subchunks.back().decodedSize += lastSubchunk.decodedSize;
}

/* Ensure that all subchunks have empty windows to avoid them being filled as windows are not ncessary. */
/* Ensure that all subchunks have empty windows to avoid them being filled as windows are not necessary. */
for ( auto& subchunk : subchunks ) {
subchunk.window = std::make_shared<Window>();
}
Expand Down
64 changes: 56 additions & 8 deletions src/rapidgzip/chunkdecoding/GzipChunk.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,31 +57,79 @@ class GzipChunk final
}

static void
determineUsedWindowSymbols( std::vector<Subchunk>& subchunks,
BitReader& bitReader,
bool windowSparsity )
determineUsedWindowSymbolsForLastSubchunk( std::vector<Subchunk>& subchunks,
BitReader& bitReader )
{
if ( subchunks.empty() || ( subchunks.back().encodedSize == 0 ) ) {
return;
}

auto& subchunk = subchunks.back();

/* Only gather sparsity information when it is necessary (non-empty window) or may become necessary
* (no window yet). The window may already be initialized and empty for deflate bocks after gzip headers. */
if ( subchunk.window && subchunk.window->empty() ) {
return;
}

try {
/* Get the window as soon as possible to avoid costly long seeks back outside the BitReader buffer.
* Especially, don't do it during chunk splitting because it would be too late in general. */
const Finally seekBack{ [&bitReader, oldOffset = bitReader.tell()] () {
bitReader.seekTo( oldOffset ); }
};
bitReader.seek( subchunk.encodedOffset + subchunk.encodedSize );
if ( windowSparsity ) {
subchunk.usedWindowSymbols = deflate::getUsedWindowSymbols( bitReader );
}
subchunk.usedWindowSymbols = deflate::getUsedWindowSymbols( bitReader );
} catch ( const std::exception& ) {
/* Ignore errors such as EOF and even decompression errors because we are only collecting extra
* data and might already be at the end of the given chunk size, so shouldn't return errors for
* data thereafter. */
}

/* Check whether the no window is needed at all. This may happen when analyzing the very first deflate
* block and it is at the start of a gzip stream or if the subchunk starts with a non-compressed block. */
const auto& usedWindowSymbols = subchunk.usedWindowSymbols;
if ( std::all_of( usedWindowSymbols.begin(), usedWindowSymbols.end(), [] ( bool p ) { return !p; } ) ) {
subchunk.usedWindowSymbols = std::vector<bool>(); // Free memory!
subchunk.window = std::make_shared<typename ChunkData::Window>();
}

}

/**
 * Finalizes the window of the last subchunk: if the subchunk ends exactly on a gzip
 * stream boundary (footer), the next window can simply be empty; otherwise, delegate
 * to the sparsity analysis. Note that the very first subchunk at offset 0 cannot have
 * a corresponding footer!
 */
static void
finalizeWindowForLastSubchunk( ChunkData&             chunk,
                               std::vector<Subchunk>& subchunks,
                               BitReader&             bitReader )
{
    if ( subchunks.empty() ) {
        return;
    }

    const auto windowOffset = subchunks.back().decodedOffset + subchunks.back().decodedSize;

    /* Footers are sorted ascending by decoded offset. Iterating in reverse order lets us
     * stop as soon as we see an offset smaller than the wanted one, so effectively only the
     * footers newly added since the last subchunk are inspected. This keeps the overall
     * complexity at O(n) instead of O(n^2) in the number of footers, which is why std::find
     * is not used here. */
    bool endsAtStreamBoundary{ false };
    for ( auto footerIt = chunk.footers.rbegin(); footerIt != chunk.footers.rend(); ++footerIt ) {
        const auto footerOffset = footerIt->blockBoundary.decodedOffset;
        if ( footerOffset == windowOffset ) {
            endsAtStreamBoundary = true;
            break;
        }
        if ( footerOffset < windowOffset ) {
            break;
        }
    }

    if ( endsAtStreamBoundary ) {
        /* The next subchunk starts a fresh gzip stream, so no window contents are required. */
        subchunks.back().window = std::make_shared<typename ChunkData::Window>();
    } else if ( chunk.windowSparsity ) {
        determineUsedWindowSymbolsForLastSubchunk( subchunks, bitReader );
    }
}

static void
Expand All @@ -103,7 +151,7 @@ class GzipChunk final
subchunks.back().usedWindowSymbols.clear();
}

determineUsedWindowSymbols( subchunks, bitReader, chunk.windowSparsity );
finalizeWindowForLastSubchunk( chunk, subchunks, bitReader );

chunk.setSubchunks( std::move( subchunks ) );
chunk.finalize( nextBlockOffset );
Expand All @@ -128,7 +176,7 @@ class GzipChunk final
/* Do on-the-fly chunk splitting. */
if ( subchunks.back().decodedSize >= chunk.splitChunkSize ) {
subchunks.back().encodedSize = encodedOffset - subchunks.back().encodedOffset;
determineUsedWindowSymbols( subchunks, bitReader, chunk.windowSparsity );
finalizeWindowForLastSubchunk( chunk, subchunks, bitReader );
startNewSubchunk( subchunks, encodedOffset );
}
}
Expand Down
93 changes: 91 additions & 2 deletions src/tests/rapidgzip/testParallelGzipReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -890,11 +890,102 @@ testWindowPruningSimpleBase64Compression( const TemporaryDirectory& tmpFolder,
}


/**
 * Tests that all index windows are pruned (empty) for a file consisting of many
 * concatenated gzip streams of the given size.
 *
 * @param gzipStreamSize Uncompressed size of each gzip stream; a proxy for the number
 *                       of deflate blocks per stream.
 * @param expectedBlockCount Expected number of deflate blocks per stream. This is a
 *                           precondition check for the test itself, not the tested behavior.
 * @throws std::runtime_error When the compression routine violates the test precondition.
 */
void
testWindowPruningMultiGzipStreams( const size_t gzipStreamSize,
                                   const size_t expectedBlockCount )
{
    std::vector<uint8_t> uncompressedData( gzipStreamSize );
    fillWithRandomBase64( uncompressedData );
    const auto compressedData = compressWithZlib( uncompressedData );

    /* Count deflate blocks in a single stream to verify the test precondition. */
    size_t blockBoundaryCount{ 0 };
    {
        const auto collectAllBlockBoundaries =
            [&] ( const std::shared_ptr<ChunkData>& chunkData,
                  [[maybe_unused]] size_t const offsetInBlock,
                  [[maybe_unused]] size_t const dataToWriteSize )
            {
                std::cerr << "Footers:";
                for ( const auto& footer : chunkData->footers ) {
                    std::cerr << " " << footer.blockBoundary.encodedOffset;
                }
                std::cerr << "\n";

                std::cerr << "Boundaries:";
                for ( const auto& blockBoundary : chunkData->blockBoundaries ) {
                    std::cerr << " " << blockBoundary.encodedOffset;
                }
                std::cerr << "\n";
                /* The list of block boundaries does not include the very first block because it is required to
                 * be at offset 0 relative to the chunk offset.
                 * NOTE(review): assumes this callback fires once per chunk; multiple invocations per chunk
                 * would overcount the "+ 1" — confirm against the read() contract. */
                blockBoundaryCount += chunkData->blockBoundaries.size() + 1;
            };

        ParallelGzipReader singleStreamReader( std::make_unique<BufferedFileReader>( compressedData ) );
        singleStreamReader.read( collectAllBlockBoundaries );
    }

    /* Duplicate the single stream until roughly 1 MiB of compressed data is reached. */
    const auto streamCount = ceilDiv( 1_Mi, compressedData.size() );
    const auto fullCompressedData =
        duplicateContents( std::vector<uint8_t>( compressedData.begin(), compressedData.end() ), streamCount );

    std::cerr << "Testing window pruning for " << streamCount << " gzip streams with each " << blockBoundaryCount
              << " deflate blocks\n";

    if ( blockBoundaryCount != expectedBlockCount ) {
        throw std::runtime_error( "The compression routine does not fulfill the test precondition." );
    }

    /* Use some prime chunk number to avoid possible exact overlap with the gzip streams! */
    ParallelGzipReader reader( std::make_unique<BufferedFileReader>( fullCompressedData ), 0, 257_Ki );
    const auto index = reader.gzipIndex();

    /* Check that all windows are empty. */
    REQUIRE( index.checkpoints.size() > 2 );
    REQUIRE( static_cast<bool>( index.windows ) );
    if ( index.windows ) {
        REQUIRE_EQUAL( index.windows->size(), index.checkpoints.size() );
        for ( size_t i = 0; i < index.checkpoints.size(); ++i ) {
            const auto& checkpoint = index.checkpoints[i];
            const auto window = index.windows->get( checkpoint.compressedOffsetInBits );
            REQUIRE( !window || window->empty() );
            if ( window && !window->empty() ) {
                std::cerr << "[Error] Window " << i << " is sized " << window->decompressedSize() << " at offset: "
                          << formatBits( checkpoint.compressedOffsetInBits ) << " out of "
                          << index.checkpoints.size() << " checkpoints and in a compressed stream sized "
                          << formatBytes( fullCompressedData.size() ) << " when it is expected to be empty!\n";
            }
        }
    }
}


/** Entry point for all window-pruning tests: single-stream cases first, then multi-stream ones. */
void
testWindowPruning( const TemporaryDirectory& tmpFolder )
{
testWindowPruningSimpleBase64Compression( tmpFolder, "gzip" );
testWindowPruningSimpleBase64Compression( tmpFolder, "bgzip" );

/* BGZF window pruning only works because all chunks are ensured to start at the first deflate block
 * inside a gzip stream. For non-BGZF files with non-single-block gzip streams, more intricate pruning
 * has to be implemented.
 * For the following tests, build up a larger gzip file by concatenating gzip streams. The gzip stream
 * size is configurable and is a proxy for the number of deflate blocks in it. For gzip stream sizes
 * smaller than 8 KiB, it can be assumed for almost all encoders that it contains only a single block.
 * And conversely, for gzip stream sizes > 128 KiB, it can be assumed to produce more than one block.
 * The second argument, the number of expected blocks, is not something we actually want to test for;
 * it is a check for the precondition of the test. If for some reason the expected blocks differ,
 * then simply vary the stream size for the test or implement something more stable.
 * Note that this test does not get parallelized/chunked anyway for now because it only consists of
 * final deflate blocks! */
testWindowPruningMultiGzipStreams( /* gzip stream size */ 8_Ki, /* expected blocks */ 1 );
/**
 * @todo This only works when blocks are split with prioritizing end-of-stream boundaries instead of splitting
 * only exactly when the given chunk size is exceeded. However, splitting chunks smartly is not sufficient
 * because the chunk offsets for parallelization are fixed. We would have to add some kind of chunk merging.
 * This seems too complicated to implement in the near-term as it would also affect the chunk cache!
 */
//testWindowPruningMultiGzipStreams( /* gzip stream size */ 31_Ki, /* expected blocks */ 2 );
}


Expand All @@ -912,8 +1003,6 @@ printClassSizes()
std::cout << " ZlibInflateWrapper : " << sizeof( ZlibInflateWrapper ) << "\n"; // 131320
#ifdef WITH_ISAL
std::cout << " IsalInflateWrapper : " << sizeof( IsalInflateWrapper ) << "\n"; // 218592
#endif
#ifdef WITH_ISAL
std::cout << " HuffmanCodingISAL : " << sizeof( HuffmanCodingISAL ) << "\n"; // 18916
#endif
/* 18916 */
Expand Down

0 comments on commit cb93d82

Please sign in to comment.