Skip to content

Commit

Permalink
[performance] Set an empty window for deflate blocks at gzip stream starts
Browse files Browse the repository at this point in the history

In particular, this optimization also works for non-BGZF files with very
small gzip stream sizes. In subsequent performance optimizations, the
chunk splitting should prioritize splitting after footers if possible.
  • Loading branch information
mxmlnkn committed May 20, 2024
1 parent bf58572 commit cb93d82
Show file tree
Hide file tree
Showing 3 changed files with 148 additions and 11 deletions.
2 changes: 1 addition & 1 deletion src/rapidgzip/chunkdecoding/Bzip2Chunk.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ class Bzip2Chunk
subchunks.back().decodedSize += lastSubchunk.decodedSize;
}

/* Ensure that all subchunks have empty windows to avoid them being filled as windows are not ncessary. */
/* Ensure that all subchunks have empty windows to avoid them being filled as windows are not necessary. */
for ( auto& subchunk : subchunks ) {
subchunk.window = std::make_shared<Window>();
}
Expand Down
64 changes: 56 additions & 8 deletions src/rapidgzip/chunkdecoding/GzipChunk.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,31 +57,79 @@ class GzipChunk final
}

static void
determineUsedWindowSymbols( std::vector<Subchunk>& subchunks,
BitReader& bitReader,
bool windowSparsity )
determineUsedWindowSymbolsForLastSubchunk( std::vector<Subchunk>& subchunks,
BitReader& bitReader )
{
if ( subchunks.empty() || ( subchunks.back().encodedSize == 0 ) ) {
return;
}

auto& subchunk = subchunks.back();

/* Only gather sparsity information when it is necessary (non-empty window) or may become necessary
* (no window yet). The window may already be initialized and empty for deflate bocks after gzip headers. */
if ( subchunk.window && subchunk.window->empty() ) {
return;
}

try {
/* Get the window as soon as possible to avoid costly long seeks back outside the BitReader buffer.
* Especially, don't do it during chunk splitting because it would be too late in general. */
const Finally seekBack{ [&bitReader, oldOffset = bitReader.tell()] () {
bitReader.seekTo( oldOffset ); }
};
bitReader.seek( subchunk.encodedOffset + subchunk.encodedSize );
if ( windowSparsity ) {
subchunk.usedWindowSymbols = deflate::getUsedWindowSymbols( bitReader );
}
subchunk.usedWindowSymbols = deflate::getUsedWindowSymbols( bitReader );
} catch ( const std::exception& ) {
/* Ignore errors such as EOF and even decompression errors because we are only collecting extra
* data and might already be at the end of the given chunk size, so shouldn't return errors for
* data thereafter. */
}

/* Check whether the no window is needed at all. This may happen when analyzing the very first deflate
* block and it is at the start of a gzip stream or if the subchunk starts with a non-compressed block. */
const auto& usedWindowSymbols = subchunk.usedWindowSymbols;
if ( std::all_of( usedWindowSymbols.begin(), usedWindowSymbols.end(), [] ( bool p ) { return !p; } ) ) {
subchunk.usedWindowSymbols = std::vector<bool>(); // Free memory!
subchunk.window = std::make_shared<typename ChunkData::Window>();
}

}

/**
 * Finalizes the window of the last subchunk: if the subchunk ends exactly on a gzip
 * stream boundary (footer), the next window can simply be empty; otherwise, delegate
 * to the sparsity analysis. Note that the very first subchunk at offset 0 cannot have
 * a corresponding footer!
 */
static void
finalizeWindowForLastSubchunk( ChunkData&             chunk,
                               std::vector<Subchunk>& subchunks,
                               BitReader&             bitReader )
{
    if ( subchunks.empty() ) {
        return;
    }

    const auto windowOffset = subchunks.back().decodedOffset + subchunks.back().decodedSize;

    /* Footers are sorted ascending by decoded offset. Iterating in reverse order lets us
     * stop as soon as we see an offset smaller than the wanted one, so effectively only the
     * footers newly added since the last subchunk are inspected. This keeps the overall
     * complexity at O(n) instead of O(n^2) in the number of footers, which is why std::find
     * is not used here. */
    bool endsAtStreamBoundary{ false };
    for ( auto footerIt = chunk.footers.rbegin(); footerIt != chunk.footers.rend(); ++footerIt ) {
        const auto footerOffset = footerIt->blockBoundary.decodedOffset;
        if ( footerOffset == windowOffset ) {
            endsAtStreamBoundary = true;
            break;
        }
        if ( footerOffset < windowOffset ) {
            break;
        }
    }

    if ( endsAtStreamBoundary ) {
        /* The next subchunk starts a fresh gzip stream, so no window contents are required. */
        subchunks.back().window = std::make_shared<typename ChunkData::Window>();
    } else if ( chunk.windowSparsity ) {
        determineUsedWindowSymbolsForLastSubchunk( subchunks, bitReader );
    }
}

static void
Expand All @@ -103,7 +151,7 @@ class GzipChunk final
subchunks.back().usedWindowSymbols.clear();
}

determineUsedWindowSymbols( subchunks, bitReader, chunk.windowSparsity );
finalizeWindowForLastSubchunk( chunk, subchunks, bitReader );

chunk.setSubchunks( std::move( subchunks ) );
chunk.finalize( nextBlockOffset );
Expand All @@ -128,7 +176,7 @@ class GzipChunk final
/* Do on-the-fly chunk splitting. */
if ( subchunks.back().decodedSize >= chunk.splitChunkSize ) {
subchunks.back().encodedSize = encodedOffset - subchunks.back().encodedOffset;
determineUsedWindowSymbols( subchunks, bitReader, chunk.windowSparsity );
finalizeWindowForLastSubchunk( chunk, subchunks, bitReader );
startNewSubchunk( subchunks, encodedOffset );
}
}
Expand Down
93 changes: 91 additions & 2 deletions src/tests/rapidgzip/testParallelGzipReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -890,11 +890,102 @@ testWindowPruningSimpleBase64Compression( const TemporaryDirectory& tmpFolder,
}


/**
 * Tests that all index windows are pruned (empty) for a file consisting of many
 * concatenated gzip streams of the given size.
 *
 * @param gzipStreamSize Uncompressed size of each gzip stream; a proxy for the number
 *                       of deflate blocks per stream.
 * @param expectedBlockCount Expected number of deflate blocks per stream. This is a
 *                           precondition check for the test itself, not the tested behavior.
 * @throws std::runtime_error When the compression routine violates the test precondition.
 */
void
testWindowPruningMultiGzipStreams( const size_t gzipStreamSize,
                                   const size_t expectedBlockCount )
{
    std::vector<uint8_t> uncompressedData( gzipStreamSize );
    fillWithRandomBase64( uncompressedData );
    const auto compressedData = compressWithZlib( uncompressedData );

    /* Count deflate blocks in a single stream to verify the test precondition. */
    size_t blockBoundaryCount{ 0 };
    {
        const auto collectAllBlockBoundaries =
            [&] ( const std::shared_ptr<ChunkData>& chunkData,
                  [[maybe_unused]] size_t const offsetInBlock,
                  [[maybe_unused]] size_t const dataToWriteSize )
            {
                std::cerr << "Footers:";
                for ( const auto& footer : chunkData->footers ) {
                    std::cerr << " " << footer.blockBoundary.encodedOffset;
                }
                std::cerr << "\n";

                std::cerr << "Boundaries:";
                for ( const auto& blockBoundary : chunkData->blockBoundaries ) {
                    std::cerr << " " << blockBoundary.encodedOffset;
                }
                std::cerr << "\n";
                /* The list of block boundaries does not include the very first block because it is required to
                 * be at offset 0 relative to the chunk offset.
                 * NOTE(review): assumes this callback fires once per chunk; multiple invocations per chunk
                 * would overcount the "+ 1" — confirm against the read() contract. */
                blockBoundaryCount += chunkData->blockBoundaries.size() + 1;
            };

        ParallelGzipReader singleStreamReader( std::make_unique<BufferedFileReader>( compressedData ) );
        singleStreamReader.read( collectAllBlockBoundaries );
    }

    /* Duplicate the single stream until roughly 1 MiB of compressed data is reached. */
    const auto streamCount = ceilDiv( 1_Mi, compressedData.size() );
    const auto fullCompressedData =
        duplicateContents( std::vector<uint8_t>( compressedData.begin(), compressedData.end() ), streamCount );

    std::cerr << "Testing window pruning for " << streamCount << " gzip streams with each " << blockBoundaryCount
              << " deflate blocks\n";

    if ( blockBoundaryCount != expectedBlockCount ) {
        throw std::runtime_error( "The compression routine does not fulfill the test precondition." );
    }

    /* Use some prime chunk number to avoid possible exact overlap with the gzip streams! */
    ParallelGzipReader reader( std::make_unique<BufferedFileReader>( fullCompressedData ), 0, 257_Ki );
    const auto index = reader.gzipIndex();

    /* Check that all windows are empty. */
    REQUIRE( index.checkpoints.size() > 2 );
    REQUIRE( static_cast<bool>( index.windows ) );
    if ( index.windows ) {
        REQUIRE_EQUAL( index.windows->size(), index.checkpoints.size() );
        for ( size_t i = 0; i < index.checkpoints.size(); ++i ) {
            const auto& checkpoint = index.checkpoints[i];
            const auto window = index.windows->get( checkpoint.compressedOffsetInBits );
            REQUIRE( !window || window->empty() );
            if ( window && !window->empty() ) {
                std::cerr << "[Error] Window " << i << " is sized " << window->decompressedSize() << " at offset: "
                          << formatBits( checkpoint.compressedOffsetInBits ) << " out of "
                          << index.checkpoints.size() << " checkpoints and in a compressed stream sized "
                          << formatBytes( fullCompressedData.size() ) << " when it is expected to be empty!\n";
            }
        }
    }
}


/** Entry point for all window-pruning tests: single-stream cases first, then multi-stream ones. */
void
testWindowPruning( const TemporaryDirectory& tmpFolder )
{
testWindowPruningSimpleBase64Compression( tmpFolder, "gzip" );
testWindowPruningSimpleBase64Compression( tmpFolder, "bgzip" );

/* BGZF window pruning only works because all chunks are ensured to start at the first deflate block
 * inside a gzip stream. For non-BGZF files with non-single-block gzip streams, more intricate pruning
 * has to be implemented.
 * For the following tests, build up a larger gzip file by concatenating gzip streams. The gzip stream
 * size is configurable and is a proxy for the number of deflate blocks in it. For gzip stream sizes
 * smaller than 8 KiB, it can be assumed for almost all encoders that it contains only a single block.
 * And conversely, for gzip stream sizes > 128 KiB, it can be assumed to produce more than one block.
 * The second argument, the number of expected blocks, is not something we actually want to test for;
 * it is a check for the precondition of the test. If for some reason the expected blocks differ,
 * then simply vary the stream size for the test or implement something more stable.
 * Note that this test does not get parallelized/chunked anyway for now because it only consists of
 * final deflate blocks! */
testWindowPruningMultiGzipStreams( /* gzip stream size */ 8_Ki, /* expected blocks */ 1 );
/**
 * @todo This only works when blocks are split with prioritizing end-of-stream boundaries instead of splitting
 * only exactly when the given chunk size is exceeded. However, splitting chunks smartly is not sufficient
 * because the chunk offsets for parallelization are fixed. We would have to add some kind of chunk merging.
 * This seems too complicated to implement in the near-term as it would also affect the chunk cache!
 */
//testWindowPruningMultiGzipStreams( /* gzip stream size */ 31_Ki, /* expected blocks */ 2 );
}


Expand All @@ -912,8 +1003,6 @@ printClassSizes()
std::cout << " ZlibInflateWrapper : " << sizeof( ZlibInflateWrapper ) << "\n"; // 131320
#ifdef WITH_ISAL
std::cout << " IsalInflateWrapper : " << sizeof( IsalInflateWrapper ) << "\n"; // 218592
#endif
#ifdef WITH_ISAL
std::cout << " HuffmanCodingISAL : " << sizeof( HuffmanCodingISAL ) << "\n"; // 18916
#endif
/* 18916 */
Expand Down

0 comments on commit cb93d82

Please sign in to comment.