From 6ce1fee71b6142850a71d7d56722f618a467c810 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Mon, 4 Jan 2021 13:43:47 -0800 Subject: [PATCH] Don't shrink window log when streaming with a dictionary Fixes #2442. 1. When creating a dictionary keep the same behavior as before. Assume the source size is 513 bytes when adjusting parameters. 2. When calling ZSTD_getCParams() or ZSTD_adjustCParams() keep the same behavior as before. 3. When attaching a dictionary keep the same behavior of ignoring the dictionary size. When streaming this will select the largest parameters and not adjust them down. But, the CDict will use the correctly sized parameters, which seems like the right tradeoff. 4. When not attaching a dictionary (either forced not to, or using a prefix dictionary) we select parameters based on the dictionary size + source size, and assume the source size is small, which is the same behavior as before. But, now we don't adjust the window log (and hash and chain log) down when the source size is unknown. When the source size is unknown all cdicts should attach, except when the user disables attaching, or `forceWindow` is used. This means that when streaming with a CDict we end up in the good case where we get small CDict parameters, and large source parameters. TODO: Add a streaming + dictionary regression test case. --- lib/compress/zstd_compress.c | 28 ++++++++++++++++++++++------ tests/regression/results.csv | 20 ++++++++++---------- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 2e0edaf99cd..c565fdd351f 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -1188,15 +1188,30 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); assert(ZSTD_checkCParams(cPar)==0); - if (dictSize && srcSize == ZSTD_CONTENTSIZE_UNKNOWN) - srcSize = minSrcSize; - switch (mode) { case ZSTD_cpm_noAttachDict: + /* If we don't know the source size, don't make any + * assumptions about it. We will already have selected + * smaller parameters if a dictionary is in use. + */ + break; case ZSTD_cpm_unknown: + /* Keep the legacy behavior of assuming small source + * sizes when the cparam mode is unkown. + */ + /* fall-through */ case ZSTD_cpm_createCDict: + /* Assume a small source size when creating a dictionary + * with an unkown source size. + */ + if (dictSize && srcSize == ZSTD_CONTENTSIZE_UNKNOWN) + srcSize = minSrcSize; break; case ZSTD_cpm_attachDict: + /* Dictionary has its own dedicated parameters which have + * already been selected. We are selecting parameters + * for only the source. + */ dictSize = 0; break; default: @@ -1213,7 +1228,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, ZSTD_highbit32(tSize-1) + 1; if (cPar.windowLog > srcLog) cPar.windowLog = srcLog; } - { U32 const dictAndWindowLog = ZSTD_dictAndWindowLog(cPar.windowLog, (U64)srcSize, (U64)dictSize); + if (srcSize != ZSTD_CONTENTSIZE_UNKNOWN) { + U32 const dictAndWindowLog = ZSTD_dictAndWindowLog(cPar.windowLog, (U64)srcSize, (U64)dictSize); U32 const cycleLog = ZSTD_cycleLog(cPar.chainLog, cPar.strategy); if (cPar.hashLog > dictAndWindowLog+1) cPar.hashLog = dictAndWindowLog+1; if (cycleLog > dictAndWindowLog) @@ -2955,9 +2971,9 @@ static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity, } /* ZSTD_writeSkippableFrame_advanced() : - * Writes out a skippable frame with the specified magic number variant (16 are supported), + * Writes out a skippable frame with the specified magic number variant (16 are supported), * from ZSTD_MAGIC_SKIPPABLE_START to ZSTD_MAGIC_SKIPPABLE_START+15, and the desired source data. - * + * * Returns the total number of bytes written, or a ZSTD error code. */ size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity, diff --git a/tests/regression/results.csv b/tests/regression/results.csv index 587b6afbe45..0f818149b9f 100644 --- a/tests/regression/results.csv +++ b/tests/regression/results.csv @@ -216,7 +216,7 @@ github.tar, level 16 with dict, zstdcli, github.tar, level 19, zstdcli, 32841 github.tar, level 19 with dict, zstdcli, 32899 github.tar, no source size, zstdcli, 38442 -github.tar, no source size with dict, zstdcli, 213380 +github.tar, no source size with dict, zstdcli, 38004 github.tar, long distance mode, zstdcli, 39680 github.tar, multithreaded, zstdcli, 38445 github.tar, multithreaded long distance mode, zstdcli, 39680 @@ -618,7 +618,7 @@ github.tar, level 16 with dict, advanced github.tar, level 19, advanced streaming, 32837 github.tar, level 19 with dict, advanced streaming, 32895 github.tar, no source size, advanced streaming, 38438 -github.tar, no source size with dict, advanced streaming, 214381 +github.tar, no source size with dict, advanced streaming, 38000 github.tar, long distance mode, advanced streaming, 39676 github.tar, multithreaded, advanced streaming, 38441 github.tar, multithreaded long distance mode, advanced streaming, 39676 @@ -695,7 +695,7 @@ github, level 16 with dict, old stre github, level 19, old streaming, 134064 github, level 19 with dict, old streaming, 37576 github, no source size, old streaming, 140632 -github, no source size with dict, old streaming, 40678 +github, no source size with dict, old streaming, 40654 github, uncompressed literals, old streaming, 136335 github, uncompressed literals optimal, old streaming, 134064 github, huffman literals, old streaming, 175568 @@ -728,7 +728,7 @@ github.tar, level 16 with dict, old stre github.tar, level 19, old streaming, 32837 github.tar, level 19 with dict, old streaming, 32895 github.tar, no source size, old streaming, 38438 -github.tar, no source size with dict, old streaming, 214384 +github.tar, no source size with dict, old streaming, 38000 github.tar, uncompressed literals, old streaming, 38441 github.tar, uncompressed literals optimal, old streaming, 32837 github.tar, huffman literals, old streaming, 42465 @@ -813,7 +813,7 @@ github, level 16 with dict, old stre github, level 19, old streaming advanced, 134064 github, level 19 with dict, old streaming advanced, 37576 github, no source size, old streaming advanced, 140632 -github, no source size with dict, old streaming advanced, 40643 +github, no source size with dict, old streaming advanced, 40608 github, long distance mode, old streaming advanced, 141104 github, multithreaded, old streaming advanced, 141104 github, multithreaded long distance mode, old streaming advanced, 141104 @@ -854,7 +854,7 @@ github.tar, level 16 with dict, old stre github.tar, level 19, old streaming advanced, 32837 github.tar, level 19 with dict, old streaming advanced, 32876 github.tar, no source size, old streaming advanced, 38438 -github.tar, no source size with dict, old streaming advanced, 214384 +github.tar, no source size with dict, old streaming advanced, 38015 github.tar, long distance mode, old streaming advanced, 38441 github.tar, multithreaded, old streaming advanced, 38441 github.tar, multithreaded long distance mode, old streaming advanced, 38441 @@ -880,7 +880,7 @@ github, level 9 with dict, old stre github, level 13 with dict, old streaming cdcit, 39743 github, level 16 with dict, old streaming cdcit, 37577 github, level 19 with dict, old streaming cdcit, 37576 -github, no source size with dict, old streaming cdcit, 40678 +github, no source size with dict, old streaming cdcit, 40654 github.tar, level -5 with dict, old streaming cdcit, 45018 github.tar, level -3 with dict, old streaming cdcit, 41886 github.tar, level -1 with dict, old streaming cdcit, 41636 @@ -895,7 +895,7 @@ github.tar, level 9 with dict, old stre github.tar, level 13 with dict, old streaming cdcit, 36372 github.tar, level 16 with dict, old streaming cdcit, 39353 github.tar, level 19 with dict, old streaming cdcit, 32676 -github.tar, no source size with dict, old streaming cdcit, 214384 +github.tar, no source size with dict, old streaming cdcit, 38000 github, level -5 with dict, old streaming advanced cdict, 49562 github, level -3 with dict, old streaming advanced cdict, 44956 github, level -1 with dict, old streaming advanced cdict, 42383 @@ -910,7 +910,7 @@ github, level 9 with dict, old stre github, level 13 with dict, old streaming advanced cdict, 39731 github, level 16 with dict, old streaming advanced cdict, 40789 github, level 19 with dict, old streaming advanced cdict, 37576 -github, no source size with dict, old streaming advanced cdict, 40643 +github, no source size with dict, old streaming advanced cdict, 40608 github.tar, level -5 with dict, old streaming advanced cdict, 44307 github.tar, level -3 with dict, old streaming advanced cdict, 41359 github.tar, level -1 with dict, old streaming advanced cdict, 41322 @@ -925,4 +925,4 @@ github.tar, level 9 with dict, old stre github.tar, level 13 with dict, old streaming advanced cdict, 36035 github.tar, level 16 with dict, old streaming advanced cdict, 38736 github.tar, level 19 with dict, old streaming advanced cdict, 32876 -github.tar, no source size with dict, old streaming advanced cdict, 214384 +github.tar, no source size with dict, old streaming advanced cdict, 38015