diff --git a/snappy-internal.h b/snappy-internal.h index 39fbda5..ae78247 100644 --- a/snappy-internal.h +++ b/snappy-internal.h @@ -334,6 +334,31 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1, } #endif +static inline size_t FindMatchLengthPlain(const char* s1, const char* s2, + const char* s2_limit) { + // Implementation based on the x86-64 version, above. + assert(s2_limit >= s2); + int matched = 0; + + while (s2 <= s2_limit - 8 && + UNALIGNED_LOAD64(s2) == UNALIGNED_LOAD64(s1 + matched)) { + s2 += 8; + matched += 8; + } + if (LittleEndian::IsLittleEndian() && s2 <= s2_limit - 8) { + uint64_t x = UNALIGNED_LOAD64(s2) ^ UNALIGNED_LOAD64(s1 + matched); + int matching_bits = Bits::FindLSBSetNonZero64(x); + matched += matching_bits >> 3; + s2 += matching_bits >> 3; + } else { + while ((s2 < s2_limit) && (s1[matched] == *s2)) { + ++s2; + ++matched; + } + } + return matched; +} + // Lookup tables for decompression code. Give --snappy_dump_decompression_table // to the unit test to recompute char_table. 
diff --git a/snappy.cc b/snappy.cc index 50fd3a1..c8ec170 100644 --- a/snappy.cc +++ b/snappy.cc @@ -175,6 +175,22 @@ inline uint16_t* TableEntry(uint16_t* table, uint32_t bytes, uint32_t mask) { (hash & mask)); } +inline uint16_t* TableEntry4ByteMatch(uint16_t* table, uint32_t bytes, + uint32_t mask) { + constexpr uint32_t kMagic = 2654435761U; + const uint32_t hash = (kMagic * bytes) >> (32 - kMaxHashTableBits); + return reinterpret_cast<uint16_t*>(reinterpret_cast<uintptr_t>(table) + + (hash & mask)); +} + +inline uint16_t* TableEntry8ByteMatch(uint16_t* table, uint64_t bytes, + uint32_t mask) { + constexpr uint64_t kMagic = 58295818150454627ULL; + const uint32_t hash = (kMagic * bytes) >> (64 - kMaxHashTableBits); + return reinterpret_cast<uint16_t*>(reinterpret_cast<uintptr_t>(table) + + (hash & mask)); +} + } // namespace size_t MaxCompressedLength(size_t source_bytes) { @@ -931,6 +947,172 @@ char* CompressFragment(const char* input, size_t input_size, char* op, } } +emit_remainder: + // Emit the remaining bytes as a literal + if (ip < ip_end) { + op = EmitLiteral</*allow_fast_path=*/false>(op, ip, ip_end - ip); + } + + return op; +} + +char* CompressFragmentDoubleHash(const char* input, size_t input_size, char* op, + uint16_t* table, const int table_size, + uint16_t* table2, const int table_size2) { + // "ip" is the input pointer, and "op" is the output pointer. 
+ const char* ip = input; + assert(input_size <= kBlockSize); + assert((table_size & (table_size - 1)) == 0); // table must be power of two + const uint32_t mask = 2 * (table_size - 1); + const char* ip_end = input + input_size; + const char* base_ip = ip; + + const size_t kInputMarginBytes = 15; + if (SNAPPY_PREDICT_TRUE(input_size >= kInputMarginBytes)) { + const char* ip_limit = input + input_size - kInputMarginBytes; + + for (;;) { + const char* next_emit = ip++; + uint64_t data = LittleEndian::Load64(ip); + uint32_t skip = 512; + + const char* candidate; + uint32_t candidate_length; + while (true) { + assert(static_cast<uint32_t>(data) == LittleEndian::Load32(ip)); + uint16_t* table_entry2 = TableEntry8ByteMatch(table2, data, mask); + uint32_t bytes_between_hash_lookups = skip >> 9; + skip++; + const char* next_ip = ip + bytes_between_hash_lookups; + if (SNAPPY_PREDICT_FALSE(next_ip > ip_limit)) { + ip = next_emit; + goto emit_remainder; + } + candidate = base_ip + *table_entry2; + assert(candidate >= base_ip); + assert(candidate < ip); + + *table_entry2 = ip - base_ip; + if (SNAPPY_PREDICT_FALSE(static_cast<uint32_t>(data) == + LittleEndian::Load32(candidate))) { + candidate_length = + FindMatchLengthPlain(candidate + 4, ip + 4, ip_end) + 4; + break; + } + + uint16_t* table_entry = TableEntry4ByteMatch(table, data, mask); + candidate = base_ip + *table_entry; + assert(candidate >= base_ip); + assert(candidate < ip); + + *table_entry = ip - base_ip; + if (SNAPPY_PREDICT_FALSE(static_cast<uint32_t>(data) == + LittleEndian::Load32(candidate))) { + candidate_length = + FindMatchLengthPlain(candidate + 4, ip + 4, ip_end) + 4; + table_entry2 = + TableEntry8ByteMatch(table2, LittleEndian::Load64(ip + 1), mask); + auto candidate2 = base_ip + *table_entry2; + size_t candidate_length2 = + FindMatchLengthPlain(candidate2, ip + 1, ip_end); + if (candidate_length2 > candidate_length) { + *table_entry2 = ip - base_ip; + candidate = candidate2; + candidate_length = candidate_length2; + ++ip; + } + 
+ break; + } + data = LittleEndian::Load64(next_ip); + ip = next_ip; + } + // Backtrack to the point it matches fully. + while (ip > next_emit && candidate > base_ip && + *(ip - 1) == *(candidate - 1)) { + --ip; + --candidate; + ++candidate_length; + } + *TableEntry8ByteMatch(table2, LittleEndian::Load64(ip + 1), mask) = + ip - base_ip + 1; + *TableEntry8ByteMatch(table2, LittleEndian::Load64(ip + 2), mask) = + ip - base_ip + 2; + *TableEntry4ByteMatch(table, LittleEndian::Load32(ip + 1), mask) = + ip - base_ip + 1; + // Step 2: A 4-byte or 8-byte match has been found. + // We'll later see if more than 4 bytes match. But, prior to the match, + // input bytes [next_emit, ip) are unmatched. Emit them as + // "literal bytes." + assert(next_emit + 16 <= ip_end); + if (ip - next_emit > 0) { + op = EmitLiteral</*allow_fast_path=*/true>(op, next_emit, + ip - next_emit); + } + // Step 3: Call EmitCopy, and then see if another EmitCopy could + // be our next move. Repeat until we find no match for the + // input immediately after what was consumed by the last EmitCopy call. + // + // If we exit this loop normally then we need to call EmitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can exit + // this loop via goto if we get close to exhausting the input. + do { + // We have a 4-byte match at ip, and no need to emit any + // "literal bytes" prior to ip. + const char* base = ip; + ip += candidate_length; + size_t offset = base - candidate; + if (candidate_length < 12) { + op = + EmitCopy</*len_less_than_12=*/true>(op, offset, candidate_length); + } else { + op = EmitCopy</*len_less_than_12=*/false>(op, offset, + candidate_length); + } + if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) { + goto emit_remainder; + } + // We are now looking for a 4-byte match again. We read + // table[Hash(ip, mask)] for that. To improve compression, + // we also update several previous table entries. 
+ if (ip - base_ip > 7) { + *TableEntry8ByteMatch(table2, LittleEndian::Load64(ip - 7), mask) = + ip - base_ip - 7; + *TableEntry8ByteMatch(table2, LittleEndian::Load64(ip - 4), mask) = + ip - base_ip - 4; + } + *TableEntry8ByteMatch(table2, LittleEndian::Load64(ip - 3), mask) = + ip - base_ip - 3; + *TableEntry8ByteMatch(table2, LittleEndian::Load64(ip - 2), mask) = + ip - base_ip - 2; + *TableEntry4ByteMatch(table, LittleEndian::Load32(ip - 2), mask) = + ip - base_ip - 2; + *TableEntry4ByteMatch(table, LittleEndian::Load32(ip - 1), mask) = + ip - base_ip - 1; + + uint16_t* table_entry = + TableEntry8ByteMatch(table2, LittleEndian::Load64(ip), mask); + candidate = base_ip + *table_entry; + *table_entry = ip - base_ip; + if (LittleEndian::Load32(ip) == LittleEndian::Load32(candidate)) { + candidate_length = + FindMatchLengthPlain(candidate + 4, ip + 4, ip_end) + 4; + continue; + } + table_entry = + TableEntry4ByteMatch(table, LittleEndian::Load32(ip), mask); + candidate = base_ip + *table_entry; + *table_entry = ip - base_ip; + if (LittleEndian::Load32(ip) == LittleEndian::Load32(candidate)) { + candidate_length = + FindMatchLengthPlain(candidate + 4, ip + 4, ip_end) + 4; + continue; + } + break; + } while (true); + } + } + emit_remainder: // Emit the remaining bytes as a literal if (ip < ip_end) { op = EmitLiteral</*allow_fast_path=*/false>(op, ip, ip_end - ip); } @@ -1608,7 +1790,8 @@ bool GetUncompressedLength(Source* source, uint32_t* result) { return decompressor.ReadUncompressedLength(result); } -size_t Compress(Source* reader, Sink* writer) { +size_t Compress(Source* reader, Sink* writer, CompressionOptions options) { + CHECK(options.level == 1 || options.level == 2); int token = 0; size_t written = 0; size_t N = reader->Available(); @@ -1664,8 +1847,15 @@ size_t Compress(Source* reader, Sink* writer) { // Need a scratch buffer for the output, in case the byte sink doesn't // have room for us directly. 
char* dest = writer->GetAppendBuffer(max_output, wmem.GetScratchOutput()); - char* end = internal::CompressFragment(fragment, fragment_size, dest, table, - table_size); + char* end = nullptr; + if (options.level == 1) { + end = internal::CompressFragment(fragment, fragment_size, dest, table, + table_size); + } else if (options.level == 2) { + end = internal::CompressFragmentDoubleHash( + fragment, fragment_size, dest, table, table_size >> 1, + table + (table_size >> 1), table_size >> 1); + } writer->Append(dest, end - dest); written += (end - dest); @@ -2107,39 +2297,40 @@ bool IsValidCompressed(Source* compressed) { } void RawCompress(const char* input, size_t input_length, char* compressed, - size_t* compressed_length) { + size_t* compressed_length, CompressionOptions options) { ByteArraySource reader(input, input_length); UncheckedByteArraySink writer(compressed); - Compress(&reader, &writer); + Compress(&reader, &writer, options); // Compute how many bytes were added *compressed_length = (writer.CurrentDestination() - compressed); } void RawCompressFromIOVec(const struct iovec* iov, size_t uncompressed_length, - char* compressed, size_t* compressed_length) { + char* compressed, size_t* compressed_length, + CompressionOptions options) { SnappyIOVecReader reader(iov, uncompressed_length); UncheckedByteArraySink writer(compressed); - Compress(&reader, &writer); + Compress(&reader, &writer, options); // Compute how many bytes were added. 
*compressed_length = writer.CurrentDestination() - compressed; } -size_t Compress(const char* input, size_t input_length, - std::string* compressed) { +size_t Compress(const char* input, size_t input_length, std::string* compressed, + CompressionOptions options) { // Pre-grow the buffer to the max length of the compressed output STLStringResizeUninitialized(compressed, MaxCompressedLength(input_length)); size_t compressed_length; RawCompress(input, input_length, string_as_array(compressed), - &compressed_length); + &compressed_length, options); compressed->erase(compressed_length); return compressed_length; } size_t CompressFromIOVec(const struct iovec* iov, size_t iov_cnt, - std::string* compressed) { + std::string* compressed, CompressionOptions options) { // Compute the number of bytes to be compressed. size_t uncompressed_length = 0; for (size_t i = 0; i < iov_cnt; ++i) { @@ -2152,7 +2343,7 @@ size_t CompressFromIOVec(const struct iovec* iov, size_t iov_cnt, size_t compressed_length; RawCompressFromIOVec(iov, uncompressed_length, string_as_array(compressed), - &compressed_length); + &compressed_length, options); compressed->erase(compressed_length); return compressed_length; } diff --git a/snappy.h b/snappy.h index e12b658..04b7a96 100644 --- a/snappy.h +++ b/snappy.h @@ -50,13 +50,33 @@ namespace snappy { class Source; class Sink; + struct CompressionOptions { + // Compression level. + // Level 1 is the fastest + // Level 2 is a little slower but provides better compression. Level 2 is + // **EXPERIMENTAL** for the time being. It might happen that we decide to + // fall back to level 1 in the future. + // Levels 3+ are currently not supported. We plan to support levels up to + // 9 in the future. 
+ // If you played with other compression algorithms, level 1 is equivalent to + // fast mode (level 1) of LZ4, level 2 is equivalent to LZ4's level 2 mode + // and compresses somewhere around zstd:-3 and zstd:-2 but generally with + // faster decompression speeds than snappy:1 and zstd:-3. + int level = DefaultCompressionLevel(); + + static constexpr int MinCompressionLevel() { return 1; } + static constexpr int MaxCompressionLevel() { return 2; } + static constexpr int DefaultCompressionLevel() { return 1; } + }; + // ------------------------------------------------------------------------ // Generic compression/decompression routines. // ------------------------------------------------------------------------ - // Compress the bytes read from "*source" and append to "*sink". Return the + // Compress the bytes read from "*reader" and append to "*writer". Return the // number of bytes written. - size_t Compress(Source* source, Sink* sink); + size_t Compress(Source* reader, Sink* writer, + CompressionOptions options = {}); // Find the uncompressed length of the given stream, as given by the header. // Note that the true length could deviate from this; the stream could e.g. @@ -76,14 +96,15 @@ namespace snappy { // // REQUIRES: "input[]" is not an alias of "*compressed". size_t Compress(const char* input, size_t input_length, - std::string* compressed); + std::string* compressed, CompressionOptions options = {}); // Same as `Compress` above but taking an `iovec` array as input. Note that // this function preprocesses the inputs to compute the sum of // `iov[0..iov_cnt-1].iov_len` before reading. To avoid this, use // `RawCompressFromIOVec` below. size_t CompressFromIOVec(const struct iovec* iov, size_t iov_cnt, - std::string* compressed); + std::string* compressed, + CompressionOptions options = {}); // Decompresses "compressed[0..compressed_length-1]" to "*uncompressed". // Original contents of "*uncompressed" are lost. 
@@ -126,16 +147,15 @@ namespace snappy { // RawCompress(input, input_length, output, &output_length); // ... Process(output, output_length) ... // delete [] output; - void RawCompress(const char* input, - size_t input_length, - char* compressed, - size_t* compressed_length); + void RawCompress(const char* input, size_t input_length, char* compressed, + size_t* compressed_length, CompressionOptions options = {}); // Same as `RawCompress` above but taking an `iovec` array as input. Note that // `uncompressed_length` is the total number of bytes to be read from the // elements of `iov` (_not_ the number of elements in `iov`). void RawCompressFromIOVec(const struct iovec* iov, size_t uncompressed_length, - char* compressed, size_t* compressed_length); + char* compressed, size_t* compressed_length, + CompressionOptions options = {}); // Given data in "compressed[0..compressed_length-1]" generated by // calling the Snappy::Compress routine, this routine @@ -215,7 +235,7 @@ namespace snappy { static constexpr int kMinHashTableBits = 8; static constexpr size_t kMinHashTableSize = 1 << kMinHashTableBits; - static constexpr int kMaxHashTableBits = 14; + static constexpr int kMaxHashTableBits = 15; static constexpr size_t kMaxHashTableSize = 1 << kMaxHashTableBits; } // end namespace snappy diff --git a/snappy_benchmark.cc b/snappy_benchmark.cc index 28570dd..90243a8 100644 --- a/snappy_benchmark.cc +++ b/snappy_benchmark.cc @@ -44,6 +44,15 @@ namespace snappy { namespace { +void FilesAndLevels(::testing::Benchmark* benchmark) { + for (int i = 0; i < ARRAYSIZE(kTestDataFiles); ++i) { + for (int level = snappy::CompressionOptions::MinCompressionLevel(); + level <= snappy::CompressionOptions::MaxCompressionLevel(); ++level) { + benchmark->ArgPair(i, level); + } + } +} + void BM_UFlat(benchmark::State& state) { // Pick file to process based on state.range(0). 
int file_index = state.range(0); @@ -55,7 +64,8 @@ void BM_UFlat(benchmark::State& state) { kTestDataFiles[file_index].size_limit); std::string zcontents; - snappy::Compress(contents.data(), contents.size(), &zcontents); + snappy::Compress(contents.data(), contents.size(), &zcontents, + snappy::CompressionOptions{.level = state.range(1)}); char* dst = new char[contents.size()]; for (auto s : state) { @@ -68,7 +78,7 @@ void BM_UFlat(benchmark::State& state) { delete[] dst; } -BENCHMARK(BM_UFlat)->DenseRange(0, ARRAYSIZE(kTestDataFiles) - 1); +BENCHMARK(BM_UFlat)->Apply(FilesAndLevels); struct SourceFiles { SourceFiles() { @@ -119,7 +129,8 @@ void BM_UValidate(benchmark::State& state) { kTestDataFiles[file_index].size_limit); std::string zcontents; - snappy::Compress(contents.data(), contents.size(), &zcontents); + snappy::Compress(contents.data(), contents.size(), &zcontents, + snappy::CompressionOptions{.level = state.range(1)}); for (auto s : state) { CHECK(snappy::IsValidCompressedBuffer(zcontents.data(), zcontents.size())); @@ -128,7 +139,7 @@ static_cast<int64_t>(contents.size())); state.SetLabel(kTestDataFiles[file_index].label); } -BENCHMARK(BM_UValidate)->DenseRange(0, ARRAYSIZE(kTestDataFiles) - 1); +BENCHMARK(BM_UValidate)->Apply(FilesAndLevels); void BM_UValidateMedley(benchmark::State& state) { static const SourceFiles* const source = new SourceFiles(); @@ -152,6 +163,7 @@ BENCHMARK(BM_UValidateMedley); void BM_UIOVecSource(benchmark::State& state) { // Pick file to process based on state.range(0). 
int file_index = state.range(0); + int level = state.range(1); CHECK_GE(file_index, 0); CHECK_LT(file_index, ARRAYSIZE(kTestDataFiles)); @@ -180,7 +192,8 @@ void BM_UIOVecSource(benchmark::State& state) { char* dst = new char[snappy::MaxCompressedLength(contents.size())]; size_t zsize = 0; for (auto s : state) { - snappy::RawCompressFromIOVec(iov, contents.size(), dst, &zsize); + snappy::RawCompressFromIOVec(iov, contents.size(), dst, &zsize, + snappy::CompressionOptions{.level = level}); benchmark::DoNotOptimize(iov); } state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * @@ -195,7 +208,7 @@ delete[] dst; } -BENCHMARK(BM_UIOVecSource)->DenseRange(0, ARRAYSIZE(kTestDataFiles) - 1); +BENCHMARK(BM_UIOVecSource)->Apply(FilesAndLevels); void BM_UIOVecSink(benchmark::State& state) { // Pick file to process based on state.range(0). @@ -254,7 +267,8 @@ void BM_UFlatSink(benchmark::State& state) { kTestDataFiles[file_index].size_limit); std::string zcontents; - snappy::Compress(contents.data(), contents.size(), &zcontents); + snappy::Compress(contents.data(), contents.size(), &zcontents, + snappy::CompressionOptions{.level = state.range(1)}); char* dst = new char[contents.size()]; for (auto s : state) { @@ -273,11 +287,12 @@ void BM_UFlatSink(benchmark::State& state) { delete[] dst; } -BENCHMARK(BM_UFlatSink)->DenseRange(0, ARRAYSIZE(kTestDataFiles) - 1); +BENCHMARK(BM_UFlatSink)->Apply(FilesAndLevels); void BM_ZFlat(benchmark::State& state) { // Pick file to process based on state.range(0). 
int file_index = state.range(0); + int level = state.range(1); CHECK_GE(file_index, 0); CHECK_LT(file_index, ARRAYSIZE(kTestDataFiles)); @@ -288,7 +303,8 @@ size_t zsize = 0; for (auto s : state) { - snappy::RawCompress(contents.data(), contents.size(), dst, &zsize); + snappy::RawCompress(contents.data(), contents.size(), dst, &zsize, + snappy::CompressionOptions{.level = level}); benchmark::DoNotOptimize(dst); } state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * @@ -302,10 +318,12 @@ zsize); delete[] dst; } -BENCHMARK(BM_ZFlat)->DenseRange(0, ARRAYSIZE(kTestDataFiles) - 1); + +BENCHMARK(BM_ZFlat)->Apply(FilesAndLevels); void BM_ZFlatAll(benchmark::State& state) { const int num_files = ARRAYSIZE(kTestDataFiles); + int level = state.range(0); std::vector<std::string> contents(num_files); std::vector<char*> dst(num_files); @@ -322,7 +340,7 @@ for (auto s : state) { for (int i = 0; i < num_files; ++i) { snappy::RawCompress(contents[i].data(), contents[i].size(), dst[i], - &zsize); + &zsize, snappy::CompressionOptions{.level = level}); benchmark::DoNotOptimize(dst); } } @@ -335,10 +353,11 @@ } state.SetLabel(StrFormat("%d kTestDataFiles", num_files)); } -BENCHMARK(BM_ZFlatAll); +BENCHMARK(BM_ZFlatAll)->DenseRange(1, 2); void BM_ZFlatIncreasingTableSize(benchmark::State& state) { CHECK_GT(ARRAYSIZE(kTestDataFiles), 0); + int level = state.range(0); const std::string base_content = ReadTestDataFile( kTestDataFiles[0].filename, kTestDataFiles[0].size_limit); @@ -358,7 +377,7 @@ for (auto s : state) { for (size_t i = 0; i < contents.size(); ++i) { snappy::RawCompress(contents[i].data(), contents[i].size(), dst[i], - &zsize); + &zsize, snappy::CompressionOptions{.level = level}); benchmark::DoNotOptimize(dst); } } @@ -371,7 +390,7 @@ void 
BM_ZFlatIncreasingTableSize(benchmark::State& state) { } state.SetLabel(StrFormat("%d tables", contents.size())); } -BENCHMARK(BM_ZFlatIncreasingTableSize); +BENCHMARK(BM_ZFlatIncreasingTableSize)->DenseRange(1, 2); } // namespace diff --git a/snappy_compress_fuzzer.cc b/snappy_compress_fuzzer.cc index 1d4429a..144f438 100644 --- a/snappy_compress_fuzzer.cc +++ b/snappy_compress_fuzzer.cc @@ -39,22 +39,26 @@ // Entry point for LibFuzzer. extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { std::string input(reinterpret_cast<const char*>(data), size); + for (int level = snappy::CompressionOptions::MinCompressionLevel(); + level <= snappy::CompressionOptions::MaxCompressionLevel(); ++level) { + std::string compressed; + size_t compressed_size = + snappy::Compress(input.data(), input.size(), &compressed, + snappy::CompressionOptions{.level = level}); - std::string compressed; - size_t compressed_size = - snappy::Compress(input.data(), input.size(), &compressed); + (void)compressed_size; // Variable only used in debug builds. + assert(compressed_size == compressed.size()); + assert(compressed.size() <= snappy::MaxCompressedLength(input.size())); + assert( + snappy::IsValidCompressedBuffer(compressed.data(), compressed.size())); - (void)compressed_size; // Variable only used in debug builds. - assert(compressed_size == compressed.size()); - assert(compressed.size() <= snappy::MaxCompressedLength(input.size())); - assert(snappy::IsValidCompressedBuffer(compressed.data(), compressed.size())); + std::string uncompressed_after_compress; + bool uncompress_succeeded = snappy::Uncompress( + compressed.data(), compressed.size(), &uncompressed_after_compress); - std::string uncompressed_after_compress; - bool uncompress_succeeded = snappy::Uncompress( - compressed.data(), compressed.size(), &uncompressed_after_compress); - - (void)uncompress_succeeded; // Variable only used in debug builds. 
- assert(uncompress_succeeded); - assert(input == uncompressed_after_compress); + (void)uncompress_succeeded; // Variable only used in debug builds. + assert(uncompress_succeeded); + assert(input == uncompressed_after_compress); + } return 0; }