From 1b6cf3696e719ad6494a1cfb3498b7cb9454350c Mon Sep 17 00:00:00 2001 From: Andy Balholm Date: Tue, 9 Jan 2024 06:29:08 -0800 Subject: [PATCH] matchfinder: remove MultiHash It was an interesting experiment, but it didn't do any better than M4. --- brotli_test.go | 24 ---- matchfinder/multihash.go | 246 --------------------------------------- 2 files changed, 270 deletions(-) delete mode 100644 matchfinder/multihash.go diff --git a/brotli_test.go b/brotli_test.go index 108ec07..6c4820a 100644 --- a/brotli_test.go +++ b/brotli_test.go @@ -693,27 +693,3 @@ func BenchmarkEncodeM4Chain64(b *testing.B) { func BenchmarkEncodeM4Chain128(b *testing.B) { benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 128, HashLen: 5, DistanceBitCost: 57}, 1<<16) } - -func TestEncodeMultiHash6(t *testing.T) { - test(t, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.MultiHash{MaxDistance: 1 << 18, DistanceBitCost: 57, HashLengths: []int{6}}, 1<<16) -} - -func TestEncodeMultiHash6_8(t *testing.T) { - test(t, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.MultiHash{MaxDistance: 1 << 18, DistanceBitCost: 57, HashLengths: []int{6, 8}}, 1<<16) -} - -func BenchmarkEncodeMultiHash6(b *testing.B) { - benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.MultiHash{MaxDistance: 1 << 20, DistanceBitCost: 57, HashLengths: []int{6}}, 1<<16) -} - -func BenchmarkEncodeMultiHash5_8(b *testing.B) { - benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.MultiHash{MaxDistance: 1 << 20, DistanceBitCost: 57, HashLengths: []int{5, 8}}, 1<<16) -} - -func BenchmarkEncodeMultiHash5_7_9(b *testing.B) { - benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.MultiHash{MaxDistance: 1 << 20, DistanceBitCost: 57, HashLengths: []int{5, 7, 9}}, 1<<16) -} - -func BenchmarkEncodeMultiHash5_6_7_9(b *testing.B) { - benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.MultiHash{MaxDistance: 1 << 20, DistanceBitCost: 57, HashLengths: []int{5, 6, 7, 9}}, 1<<16) -} diff --git a/matchfinder/multihash.go b/matchfinder/multihash.go deleted file mode 100644 index 369b99d..0000000 --- a/matchfinder/multihash.go +++ /dev/null @@ -1,246 +0,0 @@ -package matchfinder - -import ( - "encoding/binary" - "math/bits" - "sort" -) - -// MultiHash is an implementation of the MatchFinder -// interface that uses multiple hashes of different lengths. -type MultiHash struct { - // MaxDistance is the maximum distance (in bytes) to look back for - // a match. The default is 65535. - MaxDistance int - - // MinLength is the length of the shortest match to return. - // The default is 4. - MinLength int - - // HashLengths is a list of the hashes to use, with the number of - // bytes to use for each. For example, to to use 4-byte, 7-byte, and - // 10-byte hashes, set HashLengths to []int{4, 7, 10}. - // The minimum length is 4. - HashLengths []int - - // TableBits is the number of bits in the hash table indexes. - // The default is 17 (128K entries). - TableBits int - - // DistanceBitCost is used when comparing two matches to see - // which is better. The comparison is primarily based on the length - // of the matches, but it can also take the distance into account, - // in terms of the number of bits needed to represent the distance. - // One byte of length is given a score of 256, so 32 (256/8) would - // be a reasonable first guess for the value of one bit. - // (The default is 0, which bases the comparison solely on length.) - DistanceBitCost int - - tables [][]uint32 - - history []byte -} - -func (q *MultiHash) Reset() { - for _, t := range q.tables { - for i := range t { - t[i] = 0 - } - } - q.history = q.history[:0] -} - -func (q *MultiHash) score(m absoluteMatch) int { - return (m.End-m.Start)*256 + bits.LeadingZeros32(uint32(m.Start-m.Match))*q.DistanceBitCost -} - -func (q *MultiHash) FindMatches(dst []Match, src []byte) []Match { - if q.MaxDistance == 0 { - q.MaxDistance = 65535 - } - if q.MinLength == 0 { - q.MinLength = 4 - } - if q.TableBits == 0 { - q.TableBits = 17 - } - if len(q.tables) < len(q.HashLengths) { - q.tables = make([][]uint32, len(q.HashLengths)) - for i := range q.tables { - q.tables[i] = make([]uint32, 1< q.MaxDistance*2 { - // Trim down the history buffer. - delta := len(q.history) - q.MaxDistance - copy(q.history, q.history[delta:]) - q.history = q.history[:q.MaxDistance] - - for _, t := range q.tables { - for i, v := range t { - newV := int(v) - delta - if newV < 0 { - newV = 0 - } - t[i] = uint32(newV) - } - } - } - - // Append src to the history buffer. - e.NextEmit = len(q.history) - q.history = append(q.history, src...) - src = q.history - - // matches stores the matches that have been found but not emitted, - // in reverse order. (matches[0] is the most recent one.) - var matches [3]absoluteMatch - - candidates := make([]int, len(q.HashLengths)) - - for i := e.NextEmit; i < len(src)-maxHashLen; i++ { - if matches[0] != (absoluteMatch{}) && i >= matches[0].End { - // We have found some matches, and we're far enough along that we probably - // won't find overlapping matches, so we might as well emit them. - if matches[1] != (absoluteMatch{}) { - e.trim(matches[1], matches[0].Start, q.MinLength) - } - e.emit(matches[0]) - matches = [3]absoluteMatch{} - } - - // Calculate and store the hashes. - h := uint32(0x811c9dc5) // FNV-32 offset basis - nb := 0 - for j, hashLen := range q.HashLengths { - for nb < hashLen { - h ^= uint32(src[i+nb]) - h *= 0x01000193 // FNV-32 prime - nb++ - } - index := h >> (32 - q.TableBits) - candidates[j] = int(q.tables[j][index]) - q.tables[j][index] = uint32(i) - } - - // Look for a match. - var currentMatch absoluteMatch - - if i < matches[0].End { - // If we're looking for an overlapping match, we only need to check the - // hash that ends 2 bytes after the end of the previous match. - for j, candidate := range candidates { - if i+q.HashLengths[j] != matches[0].End+2 { - continue - } - if candidate == 0 || i-candidate > q.MaxDistance { - break - } - if binary.LittleEndian.Uint32(src[candidate:]) != binary.LittleEndian.Uint32(src[i:]) { - break - } - m := extendMatch2(src, i, candidate, e.NextEmit) - if m.End-m.Start >= q.HashLengths[j] { - currentMatch = m - } - } - } else { - for j, candidate := range candidates { - if candidate == 0 || i-candidate > q.MaxDistance { - break - } - if i-candidate == matches[0].Start-matches[0].Match { - // Don't bother to check for the same match we already have. - continue - } - if currentMatch.End-currentMatch.Start > q.HashLengths[j] { - // Don't bother with hashes that are shorter than the current match. - continue - } - if binary.LittleEndian.Uint32(src[candidate:]) != binary.LittleEndian.Uint32(src[i:]) { - break - } - m := extendMatch2(src, i, candidate, e.NextEmit) - if m.End-m.Start > q.MinLength && q.score(m) > q.score(currentMatch) { - currentMatch = m - } - } - } - - if currentMatch.End-currentMatch.Start < q.MinLength { - continue - } - - overlapPenalty := 0 - if matches[0] != (absoluteMatch{}) { - overlapPenalty = 275 - if currentMatch.Start <= matches[1].End { - // This match would completely replace the previous match, - // so there is no penalty for overlap. - overlapPenalty = 0 - } - } - - if q.score(currentMatch) <= q.score(matches[0])+overlapPenalty { - continue - } - - matches = [3]absoluteMatch{ - currentMatch, - matches[0], - matches[1], - } - - if matches[2] == (absoluteMatch{}) { - continue - } - - // We have three matches, so it's time to emit one and/or eliminate one. - switch { - case matches[0].Start < matches[2].End: - // The first and third matches overlap; discard the one in between. - matches = [3]absoluteMatch{ - matches[0], - matches[2], - absoluteMatch{}, - } - - case matches[0].Start < matches[2].End+q.MinLength: - // The first and third matches don't overlap, but there's no room for - // another match between them. Emit the first match and discard the second. - e.emit(matches[2]) - matches = [3]absoluteMatch{ - matches[0], - absoluteMatch{}, - absoluteMatch{}, - } - - default: - // Emit the first match, shortening it if necessary to avoid overlap with the second. - e.trim(matches[2], matches[1].Start, q.MinLength) - matches[2] = absoluteMatch{} - } - } - - // We've found all the matches now; emit the remaining ones. - if matches[1] != (absoluteMatch{}) { - e.trim(matches[1], matches[0].Start, q.MinLength) - } - if matches[0] != (absoluteMatch{}) { - e.emit(matches[0]) - } - - dst = e.Dst - if e.NextEmit < len(src) { - dst = append(dst, Match{ - Unmatched: len(src) - e.NextEmit, - }) - } - - return dst -}