apbf: Switch to fast reduce method.
This modifies the modular reduction step to make use of the same fast
reduce mechanism used in the gcs filters.  As can be seen in the
following benchmark results, it more than doubles the performance of the
primary filter operations.

In order to experimentally validate that the mapping adheres to the
theoretical results and does not have any adverse effects on the false
positive rates, the same validation methodology described in README.md
was conducted again, and the README is updated accordingly.  All results
are well within the expected margin of error.

Finally, the README is also updated with the new benchmark results, and
the required Go version in go.mod is bumped due to the addition of the
math/bits import.

name                     old time/op    new time/op    delta
------------------------------------------------------------------------------
capacity=1000, fprate=0.1%
--------------------------
BenchmarkAdd              158ns ± 1%      59ns ± 1%    -62.59%  (p=0.008 n=5+5)
BenchmarkContainsTrue     183ns ± 1%      69ns ± 2%    -62.27%  (p=0.008 n=5+5)
BenchmarkContainsFalse   61.3ns ±40%    42.0ns ±26%   -31.41%  (p=0.032 n=5+5)

capacity=1000, fprate=0.01%
---------------------------
BenchmarkAdd              211ns ± 1%      69ns ± 1%    -67.07%  (p=0.008 n=5+5)
BenchmarkContainsTrue     236ns ± 2%      80ns ± 1%    -66.16%  (p=0.008 n=5+5)
BenchmarkContainsFalse   59.6ns ±24%    37.7ns ± 5%   -36.74%  (p=0.008 n=5+5)

capacity=1000, fprate=0.001%
----------------------------
BenchmarkAdd              247ns ± 0%      78ns ± 2%    -68.32%  (p=0.008 n=5+5)
BenchmarkContainsTrue     272ns ± 1%      89ns ± 1%    -67.50%  (p=0.008 n=5+5)
BenchmarkContainsFalse   58.6ns ±26%    37.0ns ± 4%   -36.98%  (p=0.008 n=5+5)

capacity=100000, fprate=0.01%
-----------------------------
BenchmarkAdd              205ns ± 2%      80ns ± 1%    -61.12%  (p=0.008 n=5+5)
BenchmarkContainsTrue     219ns ± 1%      80ns ± 1%    -63.39%  (p=0.008 n=5+5)
BenchmarkContainsFalse   70.3ns ±46%    37.6ns ±10%   -46.61%  (p=0.008 n=5+5)

capacity=100000, fprate=0.0001%
-------------------------------
BenchmarkAdd              275ns ± 2%     110ns ± 1%    -60.10%  (p=0.008 n=5+5)
BenchmarkContainsTrue     287ns ± 1%      98ns ± 1%    -65.98%  (p=0.008 n=5+5)
BenchmarkContainsFalse   56.6ns ±45%    36.3ns ± 6%   -35.93%  (p=0.008 n=5+5)

capacity=1000000, fprate=0.00001%
---------------------------------
BenchmarkAdd              413ns ± 3%     205ns ± 2%    -50.41%  (p=0.016 n=5+5)
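
The results above are benchstat output.  For reference, a comparison like
this can be reproduced with the standard Go tooling and the benchstat
utility (golang.org/x/perf/cmd/benchstat); the file names are just
placeholders:

    cd container/apbf
    # On the parent commit (5e0c85e):
    go test -run=NONE -bench=. -count=5 . > old.txt
    # After checking out this commit (320dbb7):
    go test -run=NONE -bench=. -count=5 . > new.txt
    # Compare the two runs:
    benchstat old.txt new.txt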
davecgh committed Feb 2, 2021
1 parent 5e0c85e commit 320dbb7
Showing 3 changed files with 59 additions and 29 deletions.
53 changes: 27 additions & 26 deletions container/apbf/README.md
@@ -31,7 +31,7 @@ unbounded event stream with a tunable upper bound on the false positive rate.
 
 ### Additional Implementation Details
 
-This implementation deviates from the original paper in at least a couple of
+This implementation deviates from the original paper in at least the following
 important ways:
 
 - It uses Dillinger-Manolis enhanced double hashing instead of the more
@@ -41,6 +41,7 @@ important ways:
 - Every filter is given a unique key for the internal hashing logic so each one
   will have a unique set of false positives and that key is automatically
   changed when the filter is manually reset by the caller
+- Lemire fast reduction is used instead of standard modular reduction
 
 ## Choosing Parameters
 
@@ -87,18 +88,18 @@ described above, for the parameter selection.
 
 Capacity | Target FP | Actual Observed FP
 ---------|-----------|-------------------
-1000     | 0.1%      | 0.099%
-10000    | 0.1%      | 0.096%
+1000     | 0.1%      | 0.097%
+10000    | 0.1%      | 0.099%
 10000000 | 0.1%      | 0.1%
-1000     | 1.0%      | 0.948%
-10000    | 1.0%      | 0.868%
+1000     | 1.0%      | 0.867%
+10000    | 1.0%      | 0.862%
 10000000 | 1.0%      | 0.857%
-1000     | 2.0%      | 1.448%
-10000    | 2.0%      | 1.47%
-10000000 | 2.0%      | 1.46%
+1000     | 2.0%      | 1.464%
+10000    | 2.0%      | 1.451%
+10000000 | 2.0%      | 1.461%
 1000     | 10.0%     | 6.74%
-10000    | 10.0%     | 6.834%
-10000000 | 10.0%     | 7.027%
+10000    | 10.0%     | 6.96%
+10000000 | 10.0%     | 7.024%
 
 ## Memory Usage
 
@@ -126,32 +127,32 @@ operations. The benchmarks are from a Ryzen 7 1700 processor.
 
 Capacity | Target FP | Time / Op   | Allocs / Op
 ---------|-----------|-------------|------------
-1000     | 0.1%      | 159ns ± 5%  | 0
-1000     | 0.01%     | 208ns ± 0%  | 0
-1000     | 0.001%    | 245ns ± 0%  | 0
-100000   | 0.01%     | 198ns ± 3%  | 0
-100000   | 0.0001%   | 271ns ± 2%  | 0
-1000000  | 0.00001%  | 496ns ± 7%  | 0
+1000     | 0.1%      | 59ns ± 1%   | 0
+1000     | 0.01%     | 69ns ± 2%   | 0
+1000     | 0.001%    | 78ns ± 2%   | 0
+100000   | 0.01%     | 80ns ± 1%   | 0
+100000   | 0.0001%   | 110ns ± 1%  | 0
+1000000  | 0.00001%  | 205ns ± 2%  | 0
 
 ### `Contains` (item matches filter, worst case)
 
 Capacity | Target FP | Time / Op   | Allocs / Op
 ---------|-----------|-------------|------------
-1000     | 0.1%      | 175ns ± 1%  | 0
-1000     | 0.01%     | 228ns ± 1%  | 0
-1000     | 0.001%    | 267ns ± 1%  | 0
-100000   | 0.01%     | 211ns ± 1%  | 0
-100000   | 0.0001%   | 282ns ± 1%  | 0
+1000     | 0.1%      | 69ns ± 2%   | 0
+1000     | 0.01%     | 80ns ± 1%   | 0
+1000     | 0.001%    | 89ns ± 1%   | 0
+100000   | 0.01%     | 80ns ± 1%   | 0
+100000   | 0.0001%   | 98ns ± 1%   | 0
 
 ### `Contains` (item does NOT match filter)
 
 Capacity | Target FP | Time / Op   | Allocs / Op
 ---------|-----------|-------------|------------
-1000     | 0.1%      | 45.4ns ± 0% | 0
-1000     | 0.01%     | 57.6ns ±21% | 0
-1000     | 0.001%    | 56.2ns ±19% | 0
-100000   | 0.01%     | 48.7ns ±15% | 0
-100000   | 0.0001%   | 49.6ns ±19% | 0
+1000     | 0.1%      | 42.0ns ±26% | 0
+1000     | 0.01%     | 37.7ns ± 5% | 0
+1000     | 0.001%    | 37.0ns ± 4% | 0
+100000   | 0.01%     | 37.6ns ±10% | 0
+100000   | 0.0001%   | 36.3ns ± 6% | 0
 
 ## Installation and Updating
 
33 changes: 31 additions & 2 deletions container/apbf/filter.go
@@ -10,6 +10,7 @@ package apbf
 import (
 	"encoding/binary"
 	"math"
+	"math/bits"
 	"sync"
 	"time"
 
@@ -302,6 +303,34 @@ func (f *Filter) setBit(bit uint64) {
 	f.data[bit>>3] |= 1 << (bit & 7)
 }
 
+// fastReduce calculates a mapping that is more or less equivalent to x mod N.
+// However, instead of using a mod operation that can lead to slowness on many
+// processors when not using a power of two due to unnecessary division, this
+// uses a "multiply-and-shift" trick that eliminates all divisions as described
+// in a blog post by Daniel Lemire, located at the following site at the time
+// of this writing:
+// https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
+//
+// Since that link might disappear, the general idea is to multiply by N and
+// shift right by the bit width of x.  Since x is a 64-bit integer in this
+// case, it becomes:
+//
+//	(x * N) / 2^64 == (x * N) >> 64
+//
+// This is a fair map since it maps integers in the range [0,2^64) to multiples
+// of N in [0, N*2^64) and then divides by 2^64 to map all multiples of N in
+// [0,2^64) to 0, all multiples of N in [2^64, 2*2^64) to 1, etc.  This results
+// in either ceil(2^64/N) or floor(2^64/N) multiples of N.
+func fastReduce(x, N uint64) uint64 {
+	// This uses math/bits to perform the 128-bit multiplication as the compiler
+	// will replace it with the relevant intrinsic on most architectures.
+	//
+	// The high 64 bits of a 128-bit product are the same as shifting the entire
+	// product right by 64 bits.
+	hi, _ := bits.Mul64(x, N)
+	return hi
+}
+
 // Add inserts the provided data into the filter.
 //
 // This function is safe for concurrent access.
@@ -335,7 +364,7 @@ func (f *Filter) Add(data []byte) {
 	hash1, hash2 := siphash.Hash128(f.key0, f.key1, data)
 	derivedIdx, acc := deriveIndex(logicalSlice, hash1, hash2)
 	for i := uint8(0); i < f.k; i++ {
-		f.setBit(sliceBitOffset + derivedIdx%f.bitsPerSlice)
+		f.setBit(sliceBitOffset + fastReduce(derivedIdx, f.bitsPerSlice))
 
 		// Move to the next logical slice while wrapping around the ring buffer
 		// if needed.
@@ -404,7 +433,7 @@ func (f *Filter) Contains(data []byte) bool {
 	hash1, hash2 := siphash.Hash128(f.key0, f.key1, data)
 	derivedIdx, acc := deriveIndex(logicalSlice, hash1, hash2)
 	for {
-		if f.isBitSet(sliceBitOffset + derivedIdx%f.bitsPerSlice) {
+		if f.isBitSet(sliceBitOffset + fastReduce(derivedIdx, f.bitsPerSlice)) {
 			// Successful query when the required number of consecutive matches
 			// is achieved.
 			curMatches++
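
To make the fair-map argument in the new fastReduce comment concrete, here is
a minimal standalone sketch, separate from the commit itself.  It copies
fastReduce as added above and checks two properties: the result always lands
in [0, N), and inputs spread evenly over the 64-bit range fill all N buckets
nearly evenly, just as x % N would.  The evenly strided inputs are only a
stand-in for real hash outputs.

package main

import (
	"fmt"
	"math/bits"
)

// fastReduce mirrors the function added in this commit: it maps x from
// [0, 2^64) onto [0, N) by taking the high 64 bits of the 128-bit product.
func fastReduce(x, N uint64) uint64 {
	hi, _ := bits.Mul64(x, N)
	return hi
}

func main() {
	const N = 1000 // e.g. a bitsPerSlice value that is not a power of two

	// The result is always within [0, N), even at the extremes.
	for _, x := range []uint64{0, 1, N, 1 << 32, 1<<64 - 1} {
		if r := fastReduce(x, N); r >= N {
			panic(fmt.Sprintf("fastReduce(%d, %d) = %d out of range", x, N, r))
		}
	}

	// Stride evenly across [0, 2^64) and count how often each bucket is
	// hit.  A fair map keeps the counts within one of each other.
	const samples = 1 << 20
	const step uint64 = 1 << 44 // 2^64 / samples
	counts := make([]uint64, N)
	var x uint64
	for i := 0; i < samples; i++ {
		counts[fastReduce(x, N)]++
		x += step
	}
	lo, hi := counts[0], counts[0]
	for _, c := range counts {
		if c < lo {
			lo = c
		}
		if c > hi {
			hi = c
		}
	}
	fmt.Printf("bucket counts range from %d to %d\n", lo, hi) // e.g. 1048 to 1049
}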
2 changes: 1 addition & 1 deletion container/apbf/go.mod
@@ -1,5 +1,5 @@
 module github.com/decred/dcrd/container/apbf
 
-go 1.11
+go 1.13
 
 require github.com/dchest/siphash v1.2.2
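
For broader context, a minimal usage sketch of the package whose hot paths
were changed above.  The constructor name and signature (NewFilter taking a
capacity and target false positive rate) are assumptions for illustration
based on the parameters discussed in the README, not something this diff
confirms:

package main

import (
	"fmt"

	"github.com/decred/dcrd/container/apbf"
)

func main() {
	// Assumed constructor: a filter sized for 1000 items with a 0.1%
	// target false positive rate (the first benchmark configuration).
	filter := apbf.NewFilter(1000, 0.001)

	// Add and Contains now derive their per-slice bit positions with
	// fastReduce instead of a modulo operation.
	filter.Add([]byte("some item"))
	fmt.Println(filter.Contains([]byte("some item")))  // true
	fmt.Println(filter.Contains([]byte("other item"))) // false (with high probability)
}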
