-
Notifications
You must be signed in to change notification settings - Fork 6.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Allow fractional bits/key in BloomFilterPolicy #6092
Changes from 3 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,9 +27,10 @@ namespace { | |
// See description in FastLocalBloomImpl | ||
class FastLocalBloomBitsBuilder : public BuiltinFilterBitsBuilder { | ||
public: | ||
FastLocalBloomBitsBuilder(const int bits_per_key, const int num_probes) | ||
: bits_per_key_(bits_per_key), num_probes_(num_probes) { | ||
assert(bits_per_key_); | ||
FastLocalBloomBitsBuilder(const int millibits_per_key) | ||
: millibits_per_key_(millibits_per_key), | ||
num_probes_(FastLocalBloomImpl::ChooseNumProbes(millibits_per_key_)) { | ||
assert(millibits_per_key >= 1000); | ||
} | ||
|
||
// No Copy allowed | ||
|
@@ -77,14 +78,15 @@ class FastLocalBloomBitsBuilder : public BuiltinFilterBitsBuilder { | |
|
||
int CalculateNumEntry(const uint32_t bytes) override { | ||
uint32_t bytes_no_meta = bytes >= 5u ? bytes - 5u : 0; | ||
return static_cast<int>(uint64_t{8} * bytes_no_meta / bits_per_key_); | ||
return static_cast<int>(uint64_t{8000} * bytes_no_meta / | ||
millibits_per_key_); | ||
} | ||
|
||
uint32_t CalculateSpace(const int num_entry) override { | ||
uint32_t num_cache_lines = 0; | ||
if (bits_per_key_ > 0 && num_entry > 0) { | ||
if (millibits_per_key_ > 0 && num_entry > 0) { | ||
num_cache_lines = static_cast<uint32_t>( | ||
(int64_t{num_entry} * bits_per_key_ + 511) / 512); | ||
(int64_t{num_entry} * millibits_per_key_ + 511999) / 512000); | ||
} | ||
return num_cache_lines * 64 + /*metadata*/ 5; | ||
} | ||
|
@@ -136,7 +138,7 @@ class FastLocalBloomBitsBuilder : public BuiltinFilterBitsBuilder { | |
} | ||
} | ||
|
||
int bits_per_key_; | ||
int millibits_per_key_; | ||
int num_probes_; | ||
std::vector<uint64_t> hash_entries_; | ||
}; | ||
|
@@ -187,7 +189,7 @@ using LegacyBloomImpl = LegacyLocalityBloomImpl</*ExtraRotates*/ false>; | |
|
||
class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder { | ||
public: | ||
explicit LegacyBloomBitsBuilder(const int bits_per_key, const int num_probes); | ||
explicit LegacyBloomBitsBuilder(const int bits_per_key); | ||
|
||
// No Copy allowed | ||
LegacyBloomBitsBuilder(const LegacyBloomBitsBuilder&) = delete; | ||
|
@@ -208,7 +210,6 @@ class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder { | |
} | ||
|
||
private: | ||
friend class FullFilterBlockTest_DuplicateEntries_Test; | ||
int bits_per_key_; | ||
int num_probes_; | ||
std::vector<uint32_t> hash_entries_; | ||
|
@@ -228,9 +229,9 @@ class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder { | |
void AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits); | ||
}; | ||
|
||
LegacyBloomBitsBuilder::LegacyBloomBitsBuilder(const int bits_per_key, | ||
const int num_probes) | ||
: bits_per_key_(bits_per_key), num_probes_(num_probes) { | ||
LegacyBloomBitsBuilder::LegacyBloomBitsBuilder(const int bits_per_key) | ||
: bits_per_key_(bits_per_key), | ||
num_probes_(LegacyNoLocalityBloomImpl::ChooseNumProbes(bits_per_key_)) { | ||
assert(bits_per_key_); | ||
} | ||
|
||
|
@@ -412,12 +413,24 @@ const std::vector<BloomFilterPolicy::Mode> BloomFilterPolicy::kAllUserModes = { | |
kAuto, | ||
}; | ||
|
||
BloomFilterPolicy::BloomFilterPolicy(int bits_per_key, Mode mode) | ||
: bits_per_key_(bits_per_key), mode_(mode) { | ||
// We intentionally round down to reduce probing cost a little bit | ||
num_probes_ = static_cast<int>(bits_per_key_ * 0.69); // 0.69 =~ ln(2) | ||
if (num_probes_ < 1) num_probes_ = 1; | ||
if (num_probes_ > 30) num_probes_ = 30; | ||
BloomFilterPolicy::BloomFilterPolicy(double bits_per_key, Mode mode) | ||
: mode_(mode) { | ||
// Sanitize bits_per_key | ||
if (bits_per_key < 1.0) { | ||
bits_per_key = 1.0; | ||
} else if (!(bits_per_key < 100.0)) { // including NaN | ||
bits_per_key = 100.0; | ||
} | ||
|
||
// Includes a nudge toward rounding up, to ensure on all platforms | ||
// that doubles specified with three decimal digits after the decimal | ||
// point are interpreted accurately. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No objection to the code, but just a question: do we need the result to be exactly the same for every platform? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No, we don't need that in any schema-compatibility sense. It's just a judgment call on what kind of surprises we want to avoid. |
||
millibits_per_key_ = static_cast<int>(bits_per_key * 1000.0 + 0.500001); | ||
|
||
// For better or worse, this is a rounding up of a nudged rounding up, | ||
// e.g. 7.4999999999999 will round up to 8, but that provides more | ||
// predictability against small arithmetic errors in floating point. | ||
whole_bits_per_key_ = (millibits_per_key_ + 500) / 1000; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What's the harm of not having There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I could do that. I just thought it somewhat safer not to unlock new behaviors in the legacy filters. Forward compatibility should work, but that's more manual effort to test. This way, format_version alone can back out of potential issues. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We are not breaking backward compatibility by doing it, right? An integer value should still generate the same result no matter what. |
||
} | ||
|
||
BloomFilterPolicy::~BloomFilterPolicy() {} | ||
|
@@ -433,7 +446,7 @@ void BloomFilterPolicy::CreateFilter(const Slice* keys, int n, | |
assert(mode_ == kDeprecatedBlock); | ||
|
||
// Compute bloom filter size (in both bits and bytes) | ||
uint32_t bits = static_cast<uint32_t>(n * bits_per_key_); | ||
uint32_t bits = static_cast<uint32_t>(n * whole_bits_per_key_); | ||
|
||
// For small n, we can see a very high false positive rate. Fix it | ||
// by enforcing a minimum bloom filter length. | ||
|
@@ -442,12 +455,15 @@ void BloomFilterPolicy::CreateFilter(const Slice* keys, int n, | |
uint32_t bytes = (bits + 7) / 8; | ||
bits = bytes * 8; | ||
|
||
int num_probes = | ||
LegacyNoLocalityBloomImpl::ChooseNumProbes(whole_bits_per_key_); | ||
|
||
const size_t init_size = dst->size(); | ||
dst->resize(init_size + bytes, 0); | ||
dst->push_back(static_cast<char>(num_probes_)); // Remember # of probes | ||
dst->push_back(static_cast<char>(num_probes)); // Remember # of probes | ||
char* array = &(*dst)[init_size]; | ||
for (int i = 0; i < n; i++) { | ||
LegacyNoLocalityBloomImpl::AddHash(BloomHash(keys[i]), bits, num_probes_, | ||
LegacyNoLocalityBloomImpl::AddHash(BloomHash(keys[i]), bits, num_probes, | ||
array); | ||
} | ||
} | ||
|
@@ -470,7 +486,7 @@ bool BloomFilterPolicy::KeyMayMatch(const Slice& key, | |
// Consider it a match. | ||
return true; | ||
} | ||
// NB: using k not num_probes_ | ||
// NB: using stored k not num_probes for whole_bits_per_key_ | ||
return LegacyNoLocalityBloomImpl::HashMayMatch(BloomHash(key), bits, k, | ||
array); | ||
} | ||
|
@@ -504,9 +520,9 @@ FilterBitsBuilder* BloomFilterPolicy::GetFilterBitsBuilderInternal( | |
case kDeprecatedBlock: | ||
return nullptr; | ||
case kFastLocalBloom: | ||
return new FastLocalBloomBitsBuilder(bits_per_key_, num_probes_); | ||
return new FastLocalBloomBitsBuilder(millibits_per_key_); | ||
case kLegacyBloom: | ||
return new LegacyBloomBitsBuilder(bits_per_key_, num_probes_); | ||
return new LegacyBloomBitsBuilder(whole_bits_per_key_); | ||
} | ||
} | ||
assert(false); | ||
|
@@ -649,7 +665,7 @@ FilterBitsReader* BloomFilterPolicy::GetBloomBitsReader( | |
return new AlwaysTrueFilter(); | ||
} | ||
|
||
const FilterPolicy* NewBloomFilterPolicy(int bits_per_key, | ||
const FilterPolicy* NewBloomFilterPolicy(double bits_per_key, | ||
bool use_block_based_builder) { | ||
BloomFilterPolicy::Mode m; | ||
if (use_block_based_builder) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Any reason for this sanitizing? I know it doesn't make sense to use lower than 1.0. It doesn't seem to make sense to use 1.0 either. In that case, should we just leave it as it is?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
1.0 is OK if you consider a 63% FP rate better than nothing. I do.
The big thing is getting rid of values like negatives, infinities, or NaN that can gum up the conversion. And I can see these arising in client code due to subtle logical errors, etc. I think it's better to keep the value within a generous realm of "reasonable" than to always trust the client. Note: this plays into PR #6088 because that makes it more likely that client code might compute rather than hard-code bits_per_key values.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If 1.0 is OK, then why is 0.99 not? Why is 0.98 not? I think it's more of a code-readability problem: when 1.0 is hard-coded here, readers might assume there is something special about it. I don't have an objection though, as long as a code comment explains it clearly.