Skip to content

Commit

Permalink
Improve hash implementation to prevent unnecessary collisions.
Browse files Browse the repository at this point in the history
Test number of collisions is small.
  • Loading branch information
helly25 committed May 6, 2024
1 parent 873759c commit 16dd47a
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 6 deletions.
8 changes: 4 additions & 4 deletions mbo/hash/hash_simple.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,15 @@ inline constexpr uint64_t GetSimpleHash(std::string_view data) {
// [-Werror=tautological-compare]
if (std::is_constant_evaluated()) {
std::size_t len = data.length();
uint64_t result = kArbitrary;
uint64_t result = kArbitrary + len;
std::size_t pos = 0;
while (len >= 4) {
uint64_t add = 0;
add += static_cast<uint64_t>(data[pos++]);
add += static_cast<uint64_t>(data[pos++]) << 8U;
add += static_cast<uint64_t>(data[pos++]) << 16U;
add += static_cast<uint64_t>(data[pos++]) << 24U;
result = result * 6'571 + (17 * add + (add >> 16U));
result = result * 6'571 ^ ((17 * add + (add >> 16U)) ^ (result >> 32U));
len -= 4;
}
if (len == 3) {
Expand All @@ -75,11 +75,11 @@ inline constexpr uint64_t GetSimpleHash(std::string_view data) {
} else {
const char* str = data.data();
const char* end = data.end() - 3;
uint64_t result = kArbitrary;
uint64_t result = kArbitrary + data.length();
uint64_t add = 0;
while (str < end) {
std::memcpy(&add, str, 4);
result = result * 6'571 + (17 * add + (add >> 16U));
result = result * 6'571 ^ ((17 * add + (add >> 16U)) ^ (result >> 32U));
str += 4; // NOLINT(*-pointer-arithmetic)
}
switch (data.length() % 4) { // NOLINT(*-default-case)
Expand Down
69 changes: 67 additions & 2 deletions mbo/hash/hash_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
#include "mbo/hash/hash.h"

#include <array>
#include <cstddef>
#include <random>
#include <string_view>

#include "gmock/gmock.h"
Expand All @@ -26,7 +28,9 @@
namespace mbo::hash::simple {
namespace {

using ::testing::Le;
using ::testing::Ne;
using ::testing::SizeIs;

struct HashTest : ::testing::Test {};

Expand Down Expand Up @@ -56,14 +60,75 @@ TEST_F(HashTest, Test) {
for (std::size_t n = 0; n < kData.size(); ++n) {
// Force using non constexpr `GetHash` by providing a non const string_view variable.
sv = kData[n];
EXPECT_THAT(GetHash(sv), kHashes[n]) << "Using the non const `string_view sv` to compute a hash should result the "
"same value as the compile-time computed hash.";
EXPECT_THAT(GetHash(sv), kHashes[n]) << "Using the non const `string_view sv` to compute a hash should result in "
"the same value as the compile-time computed hash.";
EXPECT_THAT(GetHash(sv), Ne(0));
EXPECT_THAT(GetHash(sv), Ne(-1));
EXPECT_THAT(GetHash(sv), Ne(~0ULL));
}
}

class RandomStringGenerator {
public:
static inline constexpr std::size_t kDefaultMaxLen = 80;

explicit RandomStringGenerator(int seed) : mt_(seed), char_dist_(0, kMaxChar) {}

std::string GetRandomString(std::size_t max_len = kDefaultMaxLen) const {
std::uniform_int_distribution<> str_len_dist(0, static_cast<int>(max_len));

size_t length{0};
while ((length = str_len_dist(mt_)) > max_len) {};
std::string str;
str.resize(length);
for (size_t n = 0; n < length; ++n) {
str[n] = static_cast<char>(char_dist_(mt_));
}
return str;
}

// NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
std::set<std::string> GetRandomStringSet(std::size_t num_strings, std::size_t max_len = kDefaultMaxLen) const {
std::set<std::string> result;
while (result.size() < num_strings) {
result.insert(GetRandomString(max_len));
}
return result;
}

private:
static inline constexpr int kMaxChar = 255; // Could exclude 255 (invalid UTF-8).
mutable std::mt19937 mt_;
mutable std::uniform_int_distribution<int> char_dist_;
};

#undef SANITIZER
// Clang
#if defined(__has_feature)
# if __has_feature(address_sanitizer) || __has_feature(memory_sanitizer)
# define SANITIZER
# endif
// GCC
#elif defined(__SANITIZE_ADDRESS__)
# define SANITIZER
#endif

TEST_F(HashTest, Collision) {
const RandomStringGenerator rsg(::testing::UnitTest::GetInstance()->random_seed());
#ifdef SANITIZER
constexpr std::size_t kNumStrings = 200'000;
#else // SANITIZER
constexpr std::size_t kNumStrings = 2'000'000;
#endif // SANITIZER
const std::set<std::string> strings = rsg.GetRandomStringSet(kNumStrings, 30);
std::set<uint64_t> hashes;
for (const auto& str : strings) {
hashes.insert(GetHash(str));
}
constexpr int kNumMaxCollisions = kNumStrings / 2'000'000;
EXPECT_THAT(strings.size() - hashes.size(), Le(kNumMaxCollisions));
}

} // namespace
} // namespace mbo::hash::simple

Expand Down

0 comments on commit 16dd47a

Please sign in to comment.