kuzudb · ray6080 · Feb 27, 2024 · Feb 26, 2024 · ray6080 · Feb 27, 2024
diff --git a/src/include/function/hash/hash_functions.h b/src/include/function/hash/hash_functions.h
@@ -9,6 +9,7 @@
+#include <cstring>
 #include "common/types/int128_t.h"
 #include "common/types/interval_t.h"
 #include "common/types/ku_string.h"
+#include "common/types/types.h"
 #include "common/vector/value_vector.h"
 
 namespace kuzu {
@@ -133,31 +134,52 @@
 template<>
 inline void Hash::operation(
     const double& key, common::hash_t& result, common::ValueVector* /*keyVector*/) {
-    result = std::hash<double>()(key);
+    // Hashes the bit pattern of a double into result.
+    // 0.0 and -0.0 compare equal but are not byte-equivalent, so both are hashed as 0.
+    if (key == 0) {
+        result = murmurhash64(0);
+        return;
+    }
+    // Copy the bits out with memcpy: reading the double through a reinterpret_cast'ed
+    // uint64_t pointer would violate the strict aliasing rules.
+    static_assert(sizeof(uint64_t) == sizeof(double));
+    uint64_t bits = 0;
+    std::memcpy(&bits, &key, sizeof(bits));
+    result = murmurhash64(bits);
 }
 
 template<>
 inline void Hash::operation(
     const float& key, common::hash_t& result, common::ValueVector* /*keyVector*/) {
-    result = std::hash<float>()(key);
+    // Hashes the bit pattern of a float into result.
+    // 0.0f and -0.0f compare equal but are not byte-equivalent, so both are hashed as 0.
+    if (key == 0) {
+        result = murmurhash64(0);
+        return;
+    }
+    // Copy the bits out with memcpy: reading the float through a reinterpret_cast'ed
+    // uint32_t pointer would violate the strict aliasing rules.
+    static_assert(sizeof(uint32_t) == sizeof(float));
+    uint32_t bits = 0;
+    std::memcpy(&bits, &key, sizeof(bits));
+    result = murmurhash64(bits);
 }
 
 template<>
 inline void Hash::operation(
-    const std::string& key, common::hash_t& result, common::ValueVector* /*keyVector*/) {
-    result = std::hash<std::string>()(key);
+    const std::string_view& key, common::hash_t& result, common::ValueVector* /*keyVector*/) {
+    // Hashes the key in 8-byte blocks, folding each block's murmurhash64 into the running
+    // value, then hashes the remaining 0-7 bytes packed into one zero-padded word. The tail
+    // word is always combined, even when there are no remaining bytes.
+    constexpr size_t BLOCK_SIZE = sizeof(uint64_t);
+    const size_t numBlocks = key.size() / BLOCK_SIZE;
+    common::hash_t hashValue = 0;
+    for (size_t i = 0u; i < numBlocks; i++) {
+        // key.data() carries no alignment guarantee, so load each block through memcpy
+        // instead of dereferencing a uint64_t pointer into the character data.
+        uint64_t block = 0;
+        std::memcpy(&block, key.data() + i * BLOCK_SIZE, BLOCK_SIZE);
+        hashValue =
+            kuzu::function::combineHashScalar(hashValue, kuzu::function::murmurhash64(block));
+    }
+    const size_t tailStart = numBlocks * BLOCK_SIZE;
+    uint64_t last = 0;
+    for (size_t i = 0u; i < key.size() - tailStart; i++) {
+        // Widen through uint8_t before shifting: a plain char is promoted to int, which
+        // sign-extends bytes >= 0x80 and makes shifts of 32 bits or more undefined.
+        last |= static_cast<uint64_t>(static_cast<uint8_t>(key[tailStart + i])) << (i * 8);
+    }
+    hashValue = kuzu::function::combineHashScalar(hashValue, kuzu::function::murmurhash64(last));
+    result = hashValue;
 }
 
 template<>
 inline void Hash::operation(
-    const std::string_view& key, common::hash_t& result, common::ValueVector* /*keyVector*/) {
-    result = std::hash<std::string_view>()(key);
+    const std::string& key, common::hash_t& result, common::ValueVector* /*keyVector*/) {
+    // Forward to Hash::operation on a string_view over the same bytes.
+    const std::string_view keyView(key);
+    Hash::operation(keyView, result);
 }
 
 template<>
 inline void Hash::operation(
     const common::ku_string_t& key, common::hash_t& result, common::ValueVector* /*keyVector*/) {
-    result = std::hash<std::string_view>()(key.getAsStringView());
+    // Forward to Hash::operation on the value returned by getAsStringView().
+    const auto keyView = key.getAsStringView();
+    Hash::operation(keyView, result);
 }
 
 template<>

diff --git a/src/include/processor/operator/persistent/index_builder.h b/src/include/processor/operator/persistent/index_builder.h
@@ -66,18 +66,18 @@ class IndexBuilderLocalBuffers {
     explicit IndexBuilderLocalBuffers(IndexBuilderGlobalQueues& globalQueues);
 
     void insert(std::string key, common::offset_t value) {
-        auto indexPos = storage::getHashIndexPosition(key.c_str());
+        // Select the local string buffer from the key's hash index position.
+        auto indexPos = storage::HashIndexUtils::getHashIndexPosition(std::string_view(key));
         auto& stringBuffer = (*std::get<UniqueBuffers<std::string>>(buffers))[indexPos];
         if (stringBuffer.full()) {
-            // StaticVector's move constructor leavse the original vector valid and empty
+            // StaticVector's move constructor leaves the original vector valid and empty
             globalQueues->insert(indexPos, std::move(stringBuffer));
         }
-        stringBuffer.push_back(std::make_pair(key, value)); // NOLINT(bugprone-use-after-move)
+        // key is a by-value parameter that is not read again, so move it into the pair
+        // instead of copying the string.
+        stringBuffer.push_back(
+            std::make_pair(std::move(key), value)); // NOLINT(bugprone-use-after-move)
     }
 
     template<common::HashablePrimitive T>
     void insert(T key, common::offset_t value) {
-        auto indexPos = storage::getHashIndexPosition(key);
+        auto indexPos = storage::HashIndexUtils::getHashIndexPosition(key);
         auto& buffer = (*std::get<UniqueBuffers<T>>(buffers))[indexPos];
         if (buffer.full()) {
             globalQueues->insert(indexPos, std::move(buffer));

@@ -207,7 +207,7 @@ class PrimaryKeyIndex {
     template<typename T, typename S = T>
     inline HashIndex<T, S>* getTypedHashIndex(T key) {
-        return common::ku_dynamic_cast<OnDiskHashIndex*, HashIndex<T, S>*>(
-            hashIndices[getHashIndexPosition(key)].get());
+        // Pick the entry of hashIndices that the key hashes to, then cast it to the typed index.
+        const auto indexPos = HashIndexUtils::getHashIndexPosition(key);
+        OnDiskHashIndex* untypedIndex = hashIndices[indexPos].get();
+        return common::ku_dynamic_cast<OnDiskHashIndex*, HashIndex<T, S>*>(untypedIndex);
     }
 
     inline bool lookup(

@@ -138,31 +138,32 @@
     // enough space already.
     template<common::HashablePrimitive T>
     bool append(T key, common::offset_t value) {
-        return appendWithIndexPos(key, value, getHashIndexPosition(key));
+        // Compute the key's hash index position and delegate to appendWithIndexPos.
+        const auto indexPos = HashIndexUtils::getHashIndexPosition(key);
+        return appendWithIndexPos(key, value, indexPos);
     }
     bool append(std::string_view key, common::offset_t value) {
-        return appendWithIndexPos(key, value, getHashIndexPosition(key));
+        return appendWithIndexPos(key, value, HashIndexUtils::getHashIndexPosition(key));
     }
     template<common::HashablePrimitive T>
     bool appendWithIndexPos(T key, common::offset_t value, uint64_t indexPos) {
         KU_ASSERT(keyDataTypeID == common::TypeUtils::getPhysicalTypeIDForType<T>());
-        KU_ASSERT(getHashIndexPosition(key) == indexPos);
-        return getTypedHashIndex<T>(indexPos)->append(key, value);
+        // The caller-supplied position must be the one the key actually hashes to.
+        KU_ASSERT(indexPos == HashIndexUtils::getHashIndexPosition(key));
+        auto* typedIndex = getTypedHashIndex<T>(indexPos);
+        return typedIndex->append(key, value);
     }
     bool appendWithIndexPos(std::string_view key, common::offset_t value, uint64_t indexPos) {
         KU_ASSERT(keyDataTypeID == common::PhysicalTypeID::STRING);
-        KU_ASSERT(getHashIndexPosition(key) == indexPos);
+        KU_ASSERT(HashIndexUtils::getHashIndexPosition(key) == indexPos);
         return getTypedHashIndex<std::string_view, common::ku_string_t>(indexPos)->append(
             key, value);
     }
     template<common::HashablePrimitive T>
     bool lookup(T key, common::offset_t& result) {
         KU_ASSERT(keyDataTypeID == common::TypeUtils::getPhysicalTypeIDForType<T>());
-        return getTypedHashIndex<T>(getHashIndexPosition(key))->lookup(key, result);
+        // Look the key up in the typed index at the key's hash index position.
+        const auto indexPos = HashIndexUtils::getHashIndexPosition(key);
+        auto* typedIndex = getTypedHashIndex<T>(indexPos);
+        return typedIndex->lookup(key, result);
     }
     bool lookup(std::string_view key, common::offset_t& result) {
         KU_ASSERT(keyDataTypeID == common::PhysicalTypeID::STRING);
-        return getTypedHashIndex<std::string_view, common::ku_string_t>(getHashIndexPosition(key))
+        return getTypedHashIndex<std::string_view, common::ku_string_t>(
+            HashIndexUtils::getHashIndexPosition(key))
             ->lookup(key, result);
     }
 

@@ -1,7 +1,5 @@
 #pragma once
 
-#include <functional>
-
 #include "common/types/ku_string.h"
 #include "common/types/types.h"
 #include "function/hash/hash_functions.h"
@@ -28,16 +26,6 @@ static constexpr common::page_idx_t O_SLOTS_HEADER_PAGE_IDX = 2;
 static constexpr common::page_idx_t NUM_HEADER_PAGES = 3;
 static constexpr uint64_t INDEX_HEADER_IDX_IN_ARRAY = 0;
 
-inline uint64_t getHashIndexPosition(common::HashablePrimitive auto key) {
-    common::hash_t hash;
-    function::Hash::operation(key, hash, nullptr /*keyVector*/);
-    return (hash >> (64 - NUM_HASH_INDEXES_LOG2)) & (NUM_HASH_INDEXES - 1);
-}
-inline uint64_t getHashIndexPosition(std::string_view key) {
-    return (std::hash<std::string_view>()(key) >> (64 - NUM_HASH_INDEXES_LOG2)) &
-           (NUM_HASH_INDEXES - 1);
-}
-
 enum class SlotType : uint8_t { PRIMARY = 0, OVF = 1 };
 
 struct SlotInfo {
@@ -77,6 +65,10 @@ class HashIndexUtils {
         return slotId;
     }
 
+    // Maps a key to a hash index position: the hash is shifted right by
+    // (64 - NUM_HASH_INDEXES_LOG2) bits and masked with (NUM_HASH_INDEXES - 1).
+    inline static uint64_t getHashIndexPosition(common::IndexHashable auto key) {
+        const auto shift = 64 - NUM_HASH_INDEXES_LOG2;
+        const auto mask = NUM_HASH_INDEXES - 1;
+        const auto keyHash = HashIndexUtils::hash(key);
+        return (keyHash >> shift) & mask;
+    }
+
     static inline uint64_t getNumRequiredEntries(
         uint64_t numExistingEntries, uint64_t numNewEntries) {
         return ceil((double)(numExistingEntries + numNewEntries) * common::DEFAULT_HT_LOAD_FACTOR);

diff --git a/test/test_files/update_node/create_empty.test b/test/test_files/update_node/create_empty.test
@@ -353,6 +353,38 @@ foobar-ewe-j323-8nd*-ewew
 -STATEMENT MATCH (t:test) WHERE t.id = to_float(0.1) RETURN t.id;
 ---- 1
 0.100000
+-STATEMENT CREATE (t:test {id:0});
+---- ok
+# Zero and negative zero need to be equivalent
+-STATEMENT MATCH (t:test) WHERE t.id = (0.0 * -1) RETURN t.id;
+---- 1
+0.000000
+-STATEMENT MATCH (t:test) WHERE t.id = -0.0 RETURN t.id;
+---- 1
+0.000000
+-STATEMENT MATCH (t:test) WHERE t.id = to_float("-0.0") RETURN t.id;
+---- 1
+0.000000
+# Positive infinity is a valid key and must be found by an equality lookup; -inf must not match it
+-STATEMENT CREATE (t:test {id:1.0/0.0});
+---- ok
+-STATEMENT MATCH (t:test) WHERE t.id = 1.0/0.0 RETURN t.id;
+---- 1
+inf
+-STATEMENT MATCH (t:test) WHERE t.id = to_float("inf") RETURN t.id;
+---- 1
+inf
+-STATEMENT MATCH (t:test) WHERE t.id = to_float("-inf") RETURN t.id;
+---- 0
+# NaN never equals itself: NaN keys can be created repeatedly but are never matched by a lookup
+-STATEMENT CREATE (t:test {id:0.0/0.0});
+---- ok
+-STATEMENT MATCH (t:test) WHERE t.id = 0.0/0.0 RETURN t.id;
+---- 0
+-STATEMENT CREATE (t:test {id:sqrt(-1.0)});
+---- ok
+-STATEMENT MATCH (t:test) WHERE t.id = sqrt(-1.0) RETURN t.id;
+---- 0
 
 -CASE CreateBlobPK
 -STATEMENT CREATE NODE TABLE test(id BLOB, PRIMARY KEY(id));