tzaeschke · tzaeschke · Nov 22, 2022 · Nov 18, 2022 · Nov 18, 2022 · Nov 18, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,7 +10,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Improved performance of window queries by executing them partially as point queries.
   This works best for point datasets, and somewhat for box datasets with "include" queries.
   There is no benefit for "intersection" queries. [#88](https://github.com/tzaeschke/phtree-cpp/issues/88)
-- Improved benchmarks for insert and query to use a more compact format. [91](https://github.com/tzaeschke/phtree-cpp/pull/91)
+- Improved benchmarks for insert and query to use a more compact format. [#91](https://github.com/tzaeschke/phtree-cpp/pull/91)
+- Improved performance of window queries by optimizing calculation of min/max masks.
+  Improved performance of queries and updates by changing bit-width of min/max masks and
+  hc_pos_t. [#95](https://github.com/tzaeschke/phtree-cpp/pull/95)
 
 ### Removed
 - bazel version requirement file `.bazelversion`. [#89](https://github.com/tzaeschke/phtree-cpp/issues/89)

diff --git a/benchmark/hd_query_d_benchmark.cc b/benchmark/hd_query_d_benchmark.cc
@@ -172,6 +172,18 @@ void PhTree6D_FE(benchmark::State& state, Arguments&&...) {
     benchmark.Benchmark(state);
 }
 
+template <typename... Arguments>
+void PhTree10D_FE(benchmark::State& state, Arguments&&...) {
+    IndexBenchmark<10, MIN_MAX_FOR_EACH> benchmark{state};
+    benchmark.Benchmark(state);
+}
+
+template <typename... Arguments>
+void PhTree20D_FE(benchmark::State& state, Arguments&&...) {
+    IndexBenchmark<20, MIN_MAX_FOR_EACH> benchmark{state};
+    benchmark.Benchmark(state);
+}
+
 template <typename... Arguments>
 void PhTree6D_IT(benchmark::State& state, Arguments&&...) {
     IndexBenchmark<6, MIN_MAX_ITER> benchmark{state};
@@ -196,6 +208,16 @@ BENCHMARK_CAPTURE(PhTree6D_FE, WQ, 0)
     ->Ranges({{1000, 1000 * 1000}, {TestGenerator::CLUSTER, TestGenerator::CUBE}})
     ->Unit(benchmark::kMillisecond);
 
+BENCHMARK_CAPTURE(PhTree10D_FE, WQ, 0)
+    ->RangeMultiplier(10)
+    ->Ranges({{1000, 1000 * 1000}, {TestGenerator::CLUSTER, TestGenerator::CUBE}})
+    ->Unit(benchmark::kMillisecond);
+
+BENCHMARK_CAPTURE(PhTree20D_FE, WQ, 0)
+    ->RangeMultiplier(10)
+    ->Ranges({{1000, 1000 * 1000}, {TestGenerator::CLUSTER, TestGenerator::CUBE}})
+    ->Unit(benchmark::kMillisecond);
+
 BENCHMARK_CAPTURE(PhTree6D_IT, WQ, 0)
     ->RangeMultiplier(10)
     ->Ranges({{1000, 1000 * 1000}, {TestGenerator::CLUSTER, TestGenerator::CUBE}})

diff --git a/include/phtree/common/base_types.h b/include/phtree/common/base_types.h
@@ -55,7 +55,15 @@ using bit_mask_t = typename std::make_unsigned<SCALAR>::type;
 template <typename SCALAR>
 static constexpr bit_mask_t<SCALAR> MAX_MASK = std::numeric_limits<bit_mask_t<SCALAR>>::max();
 using dimension_t = size_t;  // Number of dimensions
-using hc_pos_t = uint64_t;
+// We have two types that represent hypercube addresses (HC position).
+// The hc_pos_dim_t uses a template parameter to determine how many bits are needed, this is either
+// 32bit or 64bit. This parameter is used where HC positions are stored because benchmarks show a
+// difference in performance when this is used.
+// The hc_pos_64_t type is always set to 64. It is used where computations play a role that appear
+// to prefer being in always 64bit, mainly in CalcPosInArray() and in Node.
+template<dimension_t DIM>
+using hc_pos_dim_t = std::conditional_t<(DIM < 32), uint32_t, uint64_t>;
+using hc_pos_64_t = uint64_t;
 
 // ************************************************************************
 // Basic structs and classes

diff --git a/include/phtree/common/common.h b/include/phtree/common/common.h
@@ -49,7 +49,7 @@ namespace improbable::phtree {
  * an array.
  */
 template <dimension_t DIM, typename SCALAR>
-static hc_pos_t CalcPosInArray(const PhPoint<DIM, SCALAR>& valSet, bit_width_t postfix_len) {
+static hc_pos_64_t CalcPosInArray(const PhPoint<DIM, SCALAR>& valSet, bit_width_t postfix_len) {
     // n=DIM,  i={0..n-1}
     // i = 0 :  |0|1|0|1|0|1|0|1|
     // i = 1 :  | 0 | 1 | 0 | 1 |
@@ -58,7 +58,7 @@ static hc_pos_t CalcPosInArray(const PhPoint<DIM, SCALAR>& valSet, bit_width_t p
     // Following formula was for inverse ordering of current ordering...
     // pos = sum (i=1..n, len/2^i) = sum (..., 2^(n-i))
     bit_mask_t<SCALAR> valMask = bit_mask_t<SCALAR>(1) << postfix_len;
-    hc_pos_t pos = 0;
+    hc_pos_64_t pos = 0;
     for (dimension_t i = 0; i < DIM; ++i) {
         pos <<= 1;
         // set pos-bit if bit is set in value

diff --git a/include/phtree/common/flat_sparse_map.h b/include/phtree/common/flat_sparse_map.h
@@ -30,52 +30,46 @@
  */
 namespace improbable::phtree {
 
-namespace {
-template <typename T>
-using PhSparseMapPair = std::pair<size_t, T>;
-
-using index_t = std::int32_t;
-}  // namespace
-
 /*
  * The sparse_map is a flat map implementation that uses an array of *at* *most* SIZE=2^DIM.
  * The array contains a list sorted by key.
  *
  * It has O(log n) lookup and O(n) insertion/removal time complexity, space complexity is O(n).
  */
-template <typename T>
+template <typename KeyT, typename ValueT>
 class sparse_map {
+    using Entry = std::pair<KeyT, ValueT>;
+
   public:
     explicit sparse_map() : data_{} {
         data_.reserve(4);
     }
 
-    [[nodiscard]] auto find(size_t key) {
+    [[nodiscard]] auto find(KeyT key) {
         auto it = lower_bound(key);
         if (it != data_.end() && it->first == key) {
             return it;
         }
         return data_.end();
     }
 
-    [[nodiscard]] auto find(size_t key) const {
+    [[nodiscard]] auto find(KeyT key) const {
         auto it = lower_bound(key);
         if (it != data_.end() && it->first == key) {
             return it;
         }
         return data_.end();
     }
 
-    [[nodiscard]] auto lower_bound(size_t key) {
-        return std::lower_bound(
-            data_.begin(), data_.end(), key, [](PhSparseMapPair<T>& left, const size_t key) {
-                return left.first < key;
-            });
+    [[nodiscard]] auto lower_bound(KeyT key) {
+        return std::lower_bound(data_.begin(), data_.end(), key, [](Entry& left, const KeyT key) {
+            return left.first < key;
+        });
     }
 
-    [[nodiscard]] auto lower_bound(size_t key) const {
+    [[nodiscard]] auto lower_bound(KeyT key) const {
         return std::lower_bound(
-            data_.cbegin(), data_.cend(), key, [](const PhSparseMapPair<T>& left, const size_t key) {
+            data_.cbegin(), data_.cend(), key, [](const Entry& left, const KeyT key) {
                 return left.first < key;
             });
     }
@@ -110,14 +104,14 @@ class sparse_map {
         return try_emplace_base(key, std::forward<Args>(args)...);
     }
 
-    void erase(size_t key) {
+    void erase(KeyT key) {
         auto it = lower_bound(key);
         if (it != end() && it->first == key) {
             data_.erase(it);
         }
     }
 
-    void erase(const typename std::vector<PhSparseMapPair<T>>::iterator& iterator) {
+    void erase(const typename std::vector<Entry>::iterator& iterator) {
         data_.erase(iterator);
     }
 
@@ -127,7 +121,7 @@ class sparse_map {
 
   private:
     template <typename... Args>
-    auto emplace_base(size_t key, Args&&... args) {
+    auto emplace_base(KeyT key, Args&&... args) {
         auto it = lower_bound(key);
         if (it != end() && it->first == key) {
             return std::make_pair(it, false);
@@ -137,7 +131,7 @@ class sparse_map {
     }
 
     template <typename... Args>
-    auto try_emplace_base(size_t key, Args&&... args) {
+    auto try_emplace_base(KeyT key, Args&&... args) {
         auto it = lower_bound(key);
         if (it != end() && it->first == key) {
             return std::make_pair(it, false);
@@ -151,7 +145,7 @@ class sparse_map {
         }
     }
 
-    std::vector<PhSparseMapPair<T>> data_;
+    std::vector<Entry> data_;
 };
 
 }  // namespace improbable::phtree

diff --git a/include/phtree/v16/debug_helper_v16.h b/include/phtree/v16/debug_helper_v16.h
@@ -17,9 +17,9 @@
 #ifndef PHTREE_V16_DEBUG_HELPER_H
 #define PHTREE_V16_DEBUG_HELPER_H
 
+#include "node.h"
 #include "phtree/common/common.h"
 #include "phtree/common/debug_helper.h"
-#include "node.h"
 #include "phtree_v16.h"
 #include <string>
 
@@ -99,7 +99,7 @@ class DebugHelperV16 : public PhTreeDebugHelper::DebugHelper {
         bit_width_t current_depth,
         const EntryT& entry,
         const bit_width_t parent_postfix_len,
-        bool printValue) const {
+        bool print_value) const {
         std::string ind = "*";
         for (bit_width_t i = 0; i < current_depth; ++i) {
             ind += "-";
@@ -127,15 +127,15 @@ class DebugHelperV16 : public PhTreeDebugHelper::DebugHelper {
         // To clean previous postfixes.
         for (auto& it : node.Entries()) {
             const auto& child = it.second;
-            hc_pos_t hcPos = it.first;
+            auto hc_pos = it.first;
             if (child.IsNode()) {
-                sb << ind << "# " << hcPos << "  Node: " << std::endl;
-                ToStringTree(sb, current_depth + 1, child, postfix_len, printValue);
+                sb << ind << "# " << hc_pos << "  Node: " << std::endl;
+                ToStringTree(sb, current_depth + 1, child, postfix_len, print_value);
             } else {
                 // post-fix
                 sb << ind << ToBinary(child.GetKey());
-                sb << "  hcPos=" << hcPos;
-                if (printValue) {
+                sb << "  hcPos=" << hc_pos;
+                if (print_value) {
                     sb << "  v=" << (child.IsValue() ? "T" : "null");
                 }
                 sb << std::endl;

diff --git a/include/phtree/v16/entry.h b/include/phtree/v16/entry.h
@@ -125,6 +125,26 @@ class Entry {
         DestroyUnion();
     }
 
+    void SetNodeCenter() {
+        // The node center is defined as the prefix + a '1' bit after the prefix. The remaining
+        // bits, i.e. all post_len bits must be '0'.
+        // This is required for window queries which would otherwise need to calculate the
+        // center each time they traverse a node.
+        assert(union_type_ == NODE);
+        bit_mask_t<SCALAR> maskHcBit = bit_mask_t<SCALAR>(1) << postfix_len_;
+        bit_mask_t<SCALAR> maskVT = MAX_MASK<SCALAR> << postfix_len_;
+        // to prevent problems with signed long when using 64 bit
+        if (postfix_len_ < MAX_BIT_WIDTH<SCALAR> - 1) {
+            for (dimension_t i = 0; i < DIM; ++i) {
+                kd_key_[i] = (kd_key_[i] | maskHcBit) & maskVT;
+            }
+        } else {
+            for (dimension_t i = 0; i < DIM; ++i) {
+                kd_key_[i] = 0;
+            }
+        }
+    }
+
     [[nodiscard]] const KeyT& GetKey() const {
         return kd_key_;
     }
@@ -148,6 +168,7 @@ class Entry {
     }
 
     void SetKey(const KeyT& key) noexcept {
+        assert(union_type_ == VALUE);  // Do we have any other use?
         kd_key_ = key;
     }
 
@@ -157,6 +178,7 @@ class Entry {
         union_type_ = NODE;
         new (&node_) std::unique_ptr<NodeT>{std::move(node)};
         assert(!node);
+        SetNodeCenter();
     }
 
     [[nodiscard]] bit_width_t GetNodePostfixLen() const noexcept {
@@ -194,6 +216,9 @@ class Entry {
         union_type_ = EMPTY;
         *this = std::move(other);
         node.reset();
+        if (IsNode()) {
+            SetNodeCenter();
+        }
     }
 
   private:

diff --git a/include/phtree/v16/for_each_hc.h b/include/phtree/v16/for_each_hc.h
@@ -40,6 +40,7 @@ class ForEachHC {
     using KeyInternal = typename CONVERT::KeyInternal;
     using SCALAR = typename CONVERT::ScalarInternal;
     using EntryT = Entry<DIM, T, SCALAR>;
+    using hc_pos_t = hc_pos_dim_t<DIM>;
 
   public:
     template <typename CB, typename F>
@@ -64,7 +65,6 @@ class ForEachHC {
         auto postfix_len = entry.GetNodePostfixLen();
         auto end = entries.end();
         auto iter = opt_it != nullptr && *opt_it != end ? *opt_it : entries.lower_bound(mask_lower);
-        //auto iter = opt_it != nullptr ? *opt_it : entries.lower_bound(mask_lower);
         for (; iter != end && iter->first <= mask_upper; ++iter) {
             auto child_hc_pos = iter->first;
             // Use bit-mask magic to check whether we are in a valid quadrant.
@@ -94,18 +94,17 @@ class ForEachHC {
         // An infix with len=0 implies that at least part of the child node overlaps with the query,
         // otherwise the bit mask checking would have returned 'false'.
         // Putting it differently, if the infix has len=0, then there is no point in validating it.
+        bool mismatch = false;
         if (entry.HasNodeInfix(parent_postfix_len)) {
             // Mask for comparing the prefix with the query boundaries.
             assert(entry.GetNodePostfixLen() + 1 < MAX_BIT_WIDTH<SCALAR>);
             SCALAR comparison_mask = MAX_MASK<SCALAR> << (entry.GetNodePostfixLen() + 1);
             for (dimension_t dim = 0; dim < DIM; ++dim) {
                 SCALAR prefix = key[dim] & comparison_mask;
-                if (prefix > range_max_[dim] || prefix < (range_min_[dim] & comparison_mask)) {
-                    return false;
-                }
+                mismatch |= (prefix > range_max_[dim] || prefix < (range_min_[dim] & comparison_mask));
             }
         }
-        return filter_.IsNodeValid(key, entry.GetNodePostfixLen() + 1);
+        return mismatch ? false : filter_.IsNodeValid(key, entry.GetNodePostfixLen() + 1);
     }
 
     void CalcLimits(
@@ -127,23 +126,17 @@ class ForEachHC {
         // query higher ||                                       NO                  YES
         //
         assert(postfix_len < MAX_BIT_WIDTH<SCALAR>);
-        bit_mask_t<SCALAR> maskHcBit = bit_mask_t<SCALAR>(1) << postfix_len;
-        bit_mask_t<SCALAR> maskVT = MAX_MASK<SCALAR> << postfix_len;
-        constexpr hc_pos_t ONE = 1;
         // to prevent problems with signed long when using 64 bit
         if (postfix_len < MAX_BIT_WIDTH<SCALAR> - 1) {
             for (dimension_t i = 0; i < DIM; ++i) {
                 lower_limit <<= 1;
+                //==> set to 1 if lower value should not be queried
+                lower_limit |= range_min_[i] >= prefix[i];
+            }
+            for (dimension_t i = 0; i < DIM; ++i) {
                 upper_limit <<= 1;
-                SCALAR nodeBisection = (prefix[i] | maskHcBit) & maskVT;
-                if (range_min_[i] >= nodeBisection) {
-                    //==> set to 1 if lower value should not be queried
-                    lower_limit |= ONE;
-                }
-                if (range_max_[i] >= nodeBisection) {
-                    // Leave 0 if higher value should not be queried.
-                    upper_limit |= ONE;
-                }
+                // Leave 0 if higher value should not be queried.
+                upper_limit |= range_max_[i] >= prefix[i];
             }
         } else {
             // special treatment for signed longs
@@ -152,20 +145,18 @@ class ForEachHC {
             // The hypercube assumes that a leading '0' indicates a lower value.
             // Solution: We leave HC as it is.
             for (dimension_t i = 0; i < DIM; ++i) {
-                lower_limit <<= 1;
                 upper_limit <<= 1;
-                if (range_min_[i] < 0) {
-                    // If minimum is positive, we don't need the search negative values
-                    //==> set upper_limit to 0, prevent searching values starting with '1'.
-                    upper_limit |= ONE;
-                }
-                if (range_max_[i] < 0) {
-                    // Leave 0 if higher value should not be queried
-                    // If maximum is negative, we do not need to search positive values
-                    //(starting with '0').
-                    //--> lower_limit = '1'
-                    lower_limit |= ONE;
-                }
+                // If minimum is positive, we don't need the search negative values
+                //==> set upper_limit to 0, prevent searching values starting with '1'.
+                upper_limit |= range_min_[i] < 0;
+            }
+            for (dimension_t i = 0; i < DIM; ++i) {
+                lower_limit <<= 1;
+                // Leave 0 if higher value should not be queried
+                // If maximum is negative, we do not need to search positive values
+                //(starting with '0').
+                //--> lower_limit = '1'
+                lower_limit |= range_max_[i] < 0;
             }
         }
     }