diff --git a/.bazelrc b/.bazelrc
index e0af0f7d..0b4fe182 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -26,6 +26,8 @@ build:ci --announce_rc
 
 #build:linux --copt="-O1"
 #build:linux --copt="-march=skylake"
+#build:linux --copt="-march=haswell"
+#build:linux --copt="-march=native"
 build:linux --copt="-fvisibility=hidden"
 build:linux --copt="-fno-omit-frame-pointer"  # for friendlier stack traces
 build:linux --copt="-Wno-error"
@@ -34,6 +36,8 @@ build:linux --copt="-Wextra"
 build:linux --copt="-Werror=return-type"
 build:linux --copt="-Werror=switch"
 build:linux --copt="-mavx"
+# Intended to enable fast CTZ/TZCNT (count trailing zeros). "-march=haswell" also enables this flag, among others.
+build:linux --copt="-mbmi2"
 build:linux --copt="-Wsequence-point"
 build:linux --copt="-Wsign-compare"
 build:linux --cxxopt="-std=c++17"
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e7ea9ffd..ceb6d45a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Added B+tree multimap for internal (future) use. [#93](https://github.com/tzaeschke/phtree-cpp/issues/93)
 
 ### Changed
+- Added missing compiler flag for TZCNT/CTZ (count trailing zeros). This should be much faster on Haswell or later CPUs.
+  [#103](https://github.com/tzaeschke/phtree-cpp/issues/103),
 - Rewrote relocate(). This should be much cleaner now and slightly faster. 
   [#98](https://github.com/tzaeschke/phtree-cpp/pull/98), 
   [#99](https://github.com/tzaeschke/phtree-cpp/pull/99),
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ac6c17af..55413157 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -79,9 +79,11 @@ if (MSVC)
 else ()
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Werror")
     if (PHTREE_BUILD_BENCHMARKS)
-        set(CMAKE_CXX_FLAGS_RELEASE "-O3 -mavx -pthread")
+        # Enable vectorization and TZCNT/CTZ
+        set(CMAKE_CXX_FLAGS_RELEASE "-O3 -mavx -mbmi2 -pthread")
     else ()
-        set(CMAKE_CXX_FLAGS_RELEASE "-O3 -mavx")
+        # Enable vectorization and TZCNT/CTZ
+        set(CMAKE_CXX_FLAGS_RELEASE "-O3 -mavx -mbmi2 ")
     endif ()
     if (PHTREE_CODE_COVERAGE)
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --coverage") # -Wa,-mbig-obj")
diff --git a/README.md b/README.md
index 4f29309d..f2b1d329 100644
--- a/README.md
+++ b/README.md
@@ -476,9 +476,12 @@ heavily on the actual dataset, usage patterns, hardware, ... .
 
 There are numerous ways to improve performance. The following list gives an overview over the possibilities.
 
-1) **Use `for_each` instead of iterators**. This should improve performance of queries by 10%-20%.
+1) **Use `-O3 -mavx -mbmi2` compiler flags**. Ensure that vectorization and count trailing zeros (CTZ/TZCNT) are
+   enabled.
 
-2) **Use `relocate()` / `relocate_if()` if possible**. When updating the position of an entry, the naive way is 
+2) **Use `for_each` instead of iterators**. This should improve performance of queries by 10%-20%.
+
+3) **Use `relocate()` / `relocate_if()` if possible**. When updating the position of an entry, the naive way is 
    to use `erase()` / `emplace()`. With `relocate` / `relocate_if()`, insertion can avoid a lot of duplicate 
    navigation in the tree if the new coordinate is close to the old coordinate.
    ```c++
@@ -490,19 +493,19 @@ There are numerous ways to improve performance. The following list gives an over
    relocate(old_position, new_position, value);
    ```
 
-3) **Store pointers instead of large data objects**. For example, use `PhTree<3, MyLargeClass*>` instead of
+4) **Store pointers instead of large data objects**. For example, use `PhTree<3, MyLargeClass*>` instead of
    `PhTree<3, MyLargeClass>` if `MyLargeClass` is large.
     * This prevents the PH-Tree from storing the values inside the tree. This should improve cache-locality and thus
       performance when operating on the tree.
     * Using pointers is also useful if construction/destruction of values is expensive. The reason is that the tree has
       to construct and destruct objects internally. This may be avoidable but is currently still happening.
 
-4) **Use non-box query shapes**. Depending on the use case it may be more suitable to use a custom filter for queries.
+5) **Use non-box query shapes**. Depending on the use case it may be more suitable to use a custom filter for queries.
    For example:
 
    `tree.for_each(callback, FilterSphere(center, radius, tree.converter()));`
 
-5) **Use a different data converter**. The default converter of the PH-Tree results in a reasonably fast index. Its
+6) **Use a different data converter**. The default converter of the PH-Tree results in a reasonably fast index. Its
    biggest advantage is that it provides lossless conversion from floating point coordinates to PH-Tree coordinates
    (integers) and back to floating point coordinates.
     * The `ConverterMultiply` is a lossy converter but it tends to improve performance by 10% or more. This is not
@@ -511,16 +514,16 @@ There are numerous ways to improve performance. The following list gives an over
 
       `PhTreeD<DIM, T, ConverterMultiply<3, 100 * 1000, 1>>()`
 
-6) **Use custom key types**. By default, the PH-Tree accepts only coordinates in the form of its own key types, such
+7) **Use custom key types**. By default, the PH-Tree accepts only coordinates in the form of its own key types, such
    as `PhPointD`, `PhBoxF` or similar. To avoid conversion from custom types to PH-Tree key types, custom classes can
    often be adapted to be accepted directly by the PH-Tree without conversion. This requires implementing a custom
    converter as described in the section about [Custom Key Types](#custom-key-types).
 
-7) Advanced: **Adapt internal Node representation**. Depending on the dimensionality `DIM`, the PH-Tree uses internally
-   in
-   `Nodes` different container types to hold entries. By default, it uses an array for `DIM<=3`, a vector for `DIM<=8`
-   and an ordered map for `DIM>8`. Adapting these thresholds can have strong effects on performance as well as memory
-   usage. One example: Changing the threshold to use vector for `DIM==3` reduced performance of the `update_d` benchmark
+8) Advanced: **Adapt internal Node representation**. Depending on the dimensionality `DIM`, the PH-Tree internally
+   uses different container types in its `Nodes` to hold entries.
+   By default, it uses an array for `DIM<=3`, a vector for `DIM<=8` and an ordered map for `DIM>8`. 
+   Adapting these thresholds can have strong effects on performance as well as memory usage. 
+   One example: Changing the threshold to use vector for `DIM==3` reduced performance of the `update_d` benchmark
    by 40%-50% but improved performance of `query_d` by 15%-20%. The threshold is currently hardcoded.     
    The effects are not always easy to predict but here are some guidelines:
     * "array" is the fastest solution for insert/update/remove type operations. Query performance is "ok". Memory
@@ -547,9 +550,10 @@ PH-Tree can be built with [Bazel](https://bazel.build) (primary build system) or
 All code is written in C++ targeting the C++17 standard. 
 The code has been verified to compile on Linux with Clang 11 and GCC 9, and on Windows with Visual Studio 2019
 (except benchmarks, which don't work with VS).
-The PH-tree makes use of vectorization, so suggested compilation options for clang/gcc are:
+The PH-tree makes use of vectorization and CountTrailingZeros/CTZ/TZCNT, so suggested compilation options for 
+clang/gcc are:
 ```
--O3 -mavx
+-O3 -mavx -mbmi2
 ```