From fba361a86366d78c3db9f63e177a52218dcdcd6f Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 9 Jan 2020 15:54:49 +0100 Subject: [PATCH 1/8] add searching kernels and tests --- common/components/searching.hpp.inc | 224 +++++++++++++++++++++++ cuda/components/searching.cuh | 55 ++++++ cuda/test/components/CMakeLists.txt | 1 + cuda/test/components/searching.cu | 244 +++++++++++++++++++++++++ hip/components/searching.hip.hpp | 55 ++++++ hip/test/components/CMakeLists.txt | 1 + hip/test/components/searching.hip.cpp | 249 ++++++++++++++++++++++++++ 7 files changed, 829 insertions(+) create mode 100644 common/components/searching.hpp.inc create mode 100644 cuda/components/searching.cuh create mode 100644 cuda/test/components/searching.cu create mode 100644 hip/components/searching.hip.hpp create mode 100644 hip/test/components/searching.hip.cpp diff --git a/common/components/searching.hpp.inc b/common/components/searching.hpp.inc new file mode 100644 index 00000000000..222ab85de33 --- /dev/null +++ b/common/components/searching.hpp.inc @@ -0,0 +1,224 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +/** + * @internal + * Generic binary search that finds the first index where a predicate is true. + * It assumes that the predicate partitions the range [offset, offset + length) + * into two subranges [offset, middle), [middle, offset + length) such that + * the predicate is `false` for all elements in the first range and `true` for + * all elements in the second range. `middle` is called the partition point. + * The implementation is based on Stepanov & McJones, "Elements of Programming". + * + * @param offset the starting index of the partitioned range + * @param length the length of the partitioned range + * @param p the predicate to be evaluated on the range - it should not have + * side-effects and map from `IndexType` to `bool` + * @returns the index of `middle`, i.e., the partition point + */ +template +__device__ IndexType binary_search(IndexType offset, IndexType length, + Predicate p) +{ + while (length > 0) { + auto half_length = length / 2; + auto mid = offset + half_length; + auto pred = p(mid); + length = pred ? half_length : length - (half_length + 1); + offset = pred ? 
offset : mid + 1; + } + return offset; +} + + +/** + * @internal + * Generic implementation of a fixed-size binary search. + * The implementation makes sure that the number of predicate evaluations only + * depends on `length` and not on the actual position of the partition point. + * It assumes that the predicate partitions the range [offset, offset + length) + * into two subranges [offset, middle), [middle, offset + length) such that + * the predicate is `false` for all elements in the first range and `true` for + * all elements in the second range. `middle` is called the partition point. + * + * @tparam size the length of the partitioned range - must be a power of two + * @param p the predicate to be evaluated on the range - it should not have + * side-effects and map from `int` to `bool` + * @returns the index of `middle`, i.e., the partition point + */ +template +__device__ int synchronous_fixed_binary_search(Predicate p) +{ + if (size == 0) { + return 0; + } + auto begin = 0; + static_assert(size > 0, "size must be positive"); + static_assert(!(size & (size - 1)), "size must be a power of two"); +#pragma unroll + for (auto cur_size = size; cur_size > 1; cur_size /= 2) { + auto half_size = cur_size / 2; + auto mid = begin + half_size; + // invariant: [begin, begin + cur_size] contains partition point + begin = p(mid) ? begin : mid; + } + // cur_size is now 1, so the partition point is either begin or begin + 1 + return p(begin) ? begin : begin + 1; +} + + +/** + * @internal + * Generic implementation of a synchronous binary search. + * The implementation makes sure that the number of predicate evaluations only + * depends on `length` and not on the actual position of the partition point. + * It assumes that the predicate partitions the range [offset, offset + length) + * into two subranges [offset, middle), [middle, offset + length) such that + * the predicate is `false` for all elements in the first range and `true` for + * all elements in the second range. 
`middle` is called the partition point. + * + * @param size the length of the partitioned range - must be a power of two + * @param p the predicate to be evaluated on the range - it should not have + * side-effects and map from `int` to `bool` + * @returns the index of `middle`, i.e., the partition point + */ +template +__device__ int synchronous_binary_search(int size, Predicate p) +{ + if (size == 0) { + return 0; + } + auto begin = 0; + for (auto cur_size = size; cur_size > 1; cur_size /= 2) { + auto half_size = cur_size / 2; + auto mid = begin + half_size; + // invariant: [begin, begin + cur_size] contains partition point + begin = p(mid) ? begin : mid; + } + // cur_size is now 1, so the partition point is either begin or begin + 1 + return p(begin) ? begin : begin + 1; +} + + +/** + * @internal + * Generic search that finds the first index where a predicate is true. + * It assumes that the predicate partitions the range [offset, offset + length) + * into two subranges [offset, middle), [middle, offset + length) such that + * the predicate is `false` for all elements in the first range and `true` for + * all elements in the second range. `middle` is called the partition point. + * + * It executes `log2(length / group.size())` coalescing calls to `p`. 
+ * + * @param offset the starting index of the partitioned range + * @param length the length of the partitioned range + * @param group the coalescing group executing the search + * @param p the predicate to be evaluated on the range - it should not have + * side-effects and map from `IndexType` to `bool` + * @returns the index of `middle`, i.e., the partition point + */ +template +__device__ IndexType group_wide_search(IndexType offset, IndexType length, + Group group, Predicate p) +{ + // binary search on the group-sized blocks + IndexType num_blocks = (length + group.size() - 1) / group.size(); + auto group_pos = binary_search(IndexType{}, num_blocks, [&](IndexType i) { + auto idx = i * group.size(); + return p(offset + idx); + }); + // case 1: p is true everywhere: middle is at the beginning + if (group_pos == 0) { + return offset; + } + /* + * case 2: p is false somewhere: + * + * p(group_pos * g.size()) is true, so either this is the partition point, + * or the partition point is one of the g.size() - 1 previous indices. + * |block group_pos-1| + * 0 | 0 * * * * * * * | 1 + * ^ ^ + * we load this range, with the 1 acting as a sentinel for ffs(...) + * + * additionally, this means that we can't call p out-of-bounds + */ + auto base_idx = (group_pos - 1) * group.size() + 1; + auto idx = base_idx + group.thread_rank(); + auto pos = ffs(group.ballot(idx >= length || p(offset + idx))) - 1; + return offset + base_idx + pos; +} + + +/** + * @internal + * Generic search that finds the first index where a predicate is true. + * It assumes that the predicate partitions the range [offset, offset + length) + * into two subranges [offset, middle), [middle, offset + length) such that + * the predicate is `false` for all elements in the first range and `true` for + * all elements in the second range. `middle` is called the partition point. + * + * It executes `log2(length) / log2(group.size())` calls to `p` that effectively + * follow a random-access pattern. 
+ * + * @param offset the starting index of the partitioned range + * @param length the length of the partitioned range + * @param group the coalescing group executing the search + * @param p the predicate to be evaluated on the range - it should not have + * side-effects and map from `IndexType` to `bool` + * @returns the index of `middle`, i.e., the partition point + */ +template +__device__ IndexType group_ary_search(IndexType offset, IndexType length, + Group group, Predicate p) +{ + IndexType end = offset + length; + // invariant: [offset, offset + length] contains middle + while (length > group.size()) { + auto stride = length / group.size(); + auto idx = offset + group.thread_rank() * stride; + auto mask = group.ballot(p(idx)); + // if the mask is 0, the partition point is in the last block + // if the mask is ~0, the partition point is in the first block + // otherwise, we go to the last block that returned a 0. + auto pos = mask == 0 ? group.size() - 1 : ffs(mask >> 1) - 1; + auto last_length = length - stride * (group.size() - 1); + length = pos == group.size() - 1 ? last_length : stride; + offset += stride * pos; + } + auto idx = offset + group.thread_rank(); + // if the mask is 0, the partition point is at the end + // otherwise it is the first set bit + auto mask = group.ballot(idx >= end || p(idx)); + auto pos = mask == 0 ? group.size() : ffs(mask) - 1; + return offset + pos; +} \ No newline at end of file diff --git a/cuda/components/searching.cuh b/cuda/components/searching.cuh new file mode 100644 index 00000000000..4ebeceba720 --- /dev/null +++ b/cuda/components/searching.cuh @@ -0,0 +1,55 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_CUDA_COMPONENTS_SEARCHING_CUH_ +#define GKO_CUDA_COMPONENTS_SEARCHING_CUH_ + + +#include "cuda/base/config.hpp" +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/intrinsics.cuh" + + +namespace gko { +namespace kernels { +namespace cuda { + + +#include "common/components/searching.hpp.inc" + + +} // namespace cuda +} // namespace kernels +} // namespace gko + + +#endif // GKO_CUDA_COMPONENTS_SEARCHING_CUH_ diff --git a/cuda/test/components/CMakeLists.txt b/cuda/test/components/CMakeLists.txt index a4110055031..da1781d80db 100644 --- a/cuda/test/components/CMakeLists.txt +++ b/cuda/test/components/CMakeLists.txt @@ -1,3 +1,4 @@ ginkgo_create_cuda_test(cooperative_groups) ginkgo_create_cuda_test(prefix_sum) +ginkgo_create_cuda_test(searching) ginkgo_create_cuda_test(sorting) \ No newline at end of file diff --git a/cuda/test/components/searching.cu b/cuda/test/components/searching.cu new file mode 100644 index 00000000000..da9f89f48ee --- /dev/null +++ b/cuda/test/components/searching.cu @@ -0,0 +1,244 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "cuda/components/searching.cuh" + + +#include +#include + + +#include + + +#include +#include + + +#include "cuda/components/cooperative_groups.cuh" + + +namespace { + + +using namespace gko::kernels::cuda; +using cooperative_groups::this_thread_block; + + +class Searching : public ::testing::Test { +protected: + Searching() + : ref(gko::ReferenceExecutor::create()), + cuda(gko::CudaExecutor::create(0, ref)), + result(ref, 1), + dresult(cuda), + sizes(14203) + { + std::iota(sizes.begin(), sizes.end(), 0); + } + + template + void run_test(Kernel kernel, int offset, int size, unsigned num_blocks = 1) + { + *result.get_data() = true; + dresult = result; + kernel<<>>(dresult.get_data(), offset, + size); + result = dresult; + auto success = *result.get_const_data(); + + ASSERT_TRUE(success); + } + + std::shared_ptr ref; + std::shared_ptr cuda; + gko::Array result; + gko::Array dresult; + std::vector sizes; +}; + + +__device__ void test_assert(bool *success, bool predicate) +{ + if (!predicate) { + *success = false; + } +} + + +__global__ void test_binary_search(bool *success, int offset, int size) +{ + // test binary search on [0, size) + // 
for all possible partition points + auto result = binary_search(offset, size, [&](int i) { + // don't access out-of-bounds! + test_assert(success, i >= offset && i < offset + size); + return i >= threadIdx.x + offset; + }); + auto result2 = binary_search(offset, size, [&](int i) { + // don't access out-of-bounds! + test_assert(success, i >= offset && i < offset + size); + return i >= threadIdx.x + offset + 1; + }); + test_assert(success, result == threadIdx.x + offset); + test_assert(success, result2 == threadIdx.x + offset + 1); +} + +TEST_F(Searching, BinaryNoOffset) +{ + run_test(test_binary_search, 0, config::warp_size); +} + +TEST_F(Searching, BinaryOffset) +{ + run_test(test_binary_search, 5, config::warp_size); +} + + +__global__ void test_empty_binary_search(bool *success, int offset, int) +{ + auto result = binary_search(offset, 0, [&](int i) { + // don't access out-of-bounds! + test_assert(success, false); + return false; + }); + test_assert(success, result == offset); +} + +TEST_F(Searching, BinaryEmptyNoOffset) +{ + run_test(test_empty_binary_search, 0, 0); +} + +TEST_F(Searching, BinaryEmptyOffset) +{ + run_test(test_empty_binary_search, 5, 0); +} + + +__global__ void test_sync_binary_search(bool *success, int, int size) +{ + // test binary search on [0, warp_size) + // for all possible partition points + auto result = synchronous_binary_search(size, [&](int i) { + // don't access out-of-bounds! + test_assert(success, i >= 0 && i < size); + return i >= threadIdx.x; + }); + auto result2 = synchronous_binary_search(size, [&](int i) { + // don't access out-of-bounds! 
+ test_assert(success, i >= 0 && i < size); + return i >= threadIdx.x + 1; + }); + test_assert(success, result == threadIdx.x); + test_assert(success, result2 == threadIdx.x + 1); +} + +TEST_F(Searching, SyncBinary) +{ + run_test(test_sync_binary_search, 0, config::warp_size); +} + + +__global__ void test_empty_sync_binary_search(bool *success, int, int) +{ + auto result = synchronous_binary_search(0, [&](int i) { + // don't access out-of-bounds! + test_assert(success, false); + return false; + }); + test_assert(success, result == 0); +} + +TEST_F(Searching, EmptySyncBinary) +{ + run_test(test_empty_sync_binary_search, 0, config::warp_size); +} + + +__global__ void test_warp_ary_search(bool *success, int offset, int size) +{ + // test binary search on [0, length) + // for all possible partition points + auto warp = group::tiled_partition(this_thread_block()); + auto result = group_ary_search(offset, size, warp, [&](int i) { + // don't access out-of-bounds! + test_assert(success, i >= offset && i < offset + size); + return i >= blockIdx.x + offset; + }); + test_assert(success, result == blockIdx.x + offset); +} + +TEST_F(Searching, WarpAryNoOffset) +{ + for (auto size : sizes) { + run_test(test_warp_ary_search, 0, size, size + 1); + } +} + +TEST_F(Searching, WarpAryOffset) +{ + for (auto size : sizes) { + run_test(test_warp_ary_search, 134, size, size + 1); + } +} + + +__global__ void test_warp_wide_search(bool *success, int offset, int size) +{ + // test binary search on [0, length) + // for all possible partition points + auto warp = group::tiled_partition(this_thread_block()); + auto result = group_wide_search(offset, size, warp, [&](int i) { + // don't access out-of-bounds! 
+ test_assert(success, i >= offset && i < offset + size); + return i >= blockIdx.x + offset; + }); + test_assert(success, result == blockIdx.x + offset); +} + +TEST_F(Searching, WarpWideNoOffset) +{ + for (auto size : sizes) { + run_test(test_warp_wide_search, 0, size, size + 1); + } +} + +TEST_F(Searching, WarpWideOffset) +{ + for (auto size : sizes) { + run_test(test_warp_wide_search, 142, size, size + 1); + } +} + + +} // namespace diff --git a/hip/components/searching.hip.hpp b/hip/components/searching.hip.hpp new file mode 100644 index 00000000000..8211121afa4 --- /dev/null +++ b/hip/components/searching.hip.hpp @@ -0,0 +1,55 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+#ifndef GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_
+#define GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_
+
+
+#include "hip/base/config.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/intrinsics.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+
+
+#include "common/components/searching.hpp.inc"
+
+
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_
diff --git a/hip/test/components/CMakeLists.txt b/hip/test/components/CMakeLists.txt
index 2783af3fbbc..c0fbe34f6aa 100644
--- a/hip/test/components/CMakeLists.txt
+++ b/hip/test/components/CMakeLists.txt
@@ -1,3 +1,4 @@
 ginkgo_create_hip_test(cooperative_groups)
 ginkgo_create_hip_test(prefix_sum)
+ginkgo_create_hip_test(searching)
 ginkgo_create_hip_test(sorting)
\ No newline at end of file
diff --git a/hip/test/components/searching.hip.cpp b/hip/test/components/searching.hip.cpp
new file mode 100644
index 00000000000..53cf4fc6b53
--- /dev/null
+++ b/hip/test/components/searching.hip.cpp
@@ -0,0 +1,249 @@
+/*************************************************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +// TODO remove when the HIP includes are fixed +#include + + +#include "hip/components/searching.hip.hpp" + + +#include +#include + + +#include + + +#include +#include + + +#include "hip/components/cooperative_groups.hip.hpp" + + +namespace { + + +using namespace gko::kernels::hip; +using namespace gko::kernels::hip::group; + + +class Searching : public ::testing::Test { +protected: + Searching() + : ref(gko::ReferenceExecutor::create()), + hip(gko::HipExecutor::create(0, ref)), + result(ref, 1), + dresult(hip), + sizes(14203) + { + std::iota(sizes.begin(), sizes.end(), 0); + } + + template + void run_test(Kernel kernel, int offset, int size, unsigned num_blocks = 1) + { + *result.get_data() = true; + dresult = result; + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(num_blocks), + dim3(config::warp_size), 0, 0, dresult.get_data(), + offset, size); + result = dresult; + auto success = *result.get_const_data(); + + ASSERT_TRUE(success); + } + + std::shared_ptr ref; + std::shared_ptr hip; + gko::Array result; + gko::Array dresult; + std::vector sizes; +}; + + +__device__ void test_assert(bool *success, bool predicate) +{ + if (!predicate) { + *success = false; + } +} + + +__global__ void test_binary_search(bool *success, int offset, int size) +{ + // test binary search on [0, size) + // for all possible partition points + auto result = binary_search(offset, size, [&](int i) { + // don't access out-of-bounds! + test_assert(success, i >= offset && i < offset + size); + return i >= threadIdx.x + offset; + }); + auto result2 = binary_search(offset, size, [&](int i) { + // don't access out-of-bounds! 
+ test_assert(success, i >= offset && i < offset + size); + return i >= threadIdx.x + offset + 1; + }); + test_assert(success, result == threadIdx.x + offset); + test_assert(success, result2 == threadIdx.x + offset + 1); +} + +TEST_F(Searching, BinaryNoOffset) +{ + run_test(test_binary_search, 0, config::warp_size); +} + +TEST_F(Searching, BinaryOffset) +{ + run_test(test_binary_search, 5, config::warp_size); +} + + +__global__ void test_empty_binary_search(bool *success, int offset, int) +{ + auto result = binary_search(offset, 0, [&](int i) { + // don't access out-of-bounds! + test_assert(success, false); + return false; + }); + test_assert(success, result == offset); +} + +TEST_F(Searching, BinaryEmptyNoOffset) +{ + run_test(test_empty_binary_search, 0, 0); +} + +TEST_F(Searching, BinaryEmptyOffset) +{ + run_test(test_empty_binary_search, 5, 0); +} + + +__global__ void test_sync_binary_search(bool *success, int, int size) +{ + // test binary search on [0, warp_size) + // for all possible partition points + auto result = synchronous_binary_search(size, [&](int i) { + // don't access out-of-bounds! + test_assert(success, i >= 0 && i < size); + return i >= threadIdx.x; + }); + auto result2 = synchronous_binary_search(size, [&](int i) { + // don't access out-of-bounds! + test_assert(success, i >= 0 && i < size); + return i >= threadIdx.x + 1; + }); + test_assert(success, result == threadIdx.x); + test_assert(success, result2 == threadIdx.x + 1); +} + +TEST_F(Searching, SyncBinary) +{ + run_test(test_sync_binary_search, 0, config::warp_size); +} + + +__global__ void test_empty_sync_binary_search(bool *success, int, int) +{ + auto result = synchronous_binary_search(0, [&](int i) { + // don't access out-of-bounds! 
+ test_assert(success, false); + return false; + }); + test_assert(success, result == 0); +} + +TEST_F(Searching, EmptySyncBinary) +{ + run_test(test_empty_sync_binary_search, 0, config::warp_size); +} + + +__global__ void test_warp_ary_search(bool *success, int offset, int size) +{ + // test binary search on [0, length) + // for all possible partition points + auto warp = tiled_partition(this_thread_block()); + auto result = group_ary_search(offset, size, warp, [&](int i) { + // don't access out-of-bounds! + test_assert(success, i >= offset && i < offset + size); + return i >= blockIdx.x + offset; + }); + test_assert(success, result == blockIdx.x + offset); +} + +TEST_F(Searching, WarpAryNoOffset) +{ + for (auto size : sizes) { + run_test(test_warp_ary_search, 0, size, size + 1); + } +} + +TEST_F(Searching, WarpAryOffset) +{ + for (auto size : sizes) { + run_test(test_warp_ary_search, 134, size, size + 1); + } +} + + +__global__ void test_warp_wide_search(bool *success, int offset, int size) +{ + // test binary search on [0, length) + // for all possible partition points + auto warp = tiled_partition(this_thread_block()); + auto result = group_wide_search(offset, size, warp, [&](int i) { + // don't access out-of-bounds! 
+ test_assert(success, i >= offset && i < offset + size); + return i >= blockIdx.x + offset; + }); + test_assert(success, result == blockIdx.x + offset); +} + +TEST_F(Searching, WarpWideNoOffset) +{ + for (auto size : sizes) { + run_test(test_warp_wide_search, 0, size, size + 1); + } +} + +TEST_F(Searching, WarpWideOffset) +{ + for (auto size : sizes) { + run_test(test_warp_wide_search, 142, size, size + 1); + } +} + + +} // namespace From 94bc786080d92cf8e8fffbc4ebd64c13e96f9d93 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 19 Dec 2019 08:45:57 +0100 Subject: [PATCH 2/8] add merging algorithms and tests --- common/components/merging.hpp.inc | 373 ++++++++++++++++++++++++++++ cuda/components/merging.cuh | 54 ++++ cuda/test/components/CMakeLists.txt | 1 + cuda/test/components/merging.cu | 257 +++++++++++++++++++ hip/components/merging.hip.hpp | 54 ++++ hip/test/components/CMakeLists.txt | 1 + hip/test/components/merging.hip.cpp | 268 ++++++++++++++++++++ 7 files changed, 1008 insertions(+) create mode 100644 common/components/merging.hpp.inc create mode 100644 cuda/components/merging.cuh create mode 100644 cuda/test/components/merging.cu create mode 100644 hip/components/merging.hip.hpp create mode 100644 hip/test/components/merging.hip.cpp diff --git a/common/components/merging.hpp.inc b/common/components/merging.hpp.inc new file mode 100644 index 00000000000..59a9b84a245 --- /dev/null +++ b/common/components/merging.hpp.inc @@ -0,0 +1,373 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +namespace detail { + + +/** + * @internal + * The result from the @ref parallel_merge_step function. + */ +template +struct merge_result { + /** The element of a being merged in the current thread. */ + ValueType a_val; + /** The element of b being merged in the current thread. */ + ValueType b_val; + /** The index from a that is being merged in the current thread. */ + int a_idx; + /** The index from b that is being merged in the current thread. */ + int b_idx; + /** The number of elements from a that have been merged in total. */ + int a_advance; + /** The number of elements from b that have been merged in total. 
*/ + int b_advance; +}; + + +template +__device__ ValueType +checked_load(const ValueType *p, IndexType i, IndexType size, + ValueType sentinel = device_numeric_limits::max) +{ + return i < size ? p[i] : sentinel; +} + + +} // namespace detail + + +/** + * @internal + * Warp-parallel merge algorithm that merges the first `warp_size` elements from + * two ranges, where each warp stores a single element from each range. + * It assumes that the elements are sorted in ascending order, i.e. for i < j, + * the value of `a` at thread i is smaller or equal the value at thread j, and + * the same holds for `b`. + * + * @param a the element from the first range + * @param b the element from the second range + * @param size the number of elements in the output range + * @param group the cooperative group that executes the merge + * @return a structure containing the merge result distributed over the group. + */ +template +__device__ detail::merge_result group_merge_step(ValueType a, + ValueType b, + int size, + Group group) +{ + // round up to the next power of two + auto size_pow2 = 1 << (32 - clz(uint32(size - 1))); + // thread i takes care of ith element of the merged sequence + auto i = int(group.thread_rank()); + + // we want to find the smallest index `x` such that a[x] >= b[i - x - 1] + // or `i` if no such index exists + // + // if x = i then c[0...i - 1] = a[0...i - 1] + // => merge a[i] with b[0] + // if x = 0 then c[0...i - 1] = b[0...i - 1] + // => merge a[0] with b[i] + // otherwise c[0...i - 1] contains a[0...x - 1] and b[0...i - x - 1] + // because the minimality of `x` implies + // b[i - x] >= a[x - 1] + // and a[x] >= a[0...x - 1], b[0...i - x - 1] + // => merge a[x] with b[i - x] + auto minx = synchronous_binary_search(size_pow2, [&](int x) { + // potentially faster (because of unrolling): + // auto minx = synchronous_fixed_binary_search([&](int x){ + auto a_remote = group.shfl(a, x); + auto b_remote = group.shfl(b, max(i - x - 1, 0)); + return a_remote >= 
b_remote || x >= i; + }); + + auto a_idx = minx; + auto b_idx = max(i - minx, 0); + auto a_val = group.shfl(a, a_idx); + auto b_val = group.shfl(b, b_idx); + auto cmp = a_val < b_val; + auto a_advance = int(popcnt(group.ballot(cmp))); + auto b_advance = int(group.size()) - a_advance; + + return {a_val, b_val, a_idx, b_idx, a_advance, b_advance}; +} + + +/** + * @internal + * Warp-parallel merge algorithm that merges two sorted ranges of arbitrary + * size. `merge_fun` will be called for each merged element. + * + * @param a the first range + * @param a_size the size of the first range + * @param b the second range + * @param b_size the size of the second range + * @param group the group that executes the merge + * @param merge_fn the callback that is being called for each merged element. + * It takes five parameters: + * `IndexType a_idx, ValueType a_val, + * IndexType b_idx, ValueType b_val, IndexType c_index`. + * `*_val` and `*_idx` are the values resp. the indices of the + * values from a/b being compared at output index `c_index`. 
+ */ +template +__device__ void group_merge(const ValueType *a, IndexType a_size, + const ValueType *b, IndexType b_size, Group group, + Callback merge_fn) +{ + auto c_size = a_size + b_size; + IndexType a_begin{}; + IndexType b_begin{}; + IndexType c_begin{}; + auto lane = IndexType(group.thread_rank()); + auto a_cur = detail::checked_load(a, a_begin + lane, a_size); + auto b_cur = detail::checked_load(b, b_begin + lane, b_size); + while (c_begin < c_size) { + auto merge_size = min(group.size(), c_size - c_begin); + auto merge_result = group_merge_step(a_cur, b_cur, merge_size, group); + if (c_begin + lane < c_size) { + merge_fn(merge_result.a_idx, merge_result.a_val, merge_result.b_idx, + merge_result.b_val, c_begin + lane); + } + auto a_advance = merge_result.a_advance; + auto b_advance = merge_result.b_advance; + a_begin += a_advance; + b_begin += b_advance; + c_begin += group.size(); + + // shuffle the unmerged elements to the front + a_cur = group.shfl_down(a_cur, a_advance); + b_cur = group.shfl_down(b_cur, b_advance); + /* + * To optimize memory access, we load the new elements for `a` and `b` + * with a single load instruction: + * the lower part of the group loads new elements for `a` + * the upper part of the group loads new elements for `b` + * `load_lane` is the part-local lane idx + * The elements for `a` have to be shuffled up afterwards. + */ + auto load_a = lane < a_advance; + auto load_lane = load_a ? lane : lane - a_advance; + auto load_source = load_a ? a : b; + auto load_begin = load_a ? a_begin + b_advance : b_begin + a_advance; + auto load_size = load_a ? a_size : b_size; + + auto load_idx = load_begin + load_lane; + auto loaded = detail::checked_load(load_source, load_idx, load_size); + // shuffle the `a` values to the end of the warp + auto lower_loaded = group.shfl_up(loaded, b_advance); + a_cur = lane < b_advance ? a_cur : lower_loaded; + b_cur = lane < a_advance ? 
b_cur : loaded; + } +} + + +/** + * @internal + * Warp-parallel merge algorithm that merges two sorted ranges of arbitrary + * size, where the first range is stored in two halves. + * `merge_fun` will be called for each merged element. + * + * @param a1 the first half of the first range + * @param a1_begin the beginning offset of the first half of the first range + * @param a1_size the size of the first half of the first range. + * @param a2 the second half of the first range + * @param a2_begin the beginning offset of the second half of the first range + * @param a2_size the size of the second half of the first range. + * @param b the second range + * @param b_size the size of the second range + * @param group the group that executes the merge + * @param merge_fn the callback that is being called for each merged element. + * It takes five parameters: + * `IndexType a_idx, ValueType a_val, + * IndexType b_idx, ValueType b_val, IndexType c_index`. + * `*_val` and `*_idx` are the values resp. the indices of the + * values from a/b being compared at output index `c_index`. + */ +template +__device__ void group_merge_3way(const ValueType *a1, IndexType a1_size, + const ValueType *a2, IndexType a2_size, + const ValueType *b, IndexType b_size, + Group group, Callback merge_fn) +{ + auto a_size = a1_size + a2_size; + auto c_size = a_size + b_size; + IndexType a_begin{}; + IndexType b_begin{}; + IndexType c_begin{}; + auto lane = IndexType(group.thread_rank()); + auto a = lane < a1_size ? 
a1 : a2 - a1_size; + auto a_cur = detail::checked_load(a, a_begin + lane, a_size); + auto b_cur = detail::checked_load(b, b_begin + lane, a_size); + while (c_begin < c_size) { + auto merge_size = min(group.size(), c_size - c_begin); + auto merge_result = group_merge_step(a_cur, b_cur, merge_size, group); + if (c_begin + lane < c_size) { + merge_fn(merge_result.a_idx, merge_result.a_val, merge_result.b_idx, + merge_result.b_val, c_begin + lane); + } + auto a_advance = merge_result.a_advance; + auto b_advance = merge_result.b_advance; + a_begin += a_advance; + b_begin += b_advance; + c_begin += group.size(); + a = a_begin + lane < a1_size ? a1 : a2 - a1_size; + + // shuffle the unmerged elements to the front + a_cur = group.shfl_down(a_cur, a_advance); + b_cur = group.shfl_down(b_cur, b_advance); + /* + * To optimize memory access, we load the new elements for `a` and `b` + * with a single load instruction: + * the lower part of the group loads new elements for `a` + * the upper part of the group loads new elements for `b` + * `load_lane` is the part-local lane idx + * The elements for `a` have to be shuffled up afterwards. + */ + auto load_a = lane < a_advance; + auto load_lane = load_a ? lane : lane - a_advance; + auto load_source = load_a ? a : b; + auto load_begin = load_a ? a_begin + b_advance : b_begin + a_advance; + auto load_size = load_a ? a_size : b_size; + + auto load_idx = load_begin + load_lane; + auto loaded = detail::checked_load(load_source, load_idx, load_size); + // shuffle the `a` values to the end of the warp + auto lower_loaded = group.shfl_up(loaded, b_advance); + a_cur = lane < b_advance ? a_cur : lower_loaded; + b_cur = lane < a_advance ? b_cur : loaded; + } +} + + +/** + * @internal + * Sequential merge algorithm that merges two sorted ranges of arbitrary + * size. `merge_fun` will be called for each merged element. 
+ * + * @param a the first range + * @param a_size the size of the first range + * @param b the second range + * @param b_size the size of the second range + * @param merge_fn the callback that is being called for each merged element. + * It takes five parameters: + * `IndexType a_idx, ValueType a_val, + * IndexType b_idx, ValueType b_val, IndexType c_index`. + * `*_val` and `*_idx` are the values resp. the indices of the + * values from a/b being compared at output index `c_index`. + */ +template +__device__ void sequential_merge(const ValueType *a, IndexType a_size, + const ValueType *b, IndexType b_size, + Callback merge_fn) +{ + auto c_size = a_size + b_size; + IndexType a_begin{}; + IndexType b_begin{}; + IndexType c_begin{}; + auto a_cur = detail::checked_load(a, a_begin, a_size); + auto b_cur = detail::checked_load(b, b_begin, b_size); + while (c_begin < c_size) { + merge_fn(a_begin, a_cur, b_begin, b_cur, c_begin); + auto a_advance = a_cur < b_cur; + auto b_advance = !a_advance; + a_begin += a_advance; + b_begin += b_advance; + c_begin++; + + auto load = a_advance ? a : b; + auto load_size = a_advance ? a_size : b_size; + auto load_idx = a_advance ? a_begin : b_begin; + auto loaded = detail::checked_load(load, load_idx, load_size); + a_cur = a_advance ? loaded : a_cur; + b_cur = b_advance ? loaded : b_cur; + } +} + + +/** + * @internal + * Sequential merge algorithm that merges two sorted ranges of arbitrary + * size, where the first range is stored in two halves. + * `merge_fun` will be called for each merged element. + * + * @param a1 the first half of the first range + * @param a1_begin the beginning offset of the first half of the first range + * @param a1_size the size of the first half of the first range. + * @param a2 the second half of the first range + * @param a2_begin the beginning offset of the second half of the first range + * @param a2_size the size of the second half of the first range. 
+ * @param b the second range + * @param b_size the size of the second range + * @param merge_fn the callback that is being called for each merged element. + * It takes five parameters: + * `IndexType a_idx, ValueType a_val, + * IndexType b_idx, ValueType b_val, IndexType c_index`. + * `*_val` and `*_idx` are the values resp. the indices of the + * values from a/b being compared at output index `c_index`. + */ +template +__device__ void sequential_merge_3way(const ValueType *a1, IndexType a1_size, + const ValueType *a2, IndexType a2_size, + const ValueType *b, IndexType b_size, + Callback merge_fn) +{ + auto a = a1_size > 0 ? a1 : a2; + auto a_size = a1_size + a2_size; + auto c_size = a_size + b_size; + IndexType a_begin{}; + IndexType b_begin{}; + IndexType c_begin{}; + auto a_cur = detail::checked_load(a, a_begin, a_size); + auto b_cur = detail::checked_load(b, b_begin, b_size); + while (c_begin < c_size) { + merge_fn(a_begin, a_cur, b_begin, b_cur, c_begin); + auto a_advance = a_cur < b_cur; + auto b_advance = !a_advance; + a_begin += a_advance; + b_begin += b_advance; + c_begin++; + if (a_begin == a1_size) { + a = a2 - a1_size; + } + + auto load = a_advance ? a : b; + auto load_size = a_advance ? a_size : b_size; + auto load_idx = a_advance ? a_begin : b_begin; + auto loaded = detail::checked_load(load, load_idx, load_size); + a_cur = a_advance ? loaded : a_cur; + b_cur = b_advance ? loaded : b_cur; + } +} \ No newline at end of file diff --git a/cuda/components/merging.cuh b/cuda/components/merging.cuh new file mode 100644 index 00000000000..b1a2ffa0f21 --- /dev/null +++ b/cuda/components/merging.cuh @@ -0,0 +1,54 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_CUDA_COMPONENTS_MERGING_CUH_ +#define GKO_CUDA_COMPONENTS_MERGING_CUH_ + + +#include "cuda/components/intrinsics.cuh" +#include "cuda/components/searching.cuh" + + +namespace gko { +namespace kernels { +namespace cuda { + + +#include "common/components/merging.hpp.inc" + + +} // namespace cuda +} // namespace kernels +} // namespace gko + + +#endif // GKO_CUDA_COMPONENTS_MERGING_CUH_ diff --git a/cuda/test/components/CMakeLists.txt b/cuda/test/components/CMakeLists.txt index da1781d80db..73d7c90aebd 100644 --- a/cuda/test/components/CMakeLists.txt +++ b/cuda/test/components/CMakeLists.txt @@ -1,4 +1,5 @@ ginkgo_create_cuda_test(cooperative_groups) +ginkgo_create_cuda_test(merging) ginkgo_create_cuda_test(prefix_sum) ginkgo_create_cuda_test(searching) ginkgo_create_cuda_test(sorting) \ No newline at end of file diff --git a/cuda/test/components/merging.cu b/cuda/test/components/merging.cu new file mode 100644 index 00000000000..9b8123a960d --- /dev/null +++ b/cuda/test/components/merging.cu @@ -0,0 +1,257 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "cuda/components/merging.cuh" + + +#include +#include +#include + + +#include + + +#include +#include + + +#include "cuda/components/cooperative_groups.cuh" + + +namespace { + + +using namespace gko::kernels::cuda; +using namespace cooperative_groups; + + +class Merging : public ::testing::Test { +protected: + Merging() + : ref(gko::ReferenceExecutor::create()), + cuda(gko::CudaExecutor::create(0, ref)), + rng(123456), + rng_runs{100}, + rng_run{}, + max_size{1637}, + sizes{0, 1, 2, 3, 4, 10, 15, 16, + 31, 34, 102, 242, 534, 956, 1239, 1637}, + data1(ref, max_size), + data2(ref, max_size), + outdata(ref, 2 * max_size), + refdata(ref, 2 * max_size), + ddata1(cuda), + ddata2(cuda), + doutdata(cuda, 2 * max_size) + {} + + void init_data() + { + std::uniform_int_distribution dist(0, max_size); + std::fill_n(data1.get_data(), max_size, 0); + std::fill_n(data2.get_data(), max_size, 0); + for (auto i = 0; i < max_size; ++i) { + // here we also want to test some corner cases + // first two runs: zero data1 + if (rng_run > 0) data1.get_data()[i] = dist(rng); + // first and third run: zero data2 + if (rng_run > 3 || rng_run == 1) 
data2.get_data()[i] = dist(rng); + } + std::sort(data1.get_data(), data1.get_data() + max_size); + std::sort(data2.get_data(), data2.get_data() + max_size); + + ddata1 = data1; + ddata2 = data2; + } + + void assert_eq_ref(int size, int eq_size) + { + outdata = doutdata; + auto out_ptr = outdata.get_const_data(); + auto out_end = out_ptr + eq_size; + auto ref_ptr = refdata.get_data(); + std::copy_n(data1.get_const_data(), size, ref_ptr); + std::copy_n(data2.get_const_data(), size, ref_ptr + size); + std::sort(ref_ptr, ref_ptr + 2 * size); + + ASSERT_TRUE(std::equal(out_ptr, out_end, ref_ptr)); + } + + std::shared_ptr ref; + std::shared_ptr cuda; + std::default_random_engine rng; + + int rng_runs; + int rng_run; + int max_size; + std::vector sizes; + gko::Array data1; + gko::Array data2; + gko::Array outdata; + gko::Array refdata; + gko::Array ddata1; + gko::Array ddata2; + gko::Array doutdata; +}; + + +__global__ void test_merge_step(const gko::int32 *a, const gko::int32 *b, + gko::int32 *c) +{ + auto warp = tiled_partition(this_thread_block()); + auto i = warp.thread_rank(); + auto result = group_merge_step(a[i], b[i], config::warp_size, warp); + c[i] = min(result.a_val, result.b_val); +} + +TEST_F(Merging, MergeStep) +{ + for (auto i = 0; i < rng_runs; ++i) { + init_data(); + test_merge_step<<<1, config::warp_size>>>(ddata1.get_const_data(), + ddata2.get_const_data(), + doutdata.get_data()); + + assert_eq_ref(config::warp_size, config::warp_size); + } +} + + +__global__ void test_merge(const gko::int32 *a, const gko::int32 *b, int size, + gko::int32 *c) +{ + auto warp = tiled_partition(this_thread_block()); + group_merge(a, size, b, size, warp, + [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, + int i) { c[i] = min(a_val, b_val); }); +} + +TEST_F(Merging, FullMerge) +{ + for (auto i = 0; i < rng_runs; ++i) { + init_data(); + for (auto size : sizes) { + test_merge<<<1, config::warp_size>>>(ddata1.get_const_data(), + ddata2.get_const_data(), size, + 
doutdata.get_data()); + + assert_eq_ref(size, 2 * size); + } + } +} + + +__global__ void test_merge_3way(const gko::int32 *a, const gko::int32 *b, + int size, int separator, gko::int32 *c) +{ + auto warp = tiled_partition(this_thread_block()); + group_merge_3way( + a, separator, a + separator, size - separator, b, size, warp, + [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { + c[i] = min(a_val, b_val); + }); +} + +TEST_F(Merging, FullMerge3Way) +{ + for (auto i = 0; i < rng_runs; ++i) { + init_data(); + for (auto size : sizes) { + for (auto separator : + {0, 1, size / 3, 2 * size / 3, size - 1, size}) { + test_merge_3way<<<1, config::warp_size>>>( + ddata1.get_const_data(), ddata2.get_const_data(), size, + separator, doutdata.get_data()); + + assert_eq_ref(size, 2 * size); + } + } + } +} + + +__global__ void test_sequential_merge(const gko::int32 *a, const gko::int32 *b, + int size, gko::int32 *c) +{ + sequential_merge( + a, size, b, size, + [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { + c[i] = min(a_val, b_val); + }); +} + +TEST_F(Merging, SequentialFullMerge) +{ + for (auto i = 0; i < rng_runs; ++i) { + init_data(); + for (auto size : sizes) { + test_sequential_merge<<<1, 1>>>(ddata1.get_const_data(), + ddata2.get_const_data(), size, + doutdata.get_data()); + + assert_eq_ref(size, 2 * size); + } + } +} + + +__global__ void test_sequential_merge_3way(const gko::int32 *a, + const gko::int32 *b, int size, + int separator, gko::int32 *c) +{ + sequential_merge_3way( + a, separator, a + separator, size - separator, b, size, + [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { + c[i] = min(a_val, b_val); + }); +} + +TEST_F(Merging, SequentialFull3WayMerge) +{ + for (auto i = 0; i < rng_runs; ++i) { + init_data(); + for (auto size : sizes) { + for (auto separator : + {0, 1, size / 3, 2 * size / 3, size - 1, size}) { + test_sequential_merge_3way<<<1, 1>>>( + ddata1.get_const_data(), 
ddata2.get_const_data(), size, + separator, doutdata.get_data()); + + assert_eq_ref(size, 2 * size); + } + } + } +} + + +} // namespace diff --git a/hip/components/merging.hip.hpp b/hip/components/merging.hip.hpp new file mode 100644 index 00000000000..a236784d389 --- /dev/null +++ b/hip/components/merging.hip.hpp @@ -0,0 +1,54 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_HIP_COMPONENTS_MERGING_CUH_ +#define GKO_HIP_COMPONENTS_MERGING_CUH_ + + +#include "hip/components/intrinsics.hip.hpp" +#include "hip/components/searching.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { + + +#include "common/components/merging.hpp.inc" + + +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_COMPONENTS_MERGING_CUH_ diff --git a/hip/test/components/CMakeLists.txt b/hip/test/components/CMakeLists.txt index c0fbe34f6aa..30569774eb3 100644 --- a/hip/test/components/CMakeLists.txt +++ b/hip/test/components/CMakeLists.txt @@ -1,4 +1,5 @@ ginkgo_create_hip_test(cooperative_groups) +ginkgo_create_hip_test(merging) ginkgo_create_hip_test(prefix_sum) ginkgo_create_hip_test(searching) ginkgo_create_hip_test(sorting) \ No newline at end of file diff --git a/hip/test/components/merging.hip.cpp b/hip/test/components/merging.hip.cpp new file mode 100644 index 00000000000..fd12a71a54e --- /dev/null +++ b/hip/test/components/merging.hip.cpp @@ -0,0 +1,268 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +// TODO remove when the HIP includes are fixed +#include + + +#include "hip/components/merging.hip.hpp" + + +#include +#include +#include + + +#include + + +#include +#include + + +#include "hip/components/cooperative_groups.hip.hpp" + + +namespace { + + +using namespace gko::kernels::hip; +using namespace gko::kernels::hip::group; + + +class Merging : public ::testing::Test { +protected: + Merging() + : ref(gko::ReferenceExecutor::create()), + hip(gko::HipExecutor::create(0, ref)), + rng(123456), + rng_runs{100}, + rng_run{}, + max_size{1637}, + sizes{0, 1, 2, 3, 4, 10, 15, 16, + 31, 34, 102, 242, 534, 956, 1239, 1637}, + data1(ref, max_size), + data2(ref, max_size), + outdata(ref, 2 * max_size), + refdata(ref, 2 * max_size), + ddata1(hip), + ddata2(hip), + doutdata(hip, 2 * max_size) + {} + + void init_data() + { + std::uniform_int_distribution dist(0, max_size); + std::fill_n(data1.get_data(), max_size, 0); + std::fill_n(data2.get_data(), max_size, 0); + for (auto i = 0; i < max_size; ++i) { + // here we also want to test some corner cases + // first two runs: zero data1 + if (rng_run > 0) data1.get_data()[i] = dist(rng); + // first and third 
run: zero data2 + if (rng_run > 3 || rng_run == 1) data2.get_data()[i] = dist(rng); + } + std::sort(data1.get_data(), data1.get_data() + max_size); + std::sort(data2.get_data(), data2.get_data() + max_size); + + ddata1 = data1; + ddata2 = data2; + } + + void assert_eq_ref(int size, int eq_size) + { + outdata = doutdata; + auto out_ptr = outdata.get_const_data(); + auto out_end = out_ptr + eq_size; + auto ref_ptr = refdata.get_data(); + std::copy_n(data1.get_const_data(), size, ref_ptr); + std::copy_n(data2.get_const_data(), size, ref_ptr + size); + std::sort(ref_ptr, ref_ptr + 2 * size); + + ASSERT_TRUE(std::equal(out_ptr, out_end, ref_ptr)); + } + + std::shared_ptr ref; + std::shared_ptr hip; + std::default_random_engine rng; + + int rng_runs; + int rng_run; + int max_size; + std::vector sizes; + gko::Array data1; + gko::Array data2; + gko::Array outdata; + gko::Array refdata; + gko::Array ddata1; + gko::Array ddata2; + gko::Array doutdata; +}; + + +__global__ void test_merge_step(const gko::int32 *a, const gko::int32 *b, + gko::int32 *c) +{ + auto warp = tiled_partition(this_thread_block()); + auto i = warp.thread_rank(); + auto result = group_merge_step(a[i], b[i], config::warp_size, warp); + c[i] = min(result.a_val, result.b_val); +} + +TEST_F(Merging, MergeStep) +{ + for (auto i = 0; i < rng_runs; ++i) { + init_data(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge_step), dim3(1), + dim3(config::warp_size), 0, 0, + ddata1.get_const_data(), ddata2.get_const_data(), + doutdata.get_data()); + + assert_eq_ref(config::warp_size, config::warp_size); + } +} + + +__global__ void test_merge(const gko::int32 *a, const gko::int32 *b, int size, + gko::int32 *c) +{ + auto warp = tiled_partition(this_thread_block()); + group_merge(a, size, b, size, warp, + [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, + int i) { c[i] = min(a_val, b_val); }); +} + +TEST_F(Merging, FullMerge) +{ + for (auto i = 0; i < rng_runs; ++i) { + init_data(); + for (auto size : sizes) 
{ + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge), dim3(1), + dim3(config::warp_size), 0, 0, + ddata1.get_const_data(), ddata2.get_const_data(), + size, doutdata.get_data()); + + assert_eq_ref(size, 2 * size); + } + } +} + + +__global__ void test_merge_3way(const gko::int32 *a, const gko::int32 *b, + int size, int separator, gko::int32 *c) +{ + auto warp = tiled_partition(this_thread_block()); + group_merge_3way( + a, separator, a + separator, size - separator, b, size, warp, + [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { + c[i] = min(a_val, b_val); + }); +} + +TEST_F(Merging, FullMerge3Way) +{ + for (auto i = 0; i < rng_runs; ++i) { + init_data(); + for (auto size : sizes) { + for (auto separator : + {0, 1, size / 3, 2 * size / 3, size - 1, size}) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge_3way), dim3(1), + dim3(config::warp_size), 0, 0, + ddata1.get_const_data(), + ddata2.get_const_data(), size, separator, + doutdata.get_data()); + + assert_eq_ref(size, 2 * size); + } + } + } +} + + +__global__ void test_sequential_merge(const gko::int32 *a, const gko::int32 *b, + int size, gko::int32 *c) +{ + sequential_merge( + a, size, b, size, + [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { + c[i] = min(a_val, b_val); + }); +} + +TEST_F(Merging, SequentialFullMerge) +{ + for (auto i = 0; i < rng_runs; ++i) { + init_data(); + for (auto size : sizes) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_sequential_merge), dim3(1), + dim3(1), 0, 0, ddata1.get_const_data(), + ddata2.get_const_data(), size, + doutdata.get_data()); + + assert_eq_ref(size, 2 * size); + } + } +} + + +__global__ void test_sequential_merge_3way(const gko::int32 *a, + const gko::int32 *b, int size, + int separator, gko::int32 *c) +{ + sequential_merge_3way( + a, separator, a + separator, size - separator, b, size, + [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { + c[i] = min(a_val, b_val); + }); +} + +TEST_F(Merging, 
SequentialFull3WayMerge) +{ + for (auto i = 0; i < rng_runs; ++i) { + init_data(); + for (auto size : sizes) { + for (auto separator : + {0, 1, size / 3, 2 * size / 3, size - 1, size}) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_sequential_merge_3way), + dim3(1), dim3(1), 0, 0, + ddata1.get_const_data(), + ddata2.get_const_data(), size, separator, + doutdata.get_data()); + + assert_eq_ref(size, 2 * size); + } + } + } +} + + +} // namespace From 9719fdc66de70f100e3fa53d75e5d288ad52859a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 22 Jan 2020 08:41:10 +0100 Subject: [PATCH 3/8] add references to algorithms --- common/components/merging.hpp.inc | 4 ++++ common/components/searching.hpp.inc | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/common/components/merging.hpp.inc b/common/components/merging.hpp.inc index 59a9b84a245..ef109e2837a 100644 --- a/common/components/merging.hpp.inc +++ b/common/components/merging.hpp.inc @@ -74,6 +74,10 @@ checked_load(const ValueType *p, IndexType i, IndexType size, * the value of `a` at thread i is smaller or equal the value at thread j, and * the same holds for `b`. * + * This implementation is based on ideas from Green et al., + * "GPU merge path: a GPU merging algorithm", but uses random-access warp + * shuffles instead of shared-memory to exchange values of a and b. + * * @param a the element from the first range * @param b the element from the second range * @param size the number of elements in the output range diff --git a/common/components/searching.hpp.inc b/common/components/searching.hpp.inc index 222ab85de33..ccf4bdc6f80 100644 --- a/common/components/searching.hpp.inc +++ b/common/components/searching.hpp.inc @@ -139,6 +139,9 @@ __device__ int synchronous_binary_search(int size, Predicate p) * * It executes `log2(length / group.size())` coalescing calls to `p`. 
* + * This implementation is based on the w-wide search mentioned in + * Green et al., "GPU merge path: a GPU merging algorithm" + * * @param offset the starting index of the partitioned range * @param length the length of the partitioned range * @param group the coalescing group executing the search @@ -190,6 +193,9 @@ __device__ IndexType group_wide_search(IndexType offset, IndexType length, * It executes `log2(length) / log2(group.size())` calls to `p` that effectively * follow a random-access pattern. * + * This implementation is based on the w-partition search mentioned in + * Green et al., "GPU merge path: a GPU merging algorithm" + * * @param offset the starting index of the partitioned range * @param length the length of the partitioned range * @param group the coalescing group executing the search From 0fcdb2164b4a91a497182f5383c02f539403b047 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 22 Jan 2020 10:29:43 +0100 Subject: [PATCH 4/8] remove 3way merge, test all callback parameters --- common/components/merging.hpp.inc | 140 +--------------------------- cuda/test/components/merging.cu | 114 +++++++++++++--------- hip/test/components/merging.hip.cpp | 120 ++++++++++++++---------- 3 files changed, 147 insertions(+), 227 deletions(-) diff --git a/common/components/merging.hpp.inc b/common/components/merging.hpp.inc index ef109e2837a..368fb7a1ecf 100644 --- a/common/components/merging.hpp.inc +++ b/common/components/merging.hpp.inc @@ -161,8 +161,9 @@ __device__ void group_merge(const ValueType *a, IndexType a_size, auto merge_size = min(group.size(), c_size - c_begin); auto merge_result = group_merge_step(a_cur, b_cur, merge_size, group); if (c_begin + lane < c_size) { - merge_fn(merge_result.a_idx, merge_result.a_val, merge_result.b_idx, - merge_result.b_val, c_begin + lane); + merge_fn(merge_result.a_idx + a_begin, merge_result.a_val, + merge_result.b_idx + b_begin, merge_result.b_val, + c_begin + lane); } auto a_advance = merge_result.a_advance; 
auto b_advance = merge_result.b_advance; @@ -197,85 +198,6 @@ __device__ void group_merge(const ValueType *a, IndexType a_size, } -/** - * @internal - * Warp-parallel merge algorithm that merges two sorted ranges of arbitrary - * size, where the first range is stored in two halves. - * `merge_fun` will be called for each merged element. - * - * @param a1 the first half of the first range - * @param a1_begin the beginning offset of the first half of the first range - * @param a1_size the size of the first half of the first range. - * @param a2 the second half of the first range - * @param a2_begin the beginning offset of the second half of the first range - * @param a2_size the size of the second half of the first range. - * @param b the second range - * @param b_size the size of the second range - * @param group the group that executes the merge - * @param merge_fn the callback that is being called for each merged element. - * It takes five parameters: - * `IndexType a_idx, ValueType a_val, - * IndexType b_idx, ValueType b_val, IndexType c_index`. - * `*_val` and `*_idx` are the values resp. the indices of the - * values from a/b being compared at output index `c_index`. - */ -template -__device__ void group_merge_3way(const ValueType *a1, IndexType a1_size, - const ValueType *a2, IndexType a2_size, - const ValueType *b, IndexType b_size, - Group group, Callback merge_fn) -{ - auto a_size = a1_size + a2_size; - auto c_size = a_size + b_size; - IndexType a_begin{}; - IndexType b_begin{}; - IndexType c_begin{}; - auto lane = IndexType(group.thread_rank()); - auto a = lane < a1_size ? 
a1 : a2 - a1_size; - auto a_cur = detail::checked_load(a, a_begin + lane, a_size); - auto b_cur = detail::checked_load(b, b_begin + lane, a_size); - while (c_begin < c_size) { - auto merge_size = min(group.size(), c_size - c_begin); - auto merge_result = group_merge_step(a_cur, b_cur, merge_size, group); - if (c_begin + lane < c_size) { - merge_fn(merge_result.a_idx, merge_result.a_val, merge_result.b_idx, - merge_result.b_val, c_begin + lane); - } - auto a_advance = merge_result.a_advance; - auto b_advance = merge_result.b_advance; - a_begin += a_advance; - b_begin += b_advance; - c_begin += group.size(); - a = a_begin + lane < a1_size ? a1 : a2 - a1_size; - - // shuffle the unmerged elements to the front - a_cur = group.shfl_down(a_cur, a_advance); - b_cur = group.shfl_down(b_cur, b_advance); - /* - * To optimize memory access, we load the new elements for `a` and `b` - * with a single load instruction: - * the lower part of the group loads new elements for `a` - * the upper part of the group loads new elements for `b` - * `load_lane` is the part-local lane idx - * The elements for `a` have to be shuffled up afterwards. - */ - auto load_a = lane < a_advance; - auto load_lane = load_a ? lane : lane - a_advance; - auto load_source = load_a ? a : b; - auto load_begin = load_a ? a_begin + b_advance : b_begin + a_advance; - auto load_size = load_a ? a_size : b_size; - - auto load_idx = load_begin + load_lane; - auto loaded = detail::checked_load(load_source, load_idx, load_size); - // shuffle the `a` values to the end of the warp - auto lower_loaded = group.shfl_up(loaded, b_advance); - a_cur = lane < b_advance ? a_cur : lower_loaded; - b_cur = lane < a_advance ? b_cur : loaded; - } -} - - /** * @internal * Sequential merge algorithm that merges two sorted ranges of arbitrary @@ -311,62 +233,6 @@ __device__ void sequential_merge(const ValueType *a, IndexType a_size, b_begin += b_advance; c_begin++; - auto load = a_advance ? a : b; - auto load_size = a_advance ? 
a_size : b_size; - auto load_idx = a_advance ? a_begin : b_begin; - auto loaded = detail::checked_load(load, load_idx, load_size); - a_cur = a_advance ? loaded : a_cur; - b_cur = b_advance ? loaded : b_cur; - } -} - - -/** - * @internal - * Sequential merge algorithm that merges two sorted ranges of arbitrary - * size, where the first range is stored in two halves. - * `merge_fun` will be called for each merged element. - * - * @param a1 the first half of the first range - * @param a1_begin the beginning offset of the first half of the first range - * @param a1_size the size of the first half of the first range. - * @param a2 the second half of the first range - * @param a2_begin the beginning offset of the second half of the first range - * @param a2_size the size of the second half of the first range. - * @param b the second range - * @param b_size the size of the second range - * @param merge_fn the callback that is being called for each merged element. - * It takes five parameters: - * `IndexType a_idx, ValueType a_val, - * IndexType b_idx, ValueType b_val, IndexType c_index`. - * `*_val` and `*_idx` are the values resp. the indices of the - * values from a/b being compared at output index `c_index`. - */ -template -__device__ void sequential_merge_3way(const ValueType *a1, IndexType a1_size, - const ValueType *a2, IndexType a2_size, - const ValueType *b, IndexType b_size, - Callback merge_fn) -{ - auto a = a1_size > 0 ? a1 : a2; - auto a_size = a1_size + a2_size; - auto c_size = a_size + b_size; - IndexType a_begin{}; - IndexType b_begin{}; - IndexType c_begin{}; - auto a_cur = detail::checked_load(a, a_begin, a_size); - auto b_cur = detail::checked_load(b, b_begin, b_size); - while (c_begin < c_size) { - merge_fn(a_begin, a_cur, b_begin, b_cur, c_begin); - auto a_advance = a_cur < b_cur; - auto b_advance = !a_advance; - a_begin += a_advance; - b_begin += b_advance; - c_begin++; - if (a_begin == a1_size) { - a = a2 - a1_size; - } - auto load = a_advance ? 
a : b; auto load_size = a_advance ? a_size : b_size; auto load_idx = a_advance ? a_begin : b_begin; diff --git a/cuda/test/components/merging.cu b/cuda/test/components/merging.cu index 9b8123a960d..9b51aac04b1 100644 --- a/cuda/test/components/merging.cu +++ b/cuda/test/components/merging.cu @@ -69,9 +69,21 @@ protected: data1(ref, max_size), data2(ref, max_size), outdata(ref, 2 * max_size), + idxs1(ref), + idxs2(ref), + idxs3(ref), + refidxs1(ref), + refidxs2(ref), + refidxs3(ref), refdata(ref, 2 * max_size), ddata1(cuda), ddata2(cuda), + didxs1(cuda, 2 * max_size), + didxs2(cuda, 2 * max_size), + didxs3(cuda, 2 * max_size), + drefidxs1(cuda, 2 * max_size), + drefidxs2(cuda, 2 * max_size), + drefidxs3(cuda, 2 * max_size), doutdata(cuda, 2 * max_size) {} @@ -117,10 +129,22 @@ protected: std::vector sizes; gko::Array data1; gko::Array data2; + gko::Array idxs1; + gko::Array idxs2; + gko::Array idxs3; + gko::Array refidxs1; + gko::Array refidxs2; + gko::Array refidxs3; gko::Array outdata; gko::Array refdata; gko::Array ddata1; gko::Array ddata2; + gko::Array didxs1; + gko::Array didxs2; + gko::Array didxs3; + gko::Array drefidxs1; + gko::Array drefidxs2; + gko::Array drefidxs3; gko::Array doutdata; }; @@ -171,35 +195,6 @@ TEST_F(Merging, FullMerge) } -__global__ void test_merge_3way(const gko::int32 *a, const gko::int32 *b, - int size, int separator, gko::int32 *c) -{ - auto warp = tiled_partition(this_thread_block()); - group_merge_3way( - a, separator, a + separator, size - separator, b, size, warp, - [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { - c[i] = min(a_val, b_val); - }); -} - -TEST_F(Merging, FullMerge3Way) -{ - for (auto i = 0; i < rng_runs; ++i) { - init_data(); - for (auto size : sizes) { - for (auto separator : - {0, 1, size / 3, 2 * size / 3, size - 1, size}) { - test_merge_3way<<<1, config::warp_size>>>( - ddata1.get_const_data(), ddata2.get_const_data(), size, - separator, doutdata.get_data()); - - assert_eq_ref(size, 2 * 
size); - } - } - } -} - - __global__ void test_sequential_merge(const gko::int32 *a, const gko::int32 *b, int size, gko::int32 *c) { @@ -225,30 +220,63 @@ TEST_F(Merging, SequentialFullMerge) } -__global__ void test_sequential_merge_3way(const gko::int32 *a, - const gko::int32 *b, int size, - int separator, gko::int32 *c) +__global__ void test_merge_idxs(const gko::int32 *a, const gko::int32 *b, + int size, gko::int32 *c, gko::int32 *aidxs, + gko::int32 *bidxs, gko::int32 *cidxs, + gko::int32 *refaidxs, gko::int32 *refbidxs, + gko::int32 *refcidxs) { - sequential_merge_3way( - a, separator, a + separator, size - separator, b, size, + if (threadIdx.x == 0) { + sequential_merge(a, size, b, size, + [&](int a_idx, gko::int32 a_val, int b_idx, + gko::int32 b_val, int i) { + refaidxs[i] = a_idx; + refbidxs[i] = b_idx; + refcidxs[i] = i; + }); + } + auto warp = tiled_partition(this_thread_block()); + group_merge( + a, size, b, size, warp, [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { + aidxs[i] = a_idx; + bidxs[i] = b_idx; + cidxs[i] = i; c[i] = min(a_val, b_val); }); } -TEST_F(Merging, SequentialFull3WayMerge) +TEST_F(Merging, FullMergeIdxs) { for (auto i = 0; i < rng_runs; ++i) { init_data(); for (auto size : sizes) { - for (auto separator : - {0, 1, size / 3, 2 * size / 3, size - 1, size}) { - test_sequential_merge_3way<<<1, 1>>>( - ddata1.get_const_data(), ddata2.get_const_data(), size, - separator, doutdata.get_data()); - - assert_eq_ref(size, 2 * size); - } + test_merge_idxs<<<1, config::warp_size>>>( + ddata1.get_const_data(), ddata2.get_const_data(), size, + doutdata.get_data(), didxs1.get_data(), didxs2.get_data(), + didxs3.get_data(), drefidxs1.get_data(), drefidxs2.get_data(), + drefidxs3.get_data()); + + assert_eq_ref(size, 2 * size); + idxs1 = didxs1; + idxs2 = didxs2; + idxs3 = didxs3; + refidxs1 = drefidxs1; + refidxs2 = drefidxs2; + refidxs3 = drefidxs3; + auto idxs1_ptr = idxs1.get_const_data(); + auto idxs2_ptr = 
idxs2.get_const_data(); + auto idxs3_ptr = idxs3.get_const_data(); + auto refidxs1_ptr = refidxs1.get_const_data(); + auto refidxs2_ptr = refidxs2.get_const_data(); + auto refidxs3_ptr = refidxs3.get_const_data(); + + ASSERT_TRUE( + std::equal(idxs1_ptr, idxs1_ptr + 2 * size, refidxs1_ptr)); + ASSERT_TRUE( + std::equal(idxs2_ptr, idxs2_ptr + 2 * size, refidxs2_ptr)); + ASSERT_TRUE( + std::equal(idxs3_ptr, idxs3_ptr + 2 * size, refidxs3_ptr)); } } } diff --git a/hip/test/components/merging.hip.cpp b/hip/test/components/merging.hip.cpp index fd12a71a54e..b281a5e8e1b 100644 --- a/hip/test/components/merging.hip.cpp +++ b/hip/test/components/merging.hip.cpp @@ -73,9 +73,21 @@ class Merging : public ::testing::Test { data1(ref, max_size), data2(ref, max_size), outdata(ref, 2 * max_size), + idxs1(ref), + idxs2(ref), + idxs3(ref), + refidxs1(ref), + refidxs2(ref), + refidxs3(ref), refdata(ref, 2 * max_size), ddata1(hip), ddata2(hip), + didxs1(hip, 2 * max_size), + didxs2(hip, 2 * max_size), + didxs3(hip, 2 * max_size), + drefidxs1(hip, 2 * max_size), + drefidxs2(hip, 2 * max_size), + drefidxs3(hip, 2 * max_size), doutdata(hip, 2 * max_size) {} @@ -121,10 +133,22 @@ class Merging : public ::testing::Test { std::vector sizes; gko::Array data1; gko::Array data2; + gko::Array idxs1; + gko::Array idxs2; + gko::Array idxs3; + gko::Array refidxs1; + gko::Array refidxs2; + gko::Array refidxs3; gko::Array outdata; gko::Array refdata; gko::Array ddata1; gko::Array ddata2; + gko::Array didxs1; + gko::Array didxs2; + gko::Array didxs3; + gko::Array drefidxs1; + gko::Array drefidxs2; + gko::Array drefidxs3; gko::Array doutdata; }; @@ -177,37 +201,6 @@ TEST_F(Merging, FullMerge) } -__global__ void test_merge_3way(const gko::int32 *a, const gko::int32 *b, - int size, int separator, gko::int32 *c) -{ - auto warp = tiled_partition(this_thread_block()); - group_merge_3way( - a, separator, a + separator, size - separator, b, size, warp, - [&](int a_idx, gko::int32 a_val, int b_idx, 
gko::int32 b_val, int i) { - c[i] = min(a_val, b_val); - }); -} - -TEST_F(Merging, FullMerge3Way) -{ - for (auto i = 0; i < rng_runs; ++i) { - init_data(); - for (auto size : sizes) { - for (auto separator : - {0, 1, size / 3, 2 * size / 3, size - 1, size}) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge_3way), dim3(1), - dim3(config::warp_size), 0, 0, - ddata1.get_const_data(), - ddata2.get_const_data(), size, separator, - doutdata.get_data()); - - assert_eq_ref(size, 2 * size); - } - } - } -} - - __global__ void test_sequential_merge(const gko::int32 *a, const gko::int32 *b, int size, gko::int32 *c) { @@ -234,32 +227,65 @@ TEST_F(Merging, SequentialFullMerge) } -__global__ void test_sequential_merge_3way(const gko::int32 *a, - const gko::int32 *b, int size, - int separator, gko::int32 *c) +__global__ void test_merge_idxs(const gko::int32 *a, const gko::int32 *b, + int size, gko::int32 *c, gko::int32 *aidxs, + gko::int32 *bidxs, gko::int32 *cidxs, + gko::int32 *refaidxs, gko::int32 *refbidxs, + gko::int32 *refcidxs) { - sequential_merge_3way( - a, separator, a + separator, size - separator, b, size, + if (threadIdx.x == 0) { + sequential_merge(a, size, b, size, + [&](int a_idx, gko::int32 a_val, int b_idx, + gko::int32 b_val, int i) { + refaidxs[i] = a_idx; + refbidxs[i] = b_idx; + refcidxs[i] = i; + }); + } + auto warp = tiled_partition(this_thread_block()); + group_merge( + a, size, b, size, warp, [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { + aidxs[i] = a_idx; + bidxs[i] = b_idx; + cidxs[i] = i; c[i] = min(a_val, b_val); }); } -TEST_F(Merging, SequentialFull3WayMerge) +TEST_F(Merging, FullMergeIdxs) { for (auto i = 0; i < rng_runs; ++i) { init_data(); for (auto size : sizes) { - for (auto separator : - {0, 1, size / 3, 2 * size / 3, size - 1, size}) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(test_sequential_merge_3way), - dim3(1), dim3(1), 0, 0, - ddata1.get_const_data(), - ddata2.get_const_data(), size, separator, - 
doutdata.get_data()); - - assert_eq_ref(size, 2 * size); - } + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge_idxs), dim3(1), + dim3(config::warp_size), 0, 0, + ddata1.get_const_data(), ddata2.get_const_data(), + size, doutdata.get_data(), didxs1.get_data(), + didxs2.get_data(), didxs3.get_data(), + drefidxs1.get_data(), drefidxs2.get_data(), + drefidxs3.get_data()); + + assert_eq_ref(size, 2 * size); + idxs1 = didxs1; + idxs2 = didxs2; + idxs3 = didxs3; + refidxs1 = drefidxs1; + refidxs2 = drefidxs2; + refidxs3 = drefidxs3; + auto idxs1_ptr = idxs1.get_const_data(); + auto idxs2_ptr = idxs2.get_const_data(); + auto idxs3_ptr = idxs3.get_const_data(); + auto refidxs1_ptr = refidxs1.get_const_data(); + auto refidxs2_ptr = refidxs2.get_const_data(); + auto refidxs3_ptr = refidxs3.get_const_data(); + + ASSERT_TRUE( + std::equal(idxs1_ptr, idxs1_ptr + 2 * size, refidxs1_ptr)); + ASSERT_TRUE( + std::equal(idxs2_ptr, idxs2_ptr + 2 * size, refidxs2_ptr)); + ASSERT_TRUE( + std::equal(idxs3_ptr, idxs3_ptr + 2 * size, refidxs3_ptr)); } } } From 82542f2012b7218fdf9586979e64a861b09cacc7 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 22 Jan 2020 17:23:18 +0100 Subject: [PATCH 5/8] unroll parallel merge loop by default --- common/components/merging.hpp.inc | 20 +++++++------------- cuda/test/components/merging.cu | 12 +++++++----- hip/test/components/merging.hip.cpp | 12 +++++++----- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/common/components/merging.hpp.inc b/common/components/merging.hpp.inc index 368fb7a1ecf..a33dc0e3dc8 100644 --- a/common/components/merging.hpp.inc +++ b/common/components/merging.hpp.inc @@ -84,14 +84,11 @@ checked_load(const ValueType *p, IndexType i, IndexType size, * @param group the cooperative group that executes the merge * @return a structure containing the merge result distributed over the group. 
*/ -template +template __device__ detail::merge_result group_merge_step(ValueType a, ValueType b, - int size, Group group) { - // round up to the next power of two - auto size_pow2 = 1 << (32 - clz(uint32(size - 1))); // thread i takes care of ith element of the merged sequence auto i = int(group.thread_rank()); @@ -107,9 +104,7 @@ __device__ detail::merge_result group_merge_step(ValueType a, // b[i - x] >= a[x - 1] // and a[x] >= a[0...x - 1], b[0...i - x - 1] // => merge a[x] with b[i - x] - auto minx = synchronous_binary_search(size_pow2, [&](int x) { - // potentially faster (because of unrolling): - // auto minx = synchronous_fixed_binary_search([&](int x){ + auto minx = synchronous_fixed_binary_search([&](int x) { auto a_remote = group.shfl(a, x); auto b_remote = group.shfl(b, max(i - x - 1, 0)); return a_remote >= b_remote || x >= i; @@ -144,8 +139,8 @@ __device__ detail::merge_result group_merge_step(ValueType a, * `*_val` and `*_idx` are the values resp. the indices of the * values from a/b being compared at output index `c_index`. 
*/ -template +template __device__ void group_merge(const ValueType *a, IndexType a_size, const ValueType *b, IndexType b_size, Group group, Callback merge_fn) @@ -158,8 +153,7 @@ __device__ void group_merge(const ValueType *a, IndexType a_size, auto a_cur = detail::checked_load(a, a_begin + lane, a_size); auto b_cur = detail::checked_load(b, b_begin + lane, a_size); while (c_begin < c_size) { - auto merge_size = min(group.size(), c_size - c_begin); - auto merge_result = group_merge_step(a_cur, b_cur, merge_size, group); + auto merge_result = group_merge_step(a_cur, b_cur, group); if (c_begin + lane < c_size) { merge_fn(merge_result.a_idx + a_begin, merge_result.a_val, merge_result.b_idx + b_begin, merge_result.b_val, @@ -169,7 +163,7 @@ __device__ void group_merge(const ValueType *a, IndexType a_size, auto b_advance = merge_result.b_advance; a_begin += a_advance; b_begin += b_advance; - c_begin += group.size(); + c_begin += group_size; // shuffle the unmerged elements to the front a_cur = group.shfl_down(a_cur, a_advance); @@ -240,4 +234,4 @@ __device__ void sequential_merge(const ValueType *a, IndexType a_size, a_cur = a_advance ? loaded : a_cur; b_cur = b_advance ? 
loaded : b_cur; } -} \ No newline at end of file +} diff --git a/cuda/test/components/merging.cu b/cuda/test/components/merging.cu index 9b51aac04b1..1bd14ca584c 100644 --- a/cuda/test/components/merging.cu +++ b/cuda/test/components/merging.cu @@ -154,7 +154,7 @@ __global__ void test_merge_step(const gko::int32 *a, const gko::int32 *b, { auto warp = tiled_partition(this_thread_block()); auto i = warp.thread_rank(); - auto result = group_merge_step(a[i], b[i], config::warp_size, warp); + auto result = group_merge_step(a[i], b[i], warp); c[i] = min(result.a_val, result.b_val); } @@ -175,9 +175,11 @@ __global__ void test_merge(const gko::int32 *a, const gko::int32 *b, int size, gko::int32 *c) { auto warp = tiled_partition(this_thread_block()); - group_merge(a, size, b, size, warp, - [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, - int i) { c[i] = min(a_val, b_val); }); + group_merge( + a, size, b, size, warp, + [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { + c[i] = min(a_val, b_val); + }); } TEST_F(Merging, FullMerge) @@ -236,7 +238,7 @@ __global__ void test_merge_idxs(const gko::int32 *a, const gko::int32 *b, }); } auto warp = tiled_partition(this_thread_block()); - group_merge( + group_merge( a, size, b, size, warp, [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { aidxs[i] = a_idx; diff --git a/hip/test/components/merging.hip.cpp b/hip/test/components/merging.hip.cpp index b281a5e8e1b..886a6713d82 100644 --- a/hip/test/components/merging.hip.cpp +++ b/hip/test/components/merging.hip.cpp @@ -158,7 +158,7 @@ __global__ void test_merge_step(const gko::int32 *a, const gko::int32 *b, { auto warp = tiled_partition(this_thread_block()); auto i = warp.thread_rank(); - auto result = group_merge_step(a[i], b[i], config::warp_size, warp); + auto result = group_merge_step(a[i], b[i], warp); c[i] = min(result.a_val, result.b_val); } @@ -180,9 +180,11 @@ __global__ void test_merge(const gko::int32 *a, const gko::int32 
*b, int size, gko::int32 *c) { auto warp = tiled_partition(this_thread_block()); - group_merge(a, size, b, size, warp, - [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, - int i) { c[i] = min(a_val, b_val); }); + group_merge( + a, size, b, size, warp, + [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { + c[i] = min(a_val, b_val); + }); } TEST_F(Merging, FullMerge) @@ -243,7 +245,7 @@ __global__ void test_merge_idxs(const gko::int32 *a, const gko::int32 *b, }); } auto warp = tiled_partition(this_thread_block()); - group_merge( + group_merge( a, size, b, size, warp, [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { aidxs[i] = a_idx; From f3be9d50d67b25b96cb13ec0d185a733b1ed66ca Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 26 Jan 2020 21:33:15 +0100 Subject: [PATCH 6/8] review updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Thomas Grützmacher Co-authored-by: Terry Cojean --- common/components/merging.hpp.inc | 12 ++++++------ common/components/searching.hpp.inc | 11 ++++++++--- cuda/components/merging.cuh | 1 + cuda/components/searching.cuh | 1 - cuda/test/components/merging.cu | 27 +++++++++++++-------------- cuda/test/components/searching.cu | 9 +++++---- hip/components/merging.hip.hpp | 1 + hip/components/searching.hip.hpp | 1 - hip/test/components/merging.hip.cpp | 27 +++++++++++++-------------- hip/test/components/searching.hip.cpp | 9 +++++---- 10 files changed, 52 insertions(+), 47 deletions(-) diff --git a/common/components/merging.hpp.inc b/common/components/merging.hpp.inc index a33dc0e3dc8..adbd60a4873 100644 --- a/common/components/merging.hpp.inc +++ b/common/components/merging.hpp.inc @@ -35,7 +35,7 @@ namespace detail { /** * @internal - * The result from the @ref parallel_merge_step function. + * The result from the @ref group_merge_step function. 
*/ template struct merge_result { @@ -71,8 +71,8 @@ checked_load(const ValueType *p, IndexType i, IndexType size, * Warp-parallel merge algorithm that merges the first `warp_size` elements from * two ranges, where each warp stores a single element from each range. * It assumes that the elements are sorted in ascending order, i.e. for i < j, - * the value of `a` at thread i is smaller or equal the value at thread j, and - * the same holds for `b`. + * the value of `a` at thread i is smaller or equal to the value at thread j, + * and the same holds for `b`. * * This implementation is based on ideas from Green et al., * "GPU merge path: a GPU merging algorithm", but uses random-access warp @@ -115,7 +115,7 @@ __device__ detail::merge_result group_merge_step(ValueType a, auto a_val = group.shfl(a, a_idx); auto b_val = group.shfl(b, b_idx); auto cmp = a_val < b_val; - auto a_advance = int(popcnt(group.ballot(cmp))); + auto a_advance = popcnt(group.ballot(cmp)); auto b_advance = int(group.size()) - a_advance; return {a_val, b_val, a_idx, b_idx, a_advance, b_advance}; @@ -125,7 +125,7 @@ __device__ detail::merge_result group_merge_step(ValueType a, /** * @internal * Warp-parallel merge algorithm that merges two sorted ranges of arbitrary - * size. `merge_fun` will be called for each merged element. + * size. `merge_fn` will be called for each merged element. * * @param a the first range * @param a_size the size of the first range @@ -195,7 +195,7 @@ __device__ void group_merge(const ValueType *a, IndexType a_size, /** * @internal * Sequential merge algorithm that merges two sorted ranges of arbitrary - * size. `merge_fun` will be called for each merged element. + * size. `merge_fn` will be called for each merged element. 
* * @param a the first range * @param a_size the size of the first range diff --git a/common/components/searching.hpp.inc b/common/components/searching.hpp.inc index ccf4bdc6f80..51e54525810 100644 --- a/common/components/searching.hpp.inc +++ b/common/components/searching.hpp.inc @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * into two subranges [offset, middle), [middle, offset + length) such that * the predicate is `false` for all elements in the first range and `true` for * all elements in the second range. `middle` is called the partition point. + * If the predicate is `false` everywhere, `middle` equals `offset + length`. * The implementation is based on Stepanov & McJones, "Elements of Programming". * * @param offset the starting index of the partitioned range @@ -69,6 +70,7 @@ __device__ IndexType binary_search(IndexType offset, IndexType length, * into two subranges [offset, middle), [middle, offset + length) such that * the predicate is `false` for all elements in the first range and `true` for * all elements in the second range. `middle` is called the partition point. + * If the predicate is `false` everywhere, `middle` equals `offset + length`. * * @tparam size the length of the partitioned range - must be a power of two * @param p the predicate to be evaluated on the range - it should not have @@ -81,7 +83,7 @@ __device__ int synchronous_fixed_binary_search(Predicate p) if (size == 0) { return 0; } - auto begin = 0; + int begin{}; static_assert(size > 0, "size must be positive"); static_assert(!(size & (size - 1)), "size must be a power of two"); #pragma unroll @@ -105,6 +107,7 @@ __device__ int synchronous_fixed_binary_search(Predicate p) * into two subranges [offset, middle), [middle, offset + length) such that * the predicate is `false` for all elements in the first range and `true` for * all elements in the second range. `middle` is called the partition point. 
+ * If the predicate is `false` everywhere, `middle` equals `offset + length`. * * @param size the length of the partitioned range - must be a power of two * @param p the predicate to be evaluated on the range - it should not have @@ -117,7 +120,7 @@ __device__ int synchronous_binary_search(int size, Predicate p) if (size == 0) { return 0; } - auto begin = 0; + int begin{}; for (auto cur_size = size; cur_size > 1; cur_size /= 2) { auto half_size = cur_size / 2; auto mid = begin + half_size; @@ -136,6 +139,7 @@ __device__ int synchronous_binary_search(int size, Predicate p) * into two subranges [offset, middle), [middle, offset + length) such that * the predicate is `false` for all elements in the first range and `true` for * all elements in the second range. `middle` is called the partition point. + * If the predicate is `false` everywhere, `middle` equals `offset + length`. * * It executes `log2(length / group.size())` coalescing calls to `p`. * @@ -189,6 +193,7 @@ __device__ IndexType group_wide_search(IndexType offset, IndexType length, * into two subranges [offset, middle), [middle, offset + length) such that * the predicate is `false` for all elements in the first range and `true` for * all elements in the second range. `middle` is called the partition point. + * If the predicate is `false` everywhere, `middle` equals `offset + length`. * * It executes `log2(length) / log2(group.size())` calls to `p` that effectively * follow a random-access pattern. @@ -227,4 +232,4 @@ __device__ IndexType group_ary_search(IndexType offset, IndexType length, auto mask = group.ballot(idx >= end || p(idx)); auto pos = mask == 0 ? group.size() : ffs(mask) - 1; return offset + pos; -} \ No newline at end of file +} diff --git a/cuda/components/merging.cuh b/cuda/components/merging.cuh index b1a2ffa0f21..c84ed260d4f 100644 --- a/cuda/components/merging.cuh +++ b/cuda/components/merging.cuh @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GKO_CUDA_COMPONENTS_MERGING_CUH_ +#include "cuda/base/math.hpp" #include "cuda/components/intrinsics.cuh" #include "cuda/components/searching.cuh" diff --git a/cuda/components/searching.cuh b/cuda/components/searching.cuh index 4ebeceba720..186123e04f3 100644 --- a/cuda/components/searching.cuh +++ b/cuda/components/searching.cuh @@ -35,7 +35,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/base/config.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/intrinsics.cuh" diff --git a/cuda/test/components/merging.cu b/cuda/test/components/merging.cu index 1bd14ca584c..28c09510cc9 100644 --- a/cuda/test/components/merging.cu +++ b/cuda/test/components/merging.cu @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include @@ -62,7 +63,6 @@ protected: cuda(gko::CudaExecutor::create(0, ref)), rng(123456), rng_runs{100}, - rng_run{}, max_size{1637}, sizes{0, 1, 2, 3, 4, 10, 15, 16, 31, 34, 102, 242, 534, 956, 1239, 1637}, @@ -87,17 +87,17 @@ protected: doutdata(cuda, 2 * max_size) {} - void init_data() + void init_data(int rng_run) { std::uniform_int_distribution dist(0, max_size); std::fill_n(data1.get_data(), max_size, 0); std::fill_n(data2.get_data(), max_size, 0); - for (auto i = 0; i < max_size; ++i) { + for (int i = 0; i < max_size; ++i) { // here we also want to test some corner cases // first two runs: zero data1 - if (rng_run > 0) data1.get_data()[i] = dist(rng); + if (rng_run > 1) data1.get_data()[i] = dist(rng); // first and third run: zero data2 - if (rng_run > 3 || rng_run == 1) data2.get_data()[i] = dist(rng); + if (rng_run > 2 || rng_run == 1) data2.get_data()[i] = dist(rng); } std::sort(data1.get_data(), data1.get_data() + max_size); std::sort(data2.get_data(), data2.get_data() + max_size); @@ -124,7 +124,6 @@ protected: std::default_random_engine rng; int rng_runs; - int rng_run; int max_size; std::vector 
sizes; gko::Array data1; @@ -160,8 +159,8 @@ __global__ void test_merge_step(const gko::int32 *a, const gko::int32 *b, TEST_F(Merging, MergeStep) { - for (auto i = 0; i < rng_runs; ++i) { - init_data(); + for (int i = 0; i < rng_runs; ++i) { + init_data(i); test_merge_step<<<1, config::warp_size>>>(ddata1.get_const_data(), ddata2.get_const_data(), doutdata.get_data()); @@ -184,8 +183,8 @@ __global__ void test_merge(const gko::int32 *a, const gko::int32 *b, int size, TEST_F(Merging, FullMerge) { - for (auto i = 0; i < rng_runs; ++i) { - init_data(); + for (int i = 0; i < rng_runs; ++i) { + init_data(i); for (auto size : sizes) { test_merge<<<1, config::warp_size>>>(ddata1.get_const_data(), ddata2.get_const_data(), size, @@ -209,8 +208,8 @@ __global__ void test_sequential_merge(const gko::int32 *a, const gko::int32 *b, TEST_F(Merging, SequentialFullMerge) { - for (auto i = 0; i < rng_runs; ++i) { - init_data(); + for (int i = 0; i < rng_runs; ++i) { + init_data(i); for (auto size : sizes) { test_sequential_merge<<<1, 1>>>(ddata1.get_const_data(), ddata2.get_const_data(), size, @@ -250,8 +249,8 @@ __global__ void test_merge_idxs(const gko::int32 *a, const gko::int32 *b, TEST_F(Merging, FullMergeIdxs) { - for (auto i = 0; i < rng_runs; ++i) { - init_data(); + for (int i = 0; i < rng_runs; ++i) { + init_data(i); for (auto size : sizes) { test_merge_idxs<<<1, config::warp_size>>>( ddata1.get_const_data(), ddata2.get_const_data(), size, diff --git a/cuda/test/components/searching.cu b/cuda/test/components/searching.cu index da9f89f48ee..80bba023655 100644 --- a/cuda/test/components/searching.cu +++ b/cuda/test/components/searching.cu @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include #include @@ -97,7 +98,7 @@ __device__ void test_assert(bool *success, bool predicate) __global__ void test_binary_search(bool *success, int offset, int size) { - // test binary search on [0, size) + // test binary search on [offset, offset + size) // for all possible partition points auto result = binary_search(offset, size, [&](int i) { // don't access out-of-bounds! @@ -147,7 +148,7 @@ TEST_F(Searching, BinaryEmptyOffset) __global__ void test_sync_binary_search(bool *success, int, int size) { - // test binary search on [0, warp_size) + // test binary search on [0, size) // for all possible partition points auto result = synchronous_binary_search(size, [&](int i) { // don't access out-of-bounds! @@ -187,7 +188,7 @@ TEST_F(Searching, EmptySyncBinary) __global__ void test_warp_ary_search(bool *success, int offset, int size) { - // test binary search on [0, length) + // test binary search on [offset, offset + size) // for all possible partition points auto warp = group::tiled_partition(this_thread_block()); auto result = group_ary_search(offset, size, warp, [&](int i) { @@ -215,7 +216,7 @@ TEST_F(Searching, WarpAryOffset) __global__ void test_warp_wide_search(bool *success, int offset, int size) { - // test binary search on [0, length) + // test binary search on [offset, offset + size) // for all possible partition points auto warp = group::tiled_partition(this_thread_block()); auto result = group_wide_search(offset, size, warp, [&](int i) { diff --git a/hip/components/merging.hip.hpp b/hip/components/merging.hip.hpp index a236784d389..dde43a64885 100644 --- a/hip/components/merging.hip.hpp +++ b/hip/components/merging.hip.hpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GKO_HIP_COMPONENTS_MERGING_CUH_ +#include "hip/base/math.hip.hpp" #include "hip/components/intrinsics.hip.hpp" #include "hip/components/searching.hip.hpp" diff --git a/hip/components/searching.hip.hpp b/hip/components/searching.hip.hpp index 8211121afa4..d2d7e909a12 100644 --- a/hip/components/searching.hip.hpp +++ b/hip/components/searching.hip.hpp @@ -35,7 +35,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "hip/base/config.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/intrinsics.hip.hpp" diff --git a/hip/test/components/merging.hip.cpp b/hip/test/components/merging.hip.cpp index 886a6713d82..fe3650fb504 100644 --- a/hip/test/components/merging.hip.cpp +++ b/hip/test/components/merging.hip.cpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include @@ -66,7 +67,6 @@ class Merging : public ::testing::Test { hip(gko::HipExecutor::create(0, ref)), rng(123456), rng_runs{100}, - rng_run{}, max_size{1637}, sizes{0, 1, 2, 3, 4, 10, 15, 16, 31, 34, 102, 242, 534, 956, 1239, 1637}, @@ -91,17 +91,17 @@ class Merging : public ::testing::Test { doutdata(hip, 2 * max_size) {} - void init_data() + void init_data(int rng_run) { std::uniform_int_distribution dist(0, max_size); std::fill_n(data1.get_data(), max_size, 0); std::fill_n(data2.get_data(), max_size, 0); - for (auto i = 0; i < max_size; ++i) { + for (int i = 0; i < max_size; ++i) { // here we also want to test some corner cases // first two runs: zero data1 - if (rng_run > 0) data1.get_data()[i] = dist(rng); + if (rng_run > 1) data1.get_data()[i] = dist(rng); // first and third run: zero data2 - if (rng_run > 3 || rng_run == 1) data2.get_data()[i] = dist(rng); + if (rng_run > 2 || rng_run == 1) data2.get_data()[i] = dist(rng); } std::sort(data1.get_data(), data1.get_data() + max_size); std::sort(data2.get_data(), data2.get_data() + max_size); @@ -128,7 +128,6 
@@ class Merging : public ::testing::Test { std::default_random_engine rng; int rng_runs; - int rng_run; int max_size; std::vector sizes; gko::Array data1; @@ -164,8 +163,8 @@ __global__ void test_merge_step(const gko::int32 *a, const gko::int32 *b, TEST_F(Merging, MergeStep) { - for (auto i = 0; i < rng_runs; ++i) { - init_data(); + for (int i = 0; i < rng_runs; ++i) { + init_data(i); hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge_step), dim3(1), dim3(config::warp_size), 0, 0, ddata1.get_const_data(), ddata2.get_const_data(), @@ -189,8 +188,8 @@ __global__ void test_merge(const gko::int32 *a, const gko::int32 *b, int size, TEST_F(Merging, FullMerge) { - for (auto i = 0; i < rng_runs; ++i) { - init_data(); + for (int i = 0; i < rng_runs; ++i) { + init_data(i); for (auto size : sizes) { hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge), dim3(1), dim3(config::warp_size), 0, 0, @@ -215,8 +214,8 @@ __global__ void test_sequential_merge(const gko::int32 *a, const gko::int32 *b, TEST_F(Merging, SequentialFullMerge) { - for (auto i = 0; i < rng_runs; ++i) { - init_data(); + for (int i = 0; i < rng_runs; ++i) { + init_data(i); for (auto size : sizes) { hipLaunchKernelGGL(HIP_KERNEL_NAME(test_sequential_merge), dim3(1), dim3(1), 0, 0, ddata1.get_const_data(), @@ -257,8 +256,8 @@ __global__ void test_merge_idxs(const gko::int32 *a, const gko::int32 *b, TEST_F(Merging, FullMergeIdxs) { - for (auto i = 0; i < rng_runs; ++i) { - init_data(); + for (int i = 0; i < rng_runs; ++i) { + init_data(i); for (auto size : sizes) { hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge_idxs), dim3(1), dim3(config::warp_size), 0, 0, diff --git a/hip/test/components/searching.hip.cpp b/hip/test/components/searching.hip.cpp index 53cf4fc6b53..4797a7d6726 100644 --- a/hip/test/components/searching.hip.cpp +++ b/hip/test/components/searching.hip.cpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include #include @@ -102,7 +103,7 @@ __device__ void test_assert(bool *success, bool predicate) __global__ void test_binary_search(bool *success, int offset, int size) { - // test binary search on [0, size) + // test binary search on [offset, offset + size) // for all possible partition points auto result = binary_search(offset, size, [&](int i) { // don't access out-of-bounds! @@ -152,7 +153,7 @@ TEST_F(Searching, BinaryEmptyOffset) __global__ void test_sync_binary_search(bool *success, int, int size) { - // test binary search on [0, warp_size) + // test binary search on [0, size) // for all possible partition points auto result = synchronous_binary_search(size, [&](int i) { // don't access out-of-bounds! @@ -192,7 +193,7 @@ TEST_F(Searching, EmptySyncBinary) __global__ void test_warp_ary_search(bool *success, int offset, int size) { - // test binary search on [0, length) + // test binary search on [offset, offset + size) // for all possible partition points auto warp = tiled_partition(this_thread_block()); auto result = group_ary_search(offset, size, warp, [&](int i) { @@ -220,7 +221,7 @@ TEST_F(Searching, WarpAryOffset) __global__ void test_warp_wide_search(bool *success, int offset, int size) { - // test binary search on [0, length) + // test binary search on [offset, offset + size) // for all possible partition points auto warp = tiled_partition(this_thread_block()); auto result = group_wide_search(offset, size, warp, [&](int i) { From 27cfdb8571ae8ab4396eee2bcf193d768e925850 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 28 Jan 2020 14:24:32 +0100 Subject: [PATCH 7/8] improve merge callback interface --- common/components/intrinsics.hpp.inc | 12 +++++------ common/components/merging.hpp.inc | 20 +++++++++---------- cuda/test/components/merging.cu | 30 ++++++++++++++++------------ hip/test/components/merging.hip.cpp | 30 ++++++++++++++++------------ 4 files changed, 50 insertions(+), 42 deletions(-) diff --git 
a/common/components/intrinsics.hpp.inc b/common/components/intrinsics.hpp.inc index ac88ab02c7f..5fc8b07e1d0 100644 --- a/common/components/intrinsics.hpp.inc +++ b/common/components/intrinsics.hpp.inc @@ -34,10 +34,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * @internal * Returns the number of set bits in the given mask. */ -__device__ int popcnt(uint32 mask) { return __popc(mask); } +inline __device__ int popcnt(uint32 mask) { return __popc(mask); } /** @copydoc popcnt */ -__device__ int popcnt(uint64 mask) { return __popcll(mask); } +inline __device__ int popcnt(uint64 mask) { return __popcll(mask); } /** @@ -45,10 +45,10 @@ __device__ int popcnt(uint64 mask) { return __popcll(mask); } * Returns the (1-based!) index of the first set bit in the given mask, * starting from the least significant bit. */ -__device__ int ffs(uint32 mask) { return __ffs(mask); } +inline __device__ int ffs(uint32 mask) { return __ffs(mask); } /** @copydoc ffs */ -__device__ int ffs(uint64 mask) +inline __device__ int ffs(uint64 mask) { // the cast is necessary, as the overloads defined by HIP are ambiguous return __ffsll(static_cast(mask)); @@ -60,7 +60,7 @@ __device__ int ffs(uint64 mask) * Returns the number of zero bits before the first set bit in the given mask, * starting from the most significant bit. 
*/ -__device__ int clz(uint32 mask) { return __clz(mask); } +inline __device__ int clz(uint32 mask) { return __clz(mask); } /** @copydoc clz */ -__device__ int clz(uint64 mask) { return __clzll(mask); } +inline __device__ int clz(uint64 mask) { return __clzll(mask); } diff --git a/common/components/merging.hpp.inc b/common/components/merging.hpp.inc index adbd60a4873..fd27f9e61b6 100644 --- a/common/components/merging.hpp.inc +++ b/common/components/merging.hpp.inc @@ -133,11 +133,13 @@ __device__ detail::merge_result group_merge_step(ValueType a, * @param b_size the size of the second range * @param group the group that executes the merge * @param merge_fn the callback that is being called for each merged element. - * It takes five parameters: - * `IndexType a_idx, ValueType a_val, - * IndexType b_idx, ValueType b_val, IndexType c_index`. - * `*_val` and `*_idx` are the values resp. the indices of the - * values from a/b being compared at output index `c_index`. + * It takes six parameters: + * `IndexType a_idx, ValueType a_val, IndexType b_idx, + * ValueType b_val, IndexType c_index, bool valid`. + * `*_val` and `*_idx` are the values resp. the indices of the + * values from a/b being compared at output index `c_index`. + * `valid` specifies if the current thread has to merge an + * element (this is necessary for shfl and ballot operations). 
*/ template @@ -154,11 +156,9 @@ __device__ void group_merge(const ValueType *a, IndexType a_size, auto b_cur = detail::checked_load(b, b_begin + lane, a_size); while (c_begin < c_size) { auto merge_result = group_merge_step(a_cur, b_cur, group); - if (c_begin + lane < c_size) { - merge_fn(merge_result.a_idx + a_begin, merge_result.a_val, - merge_result.b_idx + b_begin, merge_result.b_val, - c_begin + lane); - } + merge_fn(merge_result.a_idx + a_begin, merge_result.a_val, + merge_result.b_idx + b_begin, merge_result.b_val, + c_begin + lane, c_begin + lane < c_size); auto a_advance = merge_result.a_advance; auto b_advance = merge_result.b_advance; a_begin += a_advance; diff --git a/cuda/test/components/merging.cu b/cuda/test/components/merging.cu index 28c09510cc9..450785325fe 100644 --- a/cuda/test/components/merging.cu +++ b/cuda/test/components/merging.cu @@ -174,11 +174,13 @@ __global__ void test_merge(const gko::int32 *a, const gko::int32 *b, int size, gko::int32 *c) { auto warp = tiled_partition(this_thread_block()); - group_merge( - a, size, b, size, warp, - [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { - c[i] = min(a_val, b_val); - }); + group_merge(a, size, b, size, warp, + [&](int a_idx, gko::int32 a_val, int b_idx, + gko::int32 b_val, int i, bool valid) { + if (valid) { + c[i] = min(a_val, b_val); + } + }); } TEST_F(Merging, FullMerge) @@ -237,14 +239,16 @@ __global__ void test_merge_idxs(const gko::int32 *a, const gko::int32 *b, }); } auto warp = tiled_partition(this_thread_block()); - group_merge( - a, size, b, size, warp, - [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { - aidxs[i] = a_idx; - bidxs[i] = b_idx; - cidxs[i] = i; - c[i] = min(a_val, b_val); - }); + group_merge(a, size, b, size, warp, + [&](int a_idx, gko::int32 a_val, int b_idx, + gko::int32 b_val, int i, bool valid) { + if (valid) { + aidxs[i] = a_idx; + bidxs[i] = b_idx; + cidxs[i] = i; + c[i] = min(a_val, b_val); + } + }); } 
TEST_F(Merging, FullMergeIdxs) diff --git a/hip/test/components/merging.hip.cpp b/hip/test/components/merging.hip.cpp index fe3650fb504..cde8a89c218 100644 --- a/hip/test/components/merging.hip.cpp +++ b/hip/test/components/merging.hip.cpp @@ -179,11 +179,13 @@ __global__ void test_merge(const gko::int32 *a, const gko::int32 *b, int size, gko::int32 *c) { auto warp = tiled_partition(this_thread_block()); - group_merge( - a, size, b, size, warp, - [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { - c[i] = min(a_val, b_val); - }); + group_merge(a, size, b, size, warp, + [&](int a_idx, gko::int32 a_val, int b_idx, + gko::int32 b_val, int i, bool valid) { + if (valid) { + c[i] = min(a_val, b_val); + } + }); } TEST_F(Merging, FullMerge) @@ -244,14 +246,16 @@ __global__ void test_merge_idxs(const gko::int32 *a, const gko::int32 *b, }); } auto warp = tiled_partition(this_thread_block()); - group_merge( - a, size, b, size, warp, - [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { - aidxs[i] = a_idx; - bidxs[i] = b_idx; - cidxs[i] = i; - c[i] = min(a_val, b_val); - }); + group_merge(a, size, b, size, warp, + [&](int a_idx, gko::int32 a_val, int b_idx, + gko::int32 b_val, int i, bool valid) { + if (valid) { + aidxs[i] = a_idx; + bidxs[i] = b_idx; + cidxs[i] = i; + c[i] = min(a_val, b_val); + } + }); } TEST_F(Merging, FullMergeIdxs) From 142d8917fd5f3efc1ae01d42ab53219aed04f142 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 28 Jan 2020 15:25:38 +0100 Subject: [PATCH 8/8] replace inline by __forceinline__ for intrinsics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Thomas Grützmacher --- common/components/intrinsics.hpp.inc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/common/components/intrinsics.hpp.inc b/common/components/intrinsics.hpp.inc index 5fc8b07e1d0..f89fa434eb4 100644 --- a/common/components/intrinsics.hpp.inc +++ 
b/common/components/intrinsics.hpp.inc @@ -34,10 +34,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * @internal * Returns the number of set bits in the given mask. */ -inline __device__ int popcnt(uint32 mask) { return __popc(mask); } +__forceinline__ __device__ int popcnt(uint32 mask) { return __popc(mask); } /** @copydoc popcnt */ -inline __device__ int popcnt(uint64 mask) { return __popcll(mask); } +__forceinline__ __device__ int popcnt(uint64 mask) { return __popcll(mask); } /** @@ -45,10 +45,10 @@ inline __device__ int popcnt(uint64 mask) { return __popcll(mask); } * Returns the (1-based!) index of the first set bit in the given mask, * starting from the least significant bit. */ -inline __device__ int ffs(uint32 mask) { return __ffs(mask); } +__forceinline__ __device__ int ffs(uint32 mask) { return __ffs(mask); } /** @copydoc ffs */ -inline __device__ int ffs(uint64 mask) +__forceinline__ __device__ int ffs(uint64 mask) { // the cast is necessary, as the overloads defined by HIP are ambiguous return __ffsll(static_cast(mask)); @@ -60,7 +60,7 @@ inline __device__ int ffs(uint64 mask) * Returns the number of zero bits before the first set bit in the given mask, * starting from the most significant bit. */ -inline __device__ int clz(uint32 mask) { return __clz(mask); } +__forceinline__ __device__ int clz(uint32 mask) { return __clz(mask); } /** @copydoc clz */ -inline __device__ int clz(uint64 mask) { return __clzll(mask); } +__forceinline__ __device__ int clz(uint64 mask) { return __clzll(mask); }