From fba361a86366d78c3db9f63e177a52218dcdcd6f Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 9 Jan 2020 15:54:49 +0100 Subject: [PATCH 1/8] add searching kernels and tests --- common/components/searching.hpp.inc | 224 +++++++++++++++++++++++ cuda/components/searching.cuh | 55 ++++++ cuda/test/components/CMakeLists.txt | 1 + cuda/test/components/searching.cu | 244 +++++++++++++++++++++++++ hip/components/searching.hip.hpp | 55 ++++++ hip/test/components/CMakeLists.txt | 1 + hip/test/components/searching.hip.cpp | 249 ++++++++++++++++++++++++++ 7 files changed, 829 insertions(+) create mode 100644 common/components/searching.hpp.inc create mode 100644 cuda/components/searching.cuh create mode 100644 cuda/test/components/searching.cu create mode 100644 hip/components/searching.hip.hpp create mode 100644 hip/test/components/searching.hip.cpp diff --git a/common/components/searching.hpp.inc b/common/components/searching.hpp.inc new file mode 100644 index 00000000000..222ab85de33 --- /dev/null +++ b/common/components/searching.hpp.inc @@ -0,0 +1,224 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +/** + * @internal + * Generic binary search that finds the first index where a predicate is true. + * It assumes that the predicate partitions the range [offset, offset + length) + * into two subranges [offset, middle), [middle, offset + length) such that + * the predicate is `false` for all elements in the first range and `true` for + * all elements in the second range. `middle` is called the partition point. + * The implementation is based on Stepanov & McJones, "Elements of Programming". + * + * @param offset the starting index of the partitioned range + * @param length the length of the partitioned range + * @param p the predicate to be evaluated on the range - it should not have + * side-effects and map from `IndexType` to `bool` + * @returns the index of `middle`, i.e., the partition point + */ +template +__device__ IndexType binary_search(IndexType offset, IndexType length, + Predicate p) +{ + while (length > 0) { + auto half_length = length / 2; + auto mid = offset + half_length; + auto pred = p(mid); + length = pred ? half_length : length - (half_length + 1); + offset = pred ? 
offset : mid + 1; + } + return offset; +} + + +/** + * @internal + * Generic implementation of a fixed-size binary search. + * The implementation makes sure that the number of predicate evaluations only + * depends on `length` and not on the actual position of the partition point. + * It assumes that the predicate partitions the range [offset, offset + length) + * into two subranges [offset, middle), [middle, offset + length) such that + * the predicate is `false` for all elements in the first range and `true` for + * all elements in the second range. `middle` is called the partition point. + * + * @tparam size the length of the partitioned range - must be a power of two + * @param p the predicate to be evaluated on the range - it should not have + * side-effects and map from `int` to `bool` + * @returns the index of `middle`, i.e., the partition point + */ +template +__device__ int synchronous_fixed_binary_search(Predicate p) +{ + if (size == 0) { + return 0; + } + auto begin = 0; + static_assert(size > 0, "size must be positive"); + static_assert(!(size & (size - 1)), "size must be a power of two"); +#pragma unroll + for (auto cur_size = size; cur_size > 1; cur_size /= 2) { + auto half_size = cur_size / 2; + auto mid = begin + half_size; + // invariant: [begin, begin + cur_size] contains partition point + begin = p(mid) ? begin : mid; + } + // cur_size is now 1, so the partition point is either begin or begin + 1 + return p(begin) ? begin : begin + 1; +} + + +/** + * @internal + * Generic implementation of a synchronous binary search. + * The implementation makes sure that the number of predicate evaluations only + * depends on `length` and not on the actual position of the partition point. + * It assumes that the predicate partitions the range [offset, offset + length) + * into two subranges [offset, middle), [middle, offset + length) such that + * the predicate is `false` for all elements in the first range and `true` for + * all elements in the second range. 
`middle` is called the partition point. + * + * @param size the length of the partitioned range - must be a power of two + * @param p the predicate to be evaluated on the range - it should not have + * side-effects and map from `int` to `bool` + * @returns the index of `middle`, i.e., the partition point + */ +template +__device__ int synchronous_binary_search(int size, Predicate p) +{ + if (size == 0) { + return 0; + } + auto begin = 0; + for (auto cur_size = size; cur_size > 1; cur_size /= 2) { + auto half_size = cur_size / 2; + auto mid = begin + half_size; + // invariant: [begin, begin + cur_size] contains partition point + begin = p(mid) ? begin : mid; + } + // cur_size is now 1, so the partition point is either begin or begin + 1 + return p(begin) ? begin : begin + 1; +} + + +/** + * @internal + * Generic search that finds the first index where a predicate is true. + * It assumes that the predicate partitions the range [offset, offset + length) + * into two subranges [offset, middle), [middle, offset + length) such that + * the predicate is `false` for all elements in the first range and `true` for + * all elements in the second range. `middle` is called the partition point. + * + * It executes `log2(length / group.size())` coalescing calls to `p`. 
+ * + * @param offset the starting index of the partitioned range + * @param length the length of the partitioned range + * @param group the coalescing group executing the search + * @param p the predicate to be evaluated on the range - it should not have + * side-effects and map from `IndexType` to `bool` + * @returns the index of `middle`, i.e., the partition point + */ +template +__device__ IndexType group_wide_search(IndexType offset, IndexType length, + Group group, Predicate p) +{ + // binary search on the group-sized blocks + IndexType num_blocks = (length + group.size() - 1) / group.size(); + auto group_pos = binary_search(IndexType{}, num_blocks, [&](IndexType i) { + auto idx = i * group.size(); + return p(offset + idx); + }); + // case 1: p is true everywhere: middle is at the beginning + if (group_pos == 0) { + return offset; + } + /* + * case 2: p is false somewhere: + * + * p(group_pos * g.size()) is true, so either this is the partition point, + * or the partition point is one of the g.size() - 1 previous indices. + * |block group_pos-1| + * 0 | 0 * * * * * * * | 1 + * ^ ^ + * we load this range, with the 1 acting as a sentinel for ffs(...) + * + * additionally, this means that we can't call p out-of-bounds + */ + auto base_idx = (group_pos - 1) * group.size() + 1; + auto idx = base_idx + group.thread_rank(); + auto pos = ffs(group.ballot(idx >= length || p(offset + idx))) - 1; + return offset + base_idx + pos; +} + + +/** + * @internal + * Generic search that finds the first index where a predicate is true. + * It assumes that the predicate partitions the range [offset, offset + length) + * into two subranges [offset, middle), [middle, offset + length) such that + * the predicate is `false` for all elements in the first range and `true` for + * all elements in the second range. `middle` is called the partition point. + * + * It executes `log2(length) / log2(group.size())` calls to `p` that effectively + * follow a random-access pattern. 
+ * + * @param offset the starting index of the partitioned range + * @param length the length of the partitioned range + * @param group the coalescing group executing the search + * @param p the predicate to be evaluated on the range - it should not have + * side-effects and map from `IndexType` to `bool` + * @returns the index of `middle`, i.e., the partition point + */ +template +__device__ IndexType group_ary_search(IndexType offset, IndexType length, + Group group, Predicate p) +{ + IndexType end = offset + length; + // invariant: [offset, offset + length] contains middle + while (length > group.size()) { + auto stride = length / group.size(); + auto idx = offset + group.thread_rank() * stride; + auto mask = group.ballot(p(idx)); + // if the mask is 0, the partition point is in the last block + // if the mask is ~0, the partition point is in the first block + // otherwise, we go to the last block that returned a 0. + auto pos = mask == 0 ? group.size() - 1 : ffs(mask >> 1) - 1; + auto last_length = length - stride * (group.size() - 1); + length = pos == group.size() - 1 ? last_length : stride; + offset += stride * pos; + } + auto idx = offset + group.thread_rank(); + // if the mask is 0, the partition point is at the end + // otherwise it is the first set bit + auto mask = group.ballot(idx >= end || p(idx)); + auto pos = mask == 0 ? group.size() : ffs(mask) - 1; + return offset + pos; +} \ No newline at end of file diff --git a/cuda/components/searching.cuh b/cuda/components/searching.cuh new file mode 100644 index 00000000000..4ebeceba720 --- /dev/null +++ b/cuda/components/searching.cuh @@ -0,0 +1,55 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_CUDA_COMPONENTS_SEARCHING_CUH_ +#define GKO_CUDA_COMPONENTS_SEARCHING_CUH_ + + +#include "cuda/base/config.hpp" +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/intrinsics.cuh" + + +namespace gko { +namespace kernels { +namespace cuda { + + +#include "common/components/searching.hpp.inc" + + +} // namespace cuda +} // namespace kernels +} // namespace gko + + +#endif // GKO_CUDA_COMPONENTS_SEARCHING_CUH_ diff --git a/cuda/test/components/CMakeLists.txt b/cuda/test/components/CMakeLists.txt index a4110055031..da1781d80db 100644 --- a/cuda/test/components/CMakeLists.txt +++ b/cuda/test/components/CMakeLists.txt @@ -1,3 +1,4 @@ ginkgo_create_cuda_test(cooperative_groups) ginkgo_create_cuda_test(prefix_sum) +ginkgo_create_cuda_test(searching) ginkgo_create_cuda_test(sorting) \ No newline at end of file diff --git a/cuda/test/components/searching.cu b/cuda/test/components/searching.cu new file mode 100644 index 00000000000..da9f89f48ee --- /dev/null +++ b/cuda/test/components/searching.cu @@ -0,0 +1,244 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "cuda/components/searching.cuh" + + +#include +#include + + +#include + + +#include +#include + + +#include "cuda/components/cooperative_groups.cuh" + + +namespace { + + +using namespace gko::kernels::cuda; +using cooperative_groups::this_thread_block; + + +class Searching : public ::testing::Test { +protected: + Searching() + : ref(gko::ReferenceExecutor::create()), + cuda(gko::CudaExecutor::create(0, ref)), + result(ref, 1), + dresult(cuda), + sizes(14203) + { + std::iota(sizes.begin(), sizes.end(), 0); + } + + template + void run_test(Kernel kernel, int offset, int size, unsigned num_blocks = 1) + { + *result.get_data() = true; + dresult = result; + kernel<<>>(dresult.get_data(), offset, + size); + result = dresult; + auto success = *result.get_const_data(); + + ASSERT_TRUE(success); + } + + std::shared_ptr ref; + std::shared_ptr cuda; + gko::Array result; + gko::Array dresult; + std::vector sizes; +}; + + +__device__ void test_assert(bool *success, bool predicate) +{ + if (!predicate) { + *success = false; + } +} + + +__global__ void test_binary_search(bool *success, int offset, int size) +{ + // test binary search on [0, size) + // 
for all possible partition points + auto result = binary_search(offset, size, [&](int i) { + // don't access out-of-bounds! + test_assert(success, i >= offset && i < offset + size); + return i >= threadIdx.x + offset; + }); + auto result2 = binary_search(offset, size, [&](int i) { + // don't access out-of-bounds! + test_assert(success, i >= offset && i < offset + size); + return i >= threadIdx.x + offset + 1; + }); + test_assert(success, result == threadIdx.x + offset); + test_assert(success, result2 == threadIdx.x + offset + 1); +} + +TEST_F(Searching, BinaryNoOffset) +{ + run_test(test_binary_search, 0, config::warp_size); +} + +TEST_F(Searching, BinaryOffset) +{ + run_test(test_binary_search, 5, config::warp_size); +} + + +__global__ void test_empty_binary_search(bool *success, int offset, int) +{ + auto result = binary_search(offset, 0, [&](int i) { + // don't access out-of-bounds! + test_assert(success, false); + return false; + }); + test_assert(success, result == offset); +} + +TEST_F(Searching, BinaryEmptyNoOffset) +{ + run_test(test_empty_binary_search, 0, 0); +} + +TEST_F(Searching, BinaryEmptyOffset) +{ + run_test(test_empty_binary_search, 5, 0); +} + + +__global__ void test_sync_binary_search(bool *success, int, int size) +{ + // test binary search on [0, warp_size) + // for all possible partition points + auto result = synchronous_binary_search(size, [&](int i) { + // don't access out-of-bounds! + test_assert(success, i >= 0 && i < size); + return i >= threadIdx.x; + }); + auto result2 = synchronous_binary_search(size, [&](int i) { + // don't access out-of-bounds! 
+ test_assert(success, i >= 0 && i < size); + return i >= threadIdx.x + 1; + }); + test_assert(success, result == threadIdx.x); + test_assert(success, result2 == threadIdx.x + 1); +} + +TEST_F(Searching, SyncBinary) +{ + run_test(test_sync_binary_search, 0, config::warp_size); +} + + +__global__ void test_empty_sync_binary_search(bool *success, int, int) +{ + auto result = synchronous_binary_search(0, [&](int i) { + // don't access out-of-bounds! + test_assert(success, false); + return false; + }); + test_assert(success, result == 0); +} + +TEST_F(Searching, EmptySyncBinary) +{ + run_test(test_empty_sync_binary_search, 0, config::warp_size); +} + + +__global__ void test_warp_ary_search(bool *success, int offset, int size) +{ + // test binary search on [0, length) + // for all possible partition points + auto warp = group::tiled_partition(this_thread_block()); + auto result = group_ary_search(offset, size, warp, [&](int i) { + // don't access out-of-bounds! + test_assert(success, i >= offset && i < offset + size); + return i >= blockIdx.x + offset; + }); + test_assert(success, result == blockIdx.x + offset); +} + +TEST_F(Searching, WarpAryNoOffset) +{ + for (auto size : sizes) { + run_test(test_warp_ary_search, 0, size, size + 1); + } +} + +TEST_F(Searching, WarpAryOffset) +{ + for (auto size : sizes) { + run_test(test_warp_ary_search, 134, size, size + 1); + } +} + + +__global__ void test_warp_wide_search(bool *success, int offset, int size) +{ + // test binary search on [0, length) + // for all possible partition points + auto warp = group::tiled_partition(this_thread_block()); + auto result = group_wide_search(offset, size, warp, [&](int i) { + // don't access out-of-bounds! 
+ test_assert(success, i >= offset && i < offset + size); + return i >= blockIdx.x + offset; + }); + test_assert(success, result == blockIdx.x + offset); +} + +TEST_F(Searching, WarpWideNoOffset) +{ + for (auto size : sizes) { + run_test(test_warp_wide_search, 0, size, size + 1); + } +} + +TEST_F(Searching, WarpWideOffset) +{ + for (auto size : sizes) { + run_test(test_warp_wide_search, 142, size, size + 1); + } +} + + +} // namespace diff --git a/hip/components/searching.hip.hpp b/hip/components/searching.hip.hpp new file mode 100644 index 00000000000..8211121afa4 --- /dev/null +++ b/hip/components/searching.hip.hpp @@ -0,0 +1,55 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+#ifndef GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_
+#define GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_
+
+
+#include "hip/base/config.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/intrinsics.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+
+
+#include "common/components/searching.hpp.inc"
+
+
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_
diff --git a/hip/test/components/CMakeLists.txt b/hip/test/components/CMakeLists.txt
index 2783af3fbbc..c0fbe34f6aa 100644
--- a/hip/test/components/CMakeLists.txt
+++ b/hip/test/components/CMakeLists.txt
@@ -1,3 +1,4 @@
 ginkgo_create_hip_test(cooperative_groups)
 ginkgo_create_hip_test(prefix_sum)
+ginkgo_create_hip_test(searching)
 ginkgo_create_hip_test(sorting)
\ No newline at end of file
diff --git a/hip/test/components/searching.hip.cpp b/hip/test/components/searching.hip.cpp
new file mode 100644
index 00000000000..53cf4fc6b53
--- /dev/null
+++ b/hip/test/components/searching.hip.cpp
@@ -0,0 +1,249 @@
+/*************************************************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +// TODO remove when the HIP includes are fixed +#include + + +#include "hip/components/searching.hip.hpp" + + +#include +#include + + +#include + + +#include +#include + + +#include "hip/components/cooperative_groups.hip.hpp" + + +namespace { + + +using namespace gko::kernels::hip; +using namespace gko::kernels::hip::group; + + +class Searching : public ::testing::Test { +protected: + Searching() + : ref(gko::ReferenceExecutor::create()), + hip(gko::HipExecutor::create(0, ref)), + result(ref, 1), + dresult(hip), + sizes(14203) + { + std::iota(sizes.begin(), sizes.end(), 0); + } + + template + void run_test(Kernel kernel, int offset, int size, unsigned num_blocks = 1) + { + *result.get_data() = true; + dresult = result; + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(num_blocks), + dim3(config::warp_size), 0, 0, dresult.get_data(), + offset, size); + result = dresult; + auto success = *result.get_const_data(); + + ASSERT_TRUE(success); + } + + std::shared_ptr ref; + std::shared_ptr hip; + gko::Array result; + gko::Array dresult; + std::vector sizes; +}; + + +__device__ void test_assert(bool *success, bool predicate) +{ + if (!predicate) { + *success = false; + } +} + + +__global__ void test_binary_search(bool *success, int offset, int size) +{ + // test binary search on [0, size) + // for all possible partition points + auto result = binary_search(offset, size, [&](int i) { + // don't access out-of-bounds! + test_assert(success, i >= offset && i < offset + size); + return i >= threadIdx.x + offset; + }); + auto result2 = binary_search(offset, size, [&](int i) { + // don't access out-of-bounds! 
+ test_assert(success, i >= offset && i < offset + size); + return i >= threadIdx.x + offset + 1; + }); + test_assert(success, result == threadIdx.x + offset); + test_assert(success, result2 == threadIdx.x + offset + 1); +} + +TEST_F(Searching, BinaryNoOffset) +{ + run_test(test_binary_search, 0, config::warp_size); +} + +TEST_F(Searching, BinaryOffset) +{ + run_test(test_binary_search, 5, config::warp_size); +} + + +__global__ void test_empty_binary_search(bool *success, int offset, int) +{ + auto result = binary_search(offset, 0, [&](int i) { + // don't access out-of-bounds! + test_assert(success, false); + return false; + }); + test_assert(success, result == offset); +} + +TEST_F(Searching, BinaryEmptyNoOffset) +{ + run_test(test_empty_binary_search, 0, 0); +} + +TEST_F(Searching, BinaryEmptyOffset) +{ + run_test(test_empty_binary_search, 5, 0); +} + + +__global__ void test_sync_binary_search(bool *success, int, int size) +{ + // test binary search on [0, warp_size) + // for all possible partition points + auto result = synchronous_binary_search(size, [&](int i) { + // don't access out-of-bounds! + test_assert(success, i >= 0 && i < size); + return i >= threadIdx.x; + }); + auto result2 = synchronous_binary_search(size, [&](int i) { + // don't access out-of-bounds! + test_assert(success, i >= 0 && i < size); + return i >= threadIdx.x + 1; + }); + test_assert(success, result == threadIdx.x); + test_assert(success, result2 == threadIdx.x + 1); +} + +TEST_F(Searching, SyncBinary) +{ + run_test(test_sync_binary_search, 0, config::warp_size); +} + + +__global__ void test_empty_sync_binary_search(bool *success, int, int) +{ + auto result = synchronous_binary_search(0, [&](int i) { + // don't access out-of-bounds! 
+ test_assert(success, false); + return false; + }); + test_assert(success, result == 0); +} + +TEST_F(Searching, EmptySyncBinary) +{ + run_test(test_empty_sync_binary_search, 0, config::warp_size); +} + + +__global__ void test_warp_ary_search(bool *success, int offset, int size) +{ + // test binary search on [0, length) + // for all possible partition points + auto warp = tiled_partition(this_thread_block()); + auto result = group_ary_search(offset, size, warp, [&](int i) { + // don't access out-of-bounds! + test_assert(success, i >= offset && i < offset + size); + return i >= blockIdx.x + offset; + }); + test_assert(success, result == blockIdx.x + offset); +} + +TEST_F(Searching, WarpAryNoOffset) +{ + for (auto size : sizes) { + run_test(test_warp_ary_search, 0, size, size + 1); + } +} + +TEST_F(Searching, WarpAryOffset) +{ + for (auto size : sizes) { + run_test(test_warp_ary_search, 134, size, size + 1); + } +} + + +__global__ void test_warp_wide_search(bool *success, int offset, int size) +{ + // test binary search on [0, length) + // for all possible partition points + auto warp = tiled_partition(this_thread_block()); + auto result = group_wide_search(offset, size, warp, [&](int i) { + // don't access out-of-bounds! 
+ test_assert(success, i >= offset && i < offset + size); + return i >= blockIdx.x + offset; + }); + test_assert(success, result == blockIdx.x + offset); +} + +TEST_F(Searching, WarpWideNoOffset) +{ + for (auto size : sizes) { + run_test(test_warp_wide_search, 0, size, size + 1); + } +} + +TEST_F(Searching, WarpWideOffset) +{ + for (auto size : sizes) { + run_test(test_warp_wide_search, 142, size, size + 1); + } +} + + +} // namespace From 94bc786080d92cf8e8fffbc4ebd64c13e96f9d93 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 19 Dec 2019 08:45:57 +0100 Subject: [PATCH 2/8] add merging algorithms and tests --- common/components/merging.hpp.inc | 373 ++++++++++++++++++++++++++++ cuda/components/merging.cuh | 54 ++++ cuda/test/components/CMakeLists.txt | 1 + cuda/test/components/merging.cu | 257 +++++++++++++++++++ hip/components/merging.hip.hpp | 54 ++++ hip/test/components/CMakeLists.txt | 1 + hip/test/components/merging.hip.cpp | 268 ++++++++++++++++++++ 7 files changed, 1008 insertions(+) create mode 100644 common/components/merging.hpp.inc create mode 100644 cuda/components/merging.cuh create mode 100644 cuda/test/components/merging.cu create mode 100644 hip/components/merging.hip.hpp create mode 100644 hip/test/components/merging.hip.cpp diff --git a/common/components/merging.hpp.inc b/common/components/merging.hpp.inc new file mode 100644 index 00000000000..59a9b84a245 --- /dev/null +++ b/common/components/merging.hpp.inc @@ -0,0 +1,373 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +namespace detail { + + +/** + * @internal + * The result from the @ref parallel_merge_step function. + */ +template +struct merge_result { + /** The element of a being merged in the current thread. */ + ValueType a_val; + /** The element of b being merged in the current thread. */ + ValueType b_val; + /** The index from a that is being merged in the current thread. */ + int a_idx; + /** The index from b that is being merged in the current thread. */ + int b_idx; + /** The number of elements from a that have been merged in total. */ + int a_advance; + /** The number of elements from b that have been merged in total. 
*/ + int b_advance; +}; + + +template +__device__ ValueType +checked_load(const ValueType *p, IndexType i, IndexType size, + ValueType sentinel = device_numeric_limits::max) +{ + return i < size ? p[i] : sentinel; +} + + +} // namespace detail + + +/** + * @internal + * Warp-parallel merge algorithm that merges the first `warp_size` elements from + * two ranges, where each warp stores a single element from each range. + * It assumes that the elements are sorted in ascending order, i.e. for i < j, + * the value of `a` at thread i is smaller or equal the value at thread j, and + * the same holds for `b`. + * + * @param a the element from the first range + * @param b the element from the second range + * @param size the number of elements in the output range + * @param group the cooperative group that executes the merge + * @return a structure containing the merge result distributed over the group. + */ +template +__device__ detail::merge_result group_merge_step(ValueType a, + ValueType b, + int size, + Group group) +{ + // round up to the next power of two + auto size_pow2 = 1 << (32 - clz(uint32(size - 1))); + // thread i takes care of ith element of the merged sequence + auto i = int(group.thread_rank()); + + // we want to find the smallest index `x` such that a[x] >= b[i - x - 1] + // or `i` if no such index exists + // + // if x = i then c[0...i - 1] = a[0...i - 1] + // => merge a[i] with b[0] + // if x = 0 then c[0...i - 1] = b[0...i - 1] + // => merge a[0] with b[i] + // otherwise c[0...i - 1] contains a[0...x - 1] and b[0...i - x - 1] + // because the minimality of `x` implies + // b[i - x] >= a[x - 1] + // and a[x] >= a[0...x - 1], b[0...i - x - 1] + // => merge a[x] with b[i - x] + auto minx = synchronous_binary_search(size_pow2, [&](int x) { + // potentially faster (because of unrolling): + // auto minx = synchronous_fixed_binary_search([&](int x){ + auto a_remote = group.shfl(a, x); + auto b_remote = group.shfl(b, max(i - x - 1, 0)); + return a_remote >= 
b_remote || x >= i; + }); + + auto a_idx = minx; + auto b_idx = max(i - minx, 0); + auto a_val = group.shfl(a, a_idx); + auto b_val = group.shfl(b, b_idx); + auto cmp = a_val < b_val; + auto a_advance = int(popcnt(group.ballot(cmp))); + auto b_advance = int(group.size()) - a_advance; + + return {a_val, b_val, a_idx, b_idx, a_advance, b_advance}; +} + + +/** + * @internal + * Warp-parallel merge algorithm that merges two sorted ranges of arbitrary + * size. `merge_fun` will be called for each merged element. + * + * @param a the first range + * @param a_size the size of the first range + * @param b the second range + * @param b_size the size of the second range + * @param group the group that executes the merge + * @param merge_fn the callback that is being called for each merged element. + * It takes five parameters: + * `IndexType a_idx, ValueType a_val, + * IndexType b_idx, ValueType b_val, IndexType c_index`. + * `*_val` and `*_idx` are the values resp. the indices of the + * values from a/b being compared at output index `c_index`. 
+ */ +template +__device__ void group_merge(const ValueType *a, IndexType a_size, + const ValueType *b, IndexType b_size, Group group, + Callback merge_fn) +{ + auto c_size = a_size + b_size; + IndexType a_begin{}; + IndexType b_begin{}; + IndexType c_begin{}; + auto lane = IndexType(group.thread_rank()); + auto a_cur = detail::checked_load(a, a_begin + lane, a_size); + auto b_cur = detail::checked_load(b, b_begin + lane, b_size); + while (c_begin < c_size) { + auto merge_size = min(group.size(), c_size - c_begin); + auto merge_result = group_merge_step(a_cur, b_cur, merge_size, group); + if (c_begin + lane < c_size) { + merge_fn(merge_result.a_idx, merge_result.a_val, merge_result.b_idx, + merge_result.b_val, c_begin + lane); + } + auto a_advance = merge_result.a_advance; + auto b_advance = merge_result.b_advance; + a_begin += a_advance; + b_begin += b_advance; + c_begin += group.size(); + + // shuffle the unmerged elements to the front + a_cur = group.shfl_down(a_cur, a_advance); + b_cur = group.shfl_down(b_cur, b_advance); + /* + * To optimize memory access, we load the new elements for `a` and `b` + * with a single load instruction: + * the lower part of the group loads new elements for `a` + * the upper part of the group loads new elements for `b` + * `load_lane` is the part-local lane idx + * The elements for `a` have to be shuffled up afterwards. + */ + auto load_a = lane < a_advance; + auto load_lane = load_a ? lane : lane - a_advance; + auto load_source = load_a ? a : b; + auto load_begin = load_a ? a_begin + b_advance : b_begin + a_advance; + auto load_size = load_a ? a_size : b_size; + + auto load_idx = load_begin + load_lane; + auto loaded = detail::checked_load(load_source, load_idx, load_size); + // shuffle the `a` values to the end of the warp + auto lower_loaded = group.shfl_up(loaded, b_advance); + a_cur = lane < b_advance ? a_cur : lower_loaded; + b_cur = lane < a_advance ? 
b_cur : loaded; + } +} + + +/** + * @internal + * Warp-parallel merge algorithm that merges two sorted ranges of arbitrary + * size, where the first range is stored in two halves. + * `merge_fun` will be called for each merged element. + * + * @param a1 the first half of the first range + * @param a1_begin the beginning offset of the first half of the first range + * @param a1_size the size of the first half of the first range. + * @param a2 the second half of the first range + * @param a2_begin the beginning offset of the second half of the first range + * @param a2_size the size of the second half of the first range. + * @param b the second range + * @param b_size the size of the second range + * @param group the group that executes the merge + * @param merge_fn the callback that is being called for each merged element. + * It takes five parameters: + * `IndexType a_idx, ValueType a_val, + * IndexType b_idx, ValueType b_val, IndexType c_index`. + * `*_val` and `*_idx` are the values resp. the indices of the + * values from a/b being compared at output index `c_index`. + */ +template +__device__ void group_merge_3way(const ValueType *a1, IndexType a1_size, + const ValueType *a2, IndexType a2_size, + const ValueType *b, IndexType b_size, + Group group, Callback merge_fn) +{ + auto a_size = a1_size + a2_size; + auto c_size = a_size + b_size; + IndexType a_begin{}; + IndexType b_begin{}; + IndexType c_begin{}; + auto lane = IndexType(group.thread_rank()); + auto a = lane < a1_size ? 
a1 : a2 - a1_size; + auto a_cur = detail::checked_load(a, a_begin + lane, a_size); + auto b_cur = detail::checked_load(b, b_begin + lane, a_size); + while (c_begin < c_size) { + auto merge_size = min(group.size(), c_size - c_begin); + auto merge_result = group_merge_step(a_cur, b_cur, merge_size, group); + if (c_begin + lane < c_size) { + merge_fn(merge_result.a_idx, merge_result.a_val, merge_result.b_idx, + merge_result.b_val, c_begin + lane); + } + auto a_advance = merge_result.a_advance; + auto b_advance = merge_result.b_advance; + a_begin += a_advance; + b_begin += b_advance; + c_begin += group.size(); + a = a_begin + lane < a1_size ? a1 : a2 - a1_size; + + // shuffle the unmerged elements to the front + a_cur = group.shfl_down(a_cur, a_advance); + b_cur = group.shfl_down(b_cur, b_advance); + /* + * To optimize memory access, we load the new elements for `a` and `b` + * with a single load instruction: + * the lower part of the group loads new elements for `a` + * the upper part of the group loads new elements for `b` + * `load_lane` is the part-local lane idx + * The elements for `a` have to be shuffled up afterwards. + */ + auto load_a = lane < a_advance; + auto load_lane = load_a ? lane : lane - a_advance; + auto load_source = load_a ? a : b; + auto load_begin = load_a ? a_begin + b_advance : b_begin + a_advance; + auto load_size = load_a ? a_size : b_size; + + auto load_idx = load_begin + load_lane; + auto loaded = detail::checked_load(load_source, load_idx, load_size); + // shuffle the `a` values to the end of the warp + auto lower_loaded = group.shfl_up(loaded, b_advance); + a_cur = lane < b_advance ? a_cur : lower_loaded; + b_cur = lane < a_advance ? b_cur : loaded; + } +} + + +/** + * @internal + * Sequential merge algorithm that merges two sorted ranges of arbitrary + * size. `merge_fun` will be called for each merged element. 
+ * + * @param a the first range + * @param a_size the size of the first range + * @param b the second range + * @param b_size the size of the second range + * @param merge_fn the callback that is being called for each merged element. + * It takes five parameters: + * `IndexType a_idx, ValueType a_val, + * IndexType b_idx, ValueType b_val, IndexType c_index`. + * `*_val` and `*_idx` are the values resp. the indices of the + * values from a/b being compared at output index `c_index`. + */ +template +__device__ void sequential_merge(const ValueType *a, IndexType a_size, + const ValueType *b, IndexType b_size, + Callback merge_fn) +{ + auto c_size = a_size + b_size; + IndexType a_begin{}; + IndexType b_begin{}; + IndexType c_begin{}; + auto a_cur = detail::checked_load(a, a_begin, a_size); + auto b_cur = detail::checked_load(b, b_begin, b_size); + while (c_begin < c_size) { + merge_fn(a_begin, a_cur, b_begin, b_cur, c_begin); + auto a_advance = a_cur < b_cur; + auto b_advance = !a_advance; + a_begin += a_advance; + b_begin += b_advance; + c_begin++; + + auto load = a_advance ? a : b; + auto load_size = a_advance ? a_size : b_size; + auto load_idx = a_advance ? a_begin : b_begin; + auto loaded = detail::checked_load(load, load_idx, load_size); + a_cur = a_advance ? loaded : a_cur; + b_cur = b_advance ? loaded : b_cur; + } +} + + +/** + * @internal + * Sequential merge algorithm that merges two sorted ranges of arbitrary + * size, where the first range is stored in two halves. + * `merge_fun` will be called for each merged element. + * + * @param a1 the first half of the first range + * @param a1_begin the beginning offset of the first half of the first range + * @param a1_size the size of the first half of the first range. + * @param a2 the second half of the first range + * @param a2_begin the beginning offset of the second half of the first range + * @param a2_size the size of the second half of the first range. 
+ * @param b the second range + * @param b_size the size of the second range + * @param merge_fn the callback that is being called for each merged element. + * It takes five parameters: + * `IndexType a_idx, ValueType a_val, + * IndexType b_idx, ValueType b_val, IndexType c_index`. + * `*_val` and `*_idx` are the values resp. the indices of the + * values from a/b being compared at output index `c_index`. + */ +template +__device__ void sequential_merge_3way(const ValueType *a1, IndexType a1_size, + const ValueType *a2, IndexType a2_size, + const ValueType *b, IndexType b_size, + Callback merge_fn) +{ + auto a = a1_size > 0 ? a1 : a2; + auto a_size = a1_size + a2_size; + auto c_size = a_size + b_size; + IndexType a_begin{}; + IndexType b_begin{}; + IndexType c_begin{}; + auto a_cur = detail::checked_load(a, a_begin, a_size); + auto b_cur = detail::checked_load(b, b_begin, b_size); + while (c_begin < c_size) { + merge_fn(a_begin, a_cur, b_begin, b_cur, c_begin); + auto a_advance = a_cur < b_cur; + auto b_advance = !a_advance; + a_begin += a_advance; + b_begin += b_advance; + c_begin++; + if (a_begin == a1_size) { + a = a2 - a1_size; + } + + auto load = a_advance ? a : b; + auto load_size = a_advance ? a_size : b_size; + auto load_idx = a_advance ? a_begin : b_begin; + auto loaded = detail::checked_load(load, load_idx, load_size); + a_cur = a_advance ? loaded : a_cur; + b_cur = b_advance ? loaded : b_cur; + } +} \ No newline at end of file diff --git a/cuda/components/merging.cuh b/cuda/components/merging.cuh new file mode 100644 index 00000000000..b1a2ffa0f21 --- /dev/null +++ b/cuda/components/merging.cuh @@ -0,0 +1,54 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_CUDA_COMPONENTS_MERGING_CUH_ +#define GKO_CUDA_COMPONENTS_MERGING_CUH_ + + +#include "cuda/components/intrinsics.cuh" +#include "cuda/components/searching.cuh" + + +namespace gko { +namespace kernels { +namespace cuda { + + +#include "common/components/merging.hpp.inc" + + +} // namespace cuda +} // namespace kernels +} // namespace gko + + +#endif // GKO_CUDA_COMPONENTS_MERGING_CUH_ diff --git a/cuda/test/components/CMakeLists.txt b/cuda/test/components/CMakeLists.txt index da1781d80db..73d7c90aebd 100644 --- a/cuda/test/components/CMakeLists.txt +++ b/cuda/test/components/CMakeLists.txt @@ -1,4 +1,5 @@ ginkgo_create_cuda_test(cooperative_groups) +ginkgo_create_cuda_test(merging) ginkgo_create_cuda_test(prefix_sum) ginkgo_create_cuda_test(searching) ginkgo_create_cuda_test(sorting) \ No newline at end of file diff --git a/cuda/test/components/merging.cu b/cuda/test/components/merging.cu new file mode 100644 index 00000000000..9b8123a960d --- /dev/null +++ b/cuda/test/components/merging.cu @@ -0,0 +1,257 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "cuda/components/merging.cuh" + + +#include +#include +#include + + +#include + + +#include +#include + + +#include "cuda/components/cooperative_groups.cuh" + + +namespace { + + +using namespace gko::kernels::cuda; +using namespace cooperative_groups; + + +class Merging : public ::testing::Test { +protected: + Merging() + : ref(gko::ReferenceExecutor::create()), + cuda(gko::CudaExecutor::create(0, ref)), + rng(123456), + rng_runs{100}, + rng_run{}, + max_size{1637}, + sizes{0, 1, 2, 3, 4, 10, 15, 16, + 31, 34, 102, 242, 534, 956, 1239, 1637}, + data1(ref, max_size), + data2(ref, max_size), + outdata(ref, 2 * max_size), + refdata(ref, 2 * max_size), + ddata1(cuda), + ddata2(cuda), + doutdata(cuda, 2 * max_size) + {} + + void init_data() + { + std::uniform_int_distribution dist(0, max_size); + std::fill_n(data1.get_data(), max_size, 0); + std::fill_n(data2.get_data(), max_size, 0); + for (auto i = 0; i < max_size; ++i) { + // here we also want to test some corner cases + // first two runs: zero data1 + if (rng_run > 0) data1.get_data()[i] = dist(rng); + // first and third run: zero data2 + if (rng_run > 3 || rng_run == 1) 
data2.get_data()[i] = dist(rng); + } + std::sort(data1.get_data(), data1.get_data() + max_size); + std::sort(data2.get_data(), data2.get_data() + max_size); + + ddata1 = data1; + ddata2 = data2; + } + + void assert_eq_ref(int size, int eq_size) + { + outdata = doutdata; + auto out_ptr = outdata.get_const_data(); + auto out_end = out_ptr + eq_size; + auto ref_ptr = refdata.get_data(); + std::copy_n(data1.get_const_data(), size, ref_ptr); + std::copy_n(data2.get_const_data(), size, ref_ptr + size); + std::sort(ref_ptr, ref_ptr + 2 * size); + + ASSERT_TRUE(std::equal(out_ptr, out_end, ref_ptr)); + } + + std::shared_ptr ref; + std::shared_ptr cuda; + std::default_random_engine rng; + + int rng_runs; + int rng_run; + int max_size; + std::vector sizes; + gko::Array data1; + gko::Array data2; + gko::Array outdata; + gko::Array refdata; + gko::Array ddata1; + gko::Array ddata2; + gko::Array doutdata; +}; + + +__global__ void test_merge_step(const gko::int32 *a, const gko::int32 *b, + gko::int32 *c) +{ + auto warp = tiled_partition(this_thread_block()); + auto i = warp.thread_rank(); + auto result = group_merge_step(a[i], b[i], config::warp_size, warp); + c[i] = min(result.a_val, result.b_val); +} + +TEST_F(Merging, MergeStep) +{ + for (auto i = 0; i < rng_runs; ++i) { + init_data(); + test_merge_step<<<1, config::warp_size>>>(ddata1.get_const_data(), + ddata2.get_const_data(), + doutdata.get_data()); + + assert_eq_ref(config::warp_size, config::warp_size); + } +} + + +__global__ void test_merge(const gko::int32 *a, const gko::int32 *b, int size, + gko::int32 *c) +{ + auto warp = tiled_partition(this_thread_block()); + group_merge(a, size, b, size, warp, + [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, + int i) { c[i] = min(a_val, b_val); }); +} + +TEST_F(Merging, FullMerge) +{ + for (auto i = 0; i < rng_runs; ++i) { + init_data(); + for (auto size : sizes) { + test_merge<<<1, config::warp_size>>>(ddata1.get_const_data(), + ddata2.get_const_data(), size, + 
doutdata.get_data()); + + assert_eq_ref(size, 2 * size); + } + } +} + + +__global__ void test_merge_3way(const gko::int32 *a, const gko::int32 *b, + int size, int separator, gko::int32 *c) +{ + auto warp = tiled_partition(this_thread_block()); + group_merge_3way( + a, separator, a + separator, size - separator, b, size, warp, + [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { + c[i] = min(a_val, b_val); + }); +} + +TEST_F(Merging, FullMerge3Way) +{ + for (auto i = 0; i < rng_runs; ++i) { + init_data(); + for (auto size : sizes) { + for (auto separator : + {0, 1, size / 3, 2 * size / 3, size - 1, size}) { + test_merge_3way<<<1, config::warp_size>>>( + ddata1.get_const_data(), ddata2.get_const_data(), size, + separator, doutdata.get_data()); + + assert_eq_ref(size, 2 * size); + } + } + } +} + + +__global__ void test_sequential_merge(const gko::int32 *a, const gko::int32 *b, + int size, gko::int32 *c) +{ + sequential_merge( + a, size, b, size, + [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { + c[i] = min(a_val, b_val); + }); +} + +TEST_F(Merging, SequentialFullMerge) +{ + for (auto i = 0; i < rng_runs; ++i) { + init_data(); + for (auto size : sizes) { + test_sequential_merge<<<1, 1>>>(ddata1.get_const_data(), + ddata2.get_const_data(), size, + doutdata.get_data()); + + assert_eq_ref(size, 2 * size); + } + } +} + + +__global__ void test_sequential_merge_3way(const gko::int32 *a, + const gko::int32 *b, int size, + int separator, gko::int32 *c) +{ + sequential_merge_3way( + a, separator, a + separator, size - separator, b, size, + [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { + c[i] = min(a_val, b_val); + }); +} + +TEST_F(Merging, SequentialFull3WayMerge) +{ + for (auto i = 0; i < rng_runs; ++i) { + init_data(); + for (auto size : sizes) { + for (auto separator : + {0, 1, size / 3, 2 * size / 3, size - 1, size}) { + test_sequential_merge_3way<<<1, 1>>>( + ddata1.get_const_data(), 
ddata2.get_const_data(), size, + separator, doutdata.get_data()); + + assert_eq_ref(size, 2 * size); + } + } + } +} + + +} // namespace diff --git a/hip/components/merging.hip.hpp b/hip/components/merging.hip.hpp new file mode 100644 index 00000000000..a236784d389 --- /dev/null +++ b/hip/components/merging.hip.hpp @@ -0,0 +1,54 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_HIP_COMPONENTS_MERGING_CUH_ +#define GKO_HIP_COMPONENTS_MERGING_CUH_ + + +#include "hip/components/intrinsics.hip.hpp" +#include "hip/components/searching.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { + + +#include "common/components/merging.hpp.inc" + + +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_COMPONENTS_MERGING_CUH_ diff --git a/hip/test/components/CMakeLists.txt b/hip/test/components/CMakeLists.txt index c0fbe34f6aa..30569774eb3 100644 --- a/hip/test/components/CMakeLists.txt +++ b/hip/test/components/CMakeLists.txt @@ -1,4 +1,5 @@ ginkgo_create_hip_test(cooperative_groups) +ginkgo_create_hip_test(merging) ginkgo_create_hip_test(prefix_sum) ginkgo_create_hip_test(searching) ginkgo_create_hip_test(sorting) \ No newline at end of file diff --git a/hip/test/components/merging.hip.cpp b/hip/test/components/merging.hip.cpp new file mode 100644 index 00000000000..fd12a71a54e --- /dev/null +++ b/hip/test/components/merging.hip.cpp @@ -0,0 +1,268 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +// TODO remove when the HIP includes are fixed +#include + + +#include "hip/components/merging.hip.hpp" + + +#include +#include +#include + + +#include + + +#include +#include + + +#include "hip/components/cooperative_groups.hip.hpp" + + +namespace { + + +using namespace gko::kernels::hip; +using namespace gko::kernels::hip::group; + + +class Merging : public ::testing::Test { +protected: + Merging() + : ref(gko::ReferenceExecutor::create()), + hip(gko::HipExecutor::create(0, ref)), + rng(123456), + rng_runs{100}, + rng_run{}, + max_size{1637}, + sizes{0, 1, 2, 3, 4, 10, 15, 16, + 31, 34, 102, 242, 534, 956, 1239, 1637}, + data1(ref, max_size), + data2(ref, max_size), + outdata(ref, 2 * max_size), + refdata(ref, 2 * max_size), + ddata1(hip), + ddata2(hip), + doutdata(hip, 2 * max_size) + {} + + void init_data() + { + std::uniform_int_distribution dist(0, max_size); + std::fill_n(data1.get_data(), max_size, 0); + std::fill_n(data2.get_data(), max_size, 0); + for (auto i = 0; i < max_size; ++i) { + // here we also want to test some corner cases + // first two runs: zero data1 + if (rng_run > 0) data1.get_data()[i] = dist(rng); + // first and third 
run: zero data2 + if (rng_run > 3 || rng_run == 1) data2.get_data()[i] = dist(rng); + } + std::sort(data1.get_data(), data1.get_data() + max_size); + std::sort(data2.get_data(), data2.get_data() + max_size); + + ddata1 = data1; + ddata2 = data2; + } + + void assert_eq_ref(int size, int eq_size) + { + outdata = doutdata; + auto out_ptr = outdata.get_const_data(); + auto out_end = out_ptr + eq_size; + auto ref_ptr = refdata.get_data(); + std::copy_n(data1.get_const_data(), size, ref_ptr); + std::copy_n(data2.get_const_data(), size, ref_ptr + size); + std::sort(ref_ptr, ref_ptr + 2 * size); + + ASSERT_TRUE(std::equal(out_ptr, out_end, ref_ptr)); + } + + std::shared_ptr ref; + std::shared_ptr hip; + std::default_random_engine rng; + + int rng_runs; + int rng_run; + int max_size; + std::vector sizes; + gko::Array data1; + gko::Array data2; + gko::Array outdata; + gko::Array refdata; + gko::Array ddata1; + gko::Array ddata2; + gko::Array doutdata; +}; + + +__global__ void test_merge_step(const gko::int32 *a, const gko::int32 *b, + gko::int32 *c) +{ + auto warp = tiled_partition(this_thread_block()); + auto i = warp.thread_rank(); + auto result = group_merge_step(a[i], b[i], config::warp_size, warp); + c[i] = min(result.a_val, result.b_val); +} + +TEST_F(Merging, MergeStep) +{ + for (auto i = 0; i < rng_runs; ++i) { + init_data(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge_step), dim3(1), + dim3(config::warp_size), 0, 0, + ddata1.get_const_data(), ddata2.get_const_data(), + doutdata.get_data()); + + assert_eq_ref(config::warp_size, config::warp_size); + } +} + + +__global__ void test_merge(const gko::int32 *a, const gko::int32 *b, int size, + gko::int32 *c) +{ + auto warp = tiled_partition(this_thread_block()); + group_merge(a, size, b, size, warp, + [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, + int i) { c[i] = min(a_val, b_val); }); +} + +TEST_F(Merging, FullMerge) +{ + for (auto i = 0; i < rng_runs; ++i) { + init_data(); + for (auto size : sizes) 
{ + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge), dim3(1), + dim3(config::warp_size), 0, 0, + ddata1.get_const_data(), ddata2.get_const_data(), + size, doutdata.get_data()); + + assert_eq_ref(size, 2 * size); + } + } +} + + +__global__ void test_merge_3way(const gko::int32 *a, const gko::int32 *b, + int size, int separator, gko::int32 *c) +{ + auto warp = tiled_partition(this_thread_block()); + group_merge_3way( + a, separator, a + separator, size - separator, b, size, warp, + [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { + c[i] = min(a_val, b_val); + }); +} + +TEST_F(Merging, FullMerge3Way) +{ + for (auto i = 0; i < rng_runs; ++i) { + init_data(); + for (auto size : sizes) { + for (auto separator : + {0, 1, size / 3, 2 * size / 3, size - 1, size}) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge_3way), dim3(1), + dim3(config::warp_size), 0, 0, + ddata1.get_const_data(), + ddata2.get_const_data(), size, separator, + doutdata.get_data()); + + assert_eq_ref(size, 2 * size); + } + } + } +} + + +__global__ void test_sequential_merge(const gko::int32 *a, const gko::int32 *b, + int size, gko::int32 *c) +{ + sequential_merge( + a, size, b, size, + [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { + c[i] = min(a_val, b_val); + }); +} + +TEST_F(Merging, SequentialFullMerge) +{ + for (auto i = 0; i < rng_runs; ++i) { + init_data(); + for (auto size : sizes) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_sequential_merge), dim3(1), + dim3(1), 0, 0, ddata1.get_const_data(), + ddata2.get_const_data(), size, + doutdata.get_data()); + + assert_eq_ref(size, 2 * size); + } + } +} + + +__global__ void test_sequential_merge_3way(const gko::int32 *a, + const gko::int32 *b, int size, + int separator, gko::int32 *c) +{ + sequential_merge_3way( + a, separator, a + separator, size - separator, b, size, + [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { + c[i] = min(a_val, b_val); + }); +} + +TEST_F(Merging, 
SequentialFull3WayMerge) +{ + for (auto i = 0; i < rng_runs; ++i) { + init_data(); + for (auto size : sizes) { + for (auto separator : + {0, 1, size / 3, 2 * size / 3, size - 1, size}) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_sequential_merge_3way), + dim3(1), dim3(1), 0, 0, + ddata1.get_const_data(), + ddata2.get_const_data(), size, separator, + doutdata.get_data()); + + assert_eq_ref(size, 2 * size); + } + } + } +} + + +} // namespace From 9719fdc66de70f100e3fa53d75e5d288ad52859a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 22 Jan 2020 08:41:10 +0100 Subject: [PATCH 3/8] add references to algorithms --- common/components/merging.hpp.inc | 4 ++++ common/components/searching.hpp.inc | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/common/components/merging.hpp.inc b/common/components/merging.hpp.inc index 59a9b84a245..ef109e2837a 100644 --- a/common/components/merging.hpp.inc +++ b/common/components/merging.hpp.inc @@ -74,6 +74,10 @@ checked_load(const ValueType *p, IndexType i, IndexType size, * the value of `a` at thread i is smaller or equal the value at thread j, and * the same holds for `b`. * + * This implementation is based on ideas from Green et al., + * "GPU merge path: a GPU merging algorithm", but uses random-access warp + * shuffles instead of shared-memory to exchange values of a and b. + * * @param a the element from the first range * @param b the element from the second range * @param size the number of elements in the output range diff --git a/common/components/searching.hpp.inc b/common/components/searching.hpp.inc index 222ab85de33..ccf4bdc6f80 100644 --- a/common/components/searching.hpp.inc +++ b/common/components/searching.hpp.inc @@ -139,6 +139,9 @@ __device__ int synchronous_binary_search(int size, Predicate p) * * It executes `log2(length / group.size())` coalescing calls to `p`. 
* + * This implementation is based on the w-wide search mentioned in + * Green et al., "GPU merge path: a GPU merging algorithm" + * * @param offset the starting index of the partitioned range * @param length the length of the partitioned range * @param group the coalescing group executing the search @@ -190,6 +193,9 @@ __device__ IndexType group_wide_search(IndexType offset, IndexType length, * It executes `log2(length) / log2(group.size())` calls to `p` that effectively * follow a random-access pattern. * + * This implementation is based on the w-partition search mentioned in + * Green et al., "GPU merge path: a GPU merging algorithm" + * * @param offset the starting index of the partitioned range * @param length the length of the partitioned range * @param group the coalescing group executing the search From 0fcdb2164b4a91a497182f5383c02f539403b047 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 22 Jan 2020 10:29:43 +0100 Subject: [PATCH 4/8] remove 3way merge, test all callback parameters --- common/components/merging.hpp.inc | 140 +--------------------------- cuda/test/components/merging.cu | 114 +++++++++++++--------- hip/test/components/merging.hip.cpp | 120 ++++++++++++++---------- 3 files changed, 147 insertions(+), 227 deletions(-) diff --git a/common/components/merging.hpp.inc b/common/components/merging.hpp.inc index ef109e2837a..368fb7a1ecf 100644 --- a/common/components/merging.hpp.inc +++ b/common/components/merging.hpp.inc @@ -161,8 +161,9 @@ __device__ void group_merge(const ValueType *a, IndexType a_size, auto merge_size = min(group.size(), c_size - c_begin); auto merge_result = group_merge_step(a_cur, b_cur, merge_size, group); if (c_begin + lane < c_size) { - merge_fn(merge_result.a_idx, merge_result.a_val, merge_result.b_idx, - merge_result.b_val, c_begin + lane); + merge_fn(merge_result.a_idx + a_begin, merge_result.a_val, + merge_result.b_idx + b_begin, merge_result.b_val, + c_begin + lane); } auto a_advance = merge_result.a_advance; 
auto b_advance = merge_result.b_advance; @@ -197,85 +198,6 @@ __device__ void group_merge(const ValueType *a, IndexType a_size, } -/** - * @internal - * Warp-parallel merge algorithm that merges two sorted ranges of arbitrary - * size, where the first range is stored in two halves. - * `merge_fun` will be called for each merged element. - * - * @param a1 the first half of the first range - * @param a1_begin the beginning offset of the first half of the first range - * @param a1_size the size of the first half of the first range. - * @param a2 the second half of the first range - * @param a2_begin the beginning offset of the second half of the first range - * @param a2_size the size of the second half of the first range. - * @param b the second range - * @param b_size the size of the second range - * @param group the group that executes the merge - * @param merge_fn the callback that is being called for each merged element. - * It takes five parameters: - * `IndexType a_idx, ValueType a_val, - * IndexType b_idx, ValueType b_val, IndexType c_index`. - * `*_val` and `*_idx` are the values resp. the indices of the - * values from a/b being compared at output index `c_index`. - */ -template -__device__ void group_merge_3way(const ValueType *a1, IndexType a1_size, - const ValueType *a2, IndexType a2_size, - const ValueType *b, IndexType b_size, - Group group, Callback merge_fn) -{ - auto a_size = a1_size + a2_size; - auto c_size = a_size + b_size; - IndexType a_begin{}; - IndexType b_begin{}; - IndexType c_begin{}; - auto lane = IndexType(group.thread_rank()); - auto a = lane < a1_size ? 
a1 : a2 - a1_size; - auto a_cur = detail::checked_load(a, a_begin + lane, a_size); - auto b_cur = detail::checked_load(b, b_begin + lane, a_size); - while (c_begin < c_size) { - auto merge_size = min(group.size(), c_size - c_begin); - auto merge_result = group_merge_step(a_cur, b_cur, merge_size, group); - if (c_begin + lane < c_size) { - merge_fn(merge_result.a_idx, merge_result.a_val, merge_result.b_idx, - merge_result.b_val, c_begin + lane); - } - auto a_advance = merge_result.a_advance; - auto b_advance = merge_result.b_advance; - a_begin += a_advance; - b_begin += b_advance; - c_begin += group.size(); - a = a_begin + lane < a1_size ? a1 : a2 - a1_size; - - // shuffle the unmerged elements to the front - a_cur = group.shfl_down(a_cur, a_advance); - b_cur = group.shfl_down(b_cur, b_advance); - /* - * To optimize memory access, we load the new elements for `a` and `b` - * with a single load instruction: - * the lower part of the group loads new elements for `a` - * the upper part of the group loads new elements for `b` - * `load_lane` is the part-local lane idx - * The elements for `a` have to be shuffled up afterwards. - */ - auto load_a = lane < a_advance; - auto load_lane = load_a ? lane : lane - a_advance; - auto load_source = load_a ? a : b; - auto load_begin = load_a ? a_begin + b_advance : b_begin + a_advance; - auto load_size = load_a ? a_size : b_size; - - auto load_idx = load_begin + load_lane; - auto loaded = detail::checked_load(load_source, load_idx, load_size); - // shuffle the `a` values to the end of the warp - auto lower_loaded = group.shfl_up(loaded, b_advance); - a_cur = lane < b_advance ? a_cur : lower_loaded; - b_cur = lane < a_advance ? b_cur : loaded; - } -} - - /** * @internal * Sequential merge algorithm that merges two sorted ranges of arbitrary @@ -311,62 +233,6 @@ __device__ void sequential_merge(const ValueType *a, IndexType a_size, b_begin += b_advance; c_begin++; - auto load = a_advance ? a : b; - auto load_size = a_advance ? 
a_size : b_size; - auto load_idx = a_advance ? a_begin : b_begin; - auto loaded = detail::checked_load(load, load_idx, load_size); - a_cur = a_advance ? loaded : a_cur; - b_cur = b_advance ? loaded : b_cur; - } -} - - -/** - * @internal - * Sequential merge algorithm that merges two sorted ranges of arbitrary - * size, where the first range is stored in two halves. - * `merge_fun` will be called for each merged element. - * - * @param a1 the first half of the first range - * @param a1_begin the beginning offset of the first half of the first range - * @param a1_size the size of the first half of the first range. - * @param a2 the second half of the first range - * @param a2_begin the beginning offset of the second half of the first range - * @param a2_size the size of the second half of the first range. - * @param b the second range - * @param b_size the size of the second range - * @param merge_fn the callback that is being called for each merged element. - * It takes five parameters: - * `IndexType a_idx, ValueType a_val, - * IndexType b_idx, ValueType b_val, IndexType c_index`. - * `*_val` and `*_idx` are the values resp. the indices of the - * values from a/b being compared at output index `c_index`. - */ -template -__device__ void sequential_merge_3way(const ValueType *a1, IndexType a1_size, - const ValueType *a2, IndexType a2_size, - const ValueType *b, IndexType b_size, - Callback merge_fn) -{ - auto a = a1_size > 0 ? a1 : a2; - auto a_size = a1_size + a2_size; - auto c_size = a_size + b_size; - IndexType a_begin{}; - IndexType b_begin{}; - IndexType c_begin{}; - auto a_cur = detail::checked_load(a, a_begin, a_size); - auto b_cur = detail::checked_load(b, b_begin, b_size); - while (c_begin < c_size) { - merge_fn(a_begin, a_cur, b_begin, b_cur, c_begin); - auto a_advance = a_cur < b_cur; - auto b_advance = !a_advance; - a_begin += a_advance; - b_begin += b_advance; - c_begin++; - if (a_begin == a1_size) { - a = a2 - a1_size; - } - auto load = a_advance ? 
a : b; auto load_size = a_advance ? a_size : b_size; auto load_idx = a_advance ? a_begin : b_begin; diff --git a/cuda/test/components/merging.cu b/cuda/test/components/merging.cu index 9b8123a960d..9b51aac04b1 100644 --- a/cuda/test/components/merging.cu +++ b/cuda/test/components/merging.cu @@ -69,9 +69,21 @@ protected: data1(ref, max_size), data2(ref, max_size), outdata(ref, 2 * max_size), + idxs1(ref), + idxs2(ref), + idxs3(ref), + refidxs1(ref), + refidxs2(ref), + refidxs3(ref), refdata(ref, 2 * max_size), ddata1(cuda), ddata2(cuda), + didxs1(cuda, 2 * max_size), + didxs2(cuda, 2 * max_size), + didxs3(cuda, 2 * max_size), + drefidxs1(cuda, 2 * max_size), + drefidxs2(cuda, 2 * max_size), + drefidxs3(cuda, 2 * max_size), doutdata(cuda, 2 * max_size) {} @@ -117,10 +129,22 @@ protected: std::vector sizes; gko::Array data1; gko::Array data2; + gko::Array idxs1; + gko::Array idxs2; + gko::Array idxs3; + gko::Array refidxs1; + gko::Array refidxs2; + gko::Array refidxs3; gko::Array outdata; gko::Array refdata; gko::Array ddata1; gko::Array ddata2; + gko::Array didxs1; + gko::Array didxs2; + gko::Array didxs3; + gko::Array drefidxs1; + gko::Array drefidxs2; + gko::Array drefidxs3; gko::Array doutdata; }; @@ -171,35 +195,6 @@ TEST_F(Merging, FullMerge) } -__global__ void test_merge_3way(const gko::int32 *a, const gko::int32 *b, - int size, int separator, gko::int32 *c) -{ - auto warp = tiled_partition(this_thread_block()); - group_merge_3way( - a, separator, a + separator, size - separator, b, size, warp, - [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { - c[i] = min(a_val, b_val); - }); -} - -TEST_F(Merging, FullMerge3Way) -{ - for (auto i = 0; i < rng_runs; ++i) { - init_data(); - for (auto size : sizes) { - for (auto separator : - {0, 1, size / 3, 2 * size / 3, size - 1, size}) { - test_merge_3way<<<1, config::warp_size>>>( - ddata1.get_const_data(), ddata2.get_const_data(), size, - separator, doutdata.get_data()); - - assert_eq_ref(size, 2 * 
size); - } - } - } -} - - __global__ void test_sequential_merge(const gko::int32 *a, const gko::int32 *b, int size, gko::int32 *c) { @@ -225,30 +220,63 @@ TEST_F(Merging, SequentialFullMerge) } -__global__ void test_sequential_merge_3way(const gko::int32 *a, - const gko::int32 *b, int size, - int separator, gko::int32 *c) +__global__ void test_merge_idxs(const gko::int32 *a, const gko::int32 *b, + int size, gko::int32 *c, gko::int32 *aidxs, + gko::int32 *bidxs, gko::int32 *cidxs, + gko::int32 *refaidxs, gko::int32 *refbidxs, + gko::int32 *refcidxs) { - sequential_merge_3way( - a, separator, a + separator, size - separator, b, size, + if (threadIdx.x == 0) { + sequential_merge(a, size, b, size, + [&](int a_idx, gko::int32 a_val, int b_idx, + gko::int32 b_val, int i) { + refaidxs[i] = a_idx; + refbidxs[i] = b_idx; + refcidxs[i] = i; + }); + } + auto warp = tiled_partition(this_thread_block()); + group_merge( + a, size, b, size, warp, [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { + aidxs[i] = a_idx; + bidxs[i] = b_idx; + cidxs[i] = i; c[i] = min(a_val, b_val); }); } -TEST_F(Merging, SequentialFull3WayMerge) +TEST_F(Merging, FullMergeIdxs) { for (auto i = 0; i < rng_runs; ++i) { init_data(); for (auto size : sizes) { - for (auto separator : - {0, 1, size / 3, 2 * size / 3, size - 1, size}) { - test_sequential_merge_3way<<<1, 1>>>( - ddata1.get_const_data(), ddata2.get_const_data(), size, - separator, doutdata.get_data()); - - assert_eq_ref(size, 2 * size); - } + test_merge_idxs<<<1, config::warp_size>>>( + ddata1.get_const_data(), ddata2.get_const_data(), size, + doutdata.get_data(), didxs1.get_data(), didxs2.get_data(), + didxs3.get_data(), drefidxs1.get_data(), drefidxs2.get_data(), + drefidxs3.get_data()); + + assert_eq_ref(size, 2 * size); + idxs1 = didxs1; + idxs2 = didxs2; + idxs3 = didxs3; + refidxs1 = drefidxs1; + refidxs2 = drefidxs2; + refidxs3 = drefidxs3; + auto idxs1_ptr = idxs1.get_const_data(); + auto idxs2_ptr = 
idxs2.get_const_data(); + auto idxs3_ptr = idxs3.get_const_data(); + auto refidxs1_ptr = refidxs1.get_const_data(); + auto refidxs2_ptr = refidxs2.get_const_data(); + auto refidxs3_ptr = refidxs3.get_const_data(); + + ASSERT_TRUE( + std::equal(idxs1_ptr, idxs1_ptr + 2 * size, refidxs1_ptr)); + ASSERT_TRUE( + std::equal(idxs2_ptr, idxs2_ptr + 2 * size, refidxs2_ptr)); + ASSERT_TRUE( + std::equal(idxs3_ptr, idxs3_ptr + 2 * size, refidxs3_ptr)); } } } diff --git a/hip/test/components/merging.hip.cpp b/hip/test/components/merging.hip.cpp index fd12a71a54e..b281a5e8e1b 100644 --- a/hip/test/components/merging.hip.cpp +++ b/hip/test/components/merging.hip.cpp @@ -73,9 +73,21 @@ class Merging : public ::testing::Test { data1(ref, max_size), data2(ref, max_size), outdata(ref, 2 * max_size), + idxs1(ref), + idxs2(ref), + idxs3(ref), + refidxs1(ref), + refidxs2(ref), + refidxs3(ref), refdata(ref, 2 * max_size), ddata1(hip), ddata2(hip), + didxs1(hip, 2 * max_size), + didxs2(hip, 2 * max_size), + didxs3(hip, 2 * max_size), + drefidxs1(hip, 2 * max_size), + drefidxs2(hip, 2 * max_size), + drefidxs3(hip, 2 * max_size), doutdata(hip, 2 * max_size) {} @@ -121,10 +133,22 @@ class Merging : public ::testing::Test { std::vector sizes; gko::Array data1; gko::Array data2; + gko::Array idxs1; + gko::Array idxs2; + gko::Array idxs3; + gko::Array refidxs1; + gko::Array refidxs2; + gko::Array refidxs3; gko::Array outdata; gko::Array refdata; gko::Array ddata1; gko::Array ddata2; + gko::Array didxs1; + gko::Array didxs2; + gko::Array didxs3; + gko::Array drefidxs1; + gko::Array drefidxs2; + gko::Array drefidxs3; gko::Array doutdata; }; @@ -177,37 +201,6 @@ TEST_F(Merging, FullMerge) } -__global__ void test_merge_3way(const gko::int32 *a, const gko::int32 *b, - int size, int separator, gko::int32 *c) -{ - auto warp = tiled_partition(this_thread_block()); - group_merge_3way( - a, separator, a + separator, size - separator, b, size, warp, - [&](int a_idx, gko::int32 a_val, int b_idx, 
gko::int32 b_val, int i) { - c[i] = min(a_val, b_val); - }); -} - -TEST_F(Merging, FullMerge3Way) -{ - for (auto i = 0; i < rng_runs; ++i) { - init_data(); - for (auto size : sizes) { - for (auto separator : - {0, 1, size / 3, 2 * size / 3, size - 1, size}) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge_3way), dim3(1), - dim3(config::warp_size), 0, 0, - ddata1.get_const_data(), - ddata2.get_const_data(), size, separator, - doutdata.get_data()); - - assert_eq_ref(size, 2 * size); - } - } - } -} - - __global__ void test_sequential_merge(const gko::int32 *a, const gko::int32 *b, int size, gko::int32 *c) { @@ -234,32 +227,65 @@ TEST_F(Merging, SequentialFullMerge) } -__global__ void test_sequential_merge_3way(const gko::int32 *a, - const gko::int32 *b, int size, - int separator, gko::int32 *c) +__global__ void test_merge_idxs(const gko::int32 *a, const gko::int32 *b, + int size, gko::int32 *c, gko::int32 *aidxs, + gko::int32 *bidxs, gko::int32 *cidxs, + gko::int32 *refaidxs, gko::int32 *refbidxs, + gko::int32 *refcidxs) { - sequential_merge_3way( - a, separator, a + separator, size - separator, b, size, + if (threadIdx.x == 0) { + sequential_merge(a, size, b, size, + [&](int a_idx, gko::int32 a_val, int b_idx, + gko::int32 b_val, int i) { + refaidxs[i] = a_idx; + refbidxs[i] = b_idx; + refcidxs[i] = i; + }); + } + auto warp = tiled_partition(this_thread_block()); + group_merge( + a, size, b, size, warp, [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { + aidxs[i] = a_idx; + bidxs[i] = b_idx; + cidxs[i] = i; c[i] = min(a_val, b_val); }); } -TEST_F(Merging, SequentialFull3WayMerge) +TEST_F(Merging, FullMergeIdxs) { for (auto i = 0; i < rng_runs; ++i) { init_data(); for (auto size : sizes) { - for (auto separator : - {0, 1, size / 3, 2 * size / 3, size - 1, size}) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(test_sequential_merge_3way), - dim3(1), dim3(1), 0, 0, - ddata1.get_const_data(), - ddata2.get_const_data(), size, separator, - 
doutdata.get_data()); - - assert_eq_ref(size, 2 * size); - } + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge_idxs), dim3(1), + dim3(config::warp_size), 0, 0, + ddata1.get_const_data(), ddata2.get_const_data(), + size, doutdata.get_data(), didxs1.get_data(), + didxs2.get_data(), didxs3.get_data(), + drefidxs1.get_data(), drefidxs2.get_data(), + drefidxs3.get_data()); + + assert_eq_ref(size, 2 * size); + idxs1 = didxs1; + idxs2 = didxs2; + idxs3 = didxs3; + refidxs1 = drefidxs1; + refidxs2 = drefidxs2; + refidxs3 = drefidxs3; + auto idxs1_ptr = idxs1.get_const_data(); + auto idxs2_ptr = idxs2.get_const_data(); + auto idxs3_ptr = idxs3.get_const_data(); + auto refidxs1_ptr = refidxs1.get_const_data(); + auto refidxs2_ptr = refidxs2.get_const_data(); + auto refidxs3_ptr = refidxs3.get_const_data(); + + ASSERT_TRUE( + std::equal(idxs1_ptr, idxs1_ptr + 2 * size, refidxs1_ptr)); + ASSERT_TRUE( + std::equal(idxs2_ptr, idxs2_ptr + 2 * size, refidxs2_ptr)); + ASSERT_TRUE( + std::equal(idxs3_ptr, idxs3_ptr + 2 * size, refidxs3_ptr)); } } } From 82542f2012b7218fdf9586979e64a861b09cacc7 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 22 Jan 2020 17:23:18 +0100 Subject: [PATCH 5/8] unroll parallel merge loop by default --- common/components/merging.hpp.inc | 20 +++++++------------- cuda/test/components/merging.cu | 12 +++++++----- hip/test/components/merging.hip.cpp | 12 +++++++----- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/common/components/merging.hpp.inc b/common/components/merging.hpp.inc index 368fb7a1ecf..a33dc0e3dc8 100644 --- a/common/components/merging.hpp.inc +++ b/common/components/merging.hpp.inc @@ -84,14 +84,11 @@ checked_load(const ValueType *p, IndexType i, IndexType size, * @param group the cooperative group that executes the merge * @return a structure containing the merge result distributed over the group. 
*/ -template +template __device__ detail::merge_result group_merge_step(ValueType a, ValueType b, - int size, Group group) { - // round up to the next power of two - auto size_pow2 = 1 << (32 - clz(uint32(size - 1))); // thread i takes care of ith element of the merged sequence auto i = int(group.thread_rank()); @@ -107,9 +104,7 @@ __device__ detail::merge_result group_merge_step(ValueType a, // b[i - x] >= a[x - 1] // and a[x] >= a[0...x - 1], b[0...i - x - 1] // => merge a[x] with b[i - x] - auto minx = synchronous_binary_search(size_pow2, [&](int x) { - // potentially faster (because of unrolling): - // auto minx = synchronous_fixed_binary_search([&](int x){ + auto minx = synchronous_fixed_binary_search([&](int x) { auto a_remote = group.shfl(a, x); auto b_remote = group.shfl(b, max(i - x - 1, 0)); return a_remote >= b_remote || x >= i; @@ -144,8 +139,8 @@ __device__ detail::merge_result group_merge_step(ValueType a, * `*_val` and `*_idx` are the values resp. the indices of the * values from a/b being compared at output index `c_index`. 
*/ -template +template __device__ void group_merge(const ValueType *a, IndexType a_size, const ValueType *b, IndexType b_size, Group group, Callback merge_fn) @@ -158,8 +153,7 @@ __device__ void group_merge(const ValueType *a, IndexType a_size, auto a_cur = detail::checked_load(a, a_begin + lane, a_size); auto b_cur = detail::checked_load(b, b_begin + lane, a_size); while (c_begin < c_size) { - auto merge_size = min(group.size(), c_size - c_begin); - auto merge_result = group_merge_step(a_cur, b_cur, merge_size, group); + auto merge_result = group_merge_step(a_cur, b_cur, group); if (c_begin + lane < c_size) { merge_fn(merge_result.a_idx + a_begin, merge_result.a_val, merge_result.b_idx + b_begin, merge_result.b_val, @@ -169,7 +163,7 @@ __device__ void group_merge(const ValueType *a, IndexType a_size, auto b_advance = merge_result.b_advance; a_begin += a_advance; b_begin += b_advance; - c_begin += group.size(); + c_begin += group_size; // shuffle the unmerged elements to the front a_cur = group.shfl_down(a_cur, a_advance); @@ -240,4 +234,4 @@ __device__ void sequential_merge(const ValueType *a, IndexType a_size, a_cur = a_advance ? loaded : a_cur; b_cur = b_advance ? 
loaded : b_cur; } -} \ No newline at end of file +} diff --git a/cuda/test/components/merging.cu b/cuda/test/components/merging.cu index 9b51aac04b1..1bd14ca584c 100644 --- a/cuda/test/components/merging.cu +++ b/cuda/test/components/merging.cu @@ -154,7 +154,7 @@ __global__ void test_merge_step(const gko::int32 *a, const gko::int32 *b, { auto warp = tiled_partition(this_thread_block()); auto i = warp.thread_rank(); - auto result = group_merge_step(a[i], b[i], config::warp_size, warp); + auto result = group_merge_step(a[i], b[i], warp); c[i] = min(result.a_val, result.b_val); } @@ -175,9 +175,11 @@ __global__ void test_merge(const gko::int32 *a, const gko::int32 *b, int size, gko::int32 *c) { auto warp = tiled_partition(this_thread_block()); - group_merge(a, size, b, size, warp, - [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, - int i) { c[i] = min(a_val, b_val); }); + group_merge( + a, size, b, size, warp, + [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { + c[i] = min(a_val, b_val); + }); } TEST_F(Merging, FullMerge) @@ -236,7 +238,7 @@ __global__ void test_merge_idxs(const gko::int32 *a, const gko::int32 *b, }); } auto warp = tiled_partition(this_thread_block()); - group_merge( + group_merge( a, size, b, size, warp, [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { aidxs[i] = a_idx; diff --git a/hip/test/components/merging.hip.cpp b/hip/test/components/merging.hip.cpp index b281a5e8e1b..886a6713d82 100644 --- a/hip/test/components/merging.hip.cpp +++ b/hip/test/components/merging.hip.cpp @@ -158,7 +158,7 @@ __global__ void test_merge_step(const gko::int32 *a, const gko::int32 *b, { auto warp = tiled_partition(this_thread_block()); auto i = warp.thread_rank(); - auto result = group_merge_step(a[i], b[i], config::warp_size, warp); + auto result = group_merge_step(a[i], b[i], warp); c[i] = min(result.a_val, result.b_val); } @@ -180,9 +180,11 @@ __global__ void test_merge(const gko::int32 *a, const gko::int32 
*b, int size, gko::int32 *c) { auto warp = tiled_partition(this_thread_block()); - group_merge(a, size, b, size, warp, - [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, - int i) { c[i] = min(a_val, b_val); }); + group_merge( + a, size, b, size, warp, + [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { + c[i] = min(a_val, b_val); + }); } TEST_F(Merging, FullMerge) @@ -243,7 +245,7 @@ __global__ void test_merge_idxs(const gko::int32 *a, const gko::int32 *b, }); } auto warp = tiled_partition(this_thread_block()); - group_merge( + group_merge( a, size, b, size, warp, [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { aidxs[i] = a_idx; From f3be9d50d67b25b96cb13ec0d185a733b1ed66ca Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 26 Jan 2020 21:33:15 +0100 Subject: [PATCH 6/8] review updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Thomas Grützmacher Co-authored-by: Terry Cojean --- common/components/merging.hpp.inc | 12 ++++++------ common/components/searching.hpp.inc | 11 ++++++++--- cuda/components/merging.cuh | 1 + cuda/components/searching.cuh | 1 - cuda/test/components/merging.cu | 27 +++++++++++++-------------- cuda/test/components/searching.cu | 9 +++++---- hip/components/merging.hip.hpp | 1 + hip/components/searching.hip.hpp | 1 - hip/test/components/merging.hip.cpp | 27 +++++++++++++-------------- hip/test/components/searching.hip.cpp | 9 +++++---- 10 files changed, 52 insertions(+), 47 deletions(-) diff --git a/common/components/merging.hpp.inc b/common/components/merging.hpp.inc index a33dc0e3dc8..adbd60a4873 100644 --- a/common/components/merging.hpp.inc +++ b/common/components/merging.hpp.inc @@ -35,7 +35,7 @@ namespace detail { /** * @internal - * The result from the @ref parallel_merge_step function. + * The result from the @ref group_merge_step function. 
*/ template struct merge_result { @@ -71,8 +71,8 @@ checked_load(const ValueType *p, IndexType i, IndexType size, * Warp-parallel merge algorithm that merges the first `warp_size` elements from * two ranges, where each warp stores a single element from each range. * It assumes that the elements are sorted in ascending order, i.e. for i < j, - * the value of `a` at thread i is smaller or equal the value at thread j, and - * the same holds for `b`. + * the value of `a` at thread i is smaller or equal to the value at thread j, + * and the same holds for `b`. * * This implementation is based on ideas from Green et al., * "GPU merge path: a GPU merging algorithm", but uses random-access warp @@ -115,7 +115,7 @@ __device__ detail::merge_result group_merge_step(ValueType a, auto a_val = group.shfl(a, a_idx); auto b_val = group.shfl(b, b_idx); auto cmp = a_val < b_val; - auto a_advance = int(popcnt(group.ballot(cmp))); + auto a_advance = popcnt(group.ballot(cmp)); auto b_advance = int(group.size()) - a_advance; return {a_val, b_val, a_idx, b_idx, a_advance, b_advance}; @@ -125,7 +125,7 @@ __device__ detail::merge_result group_merge_step(ValueType a, /** * @internal * Warp-parallel merge algorithm that merges two sorted ranges of arbitrary - * size. `merge_fun` will be called for each merged element. + * size. `merge_fn` will be called for each merged element. * * @param a the first range * @param a_size the size of the first range @@ -195,7 +195,7 @@ __device__ void group_merge(const ValueType *a, IndexType a_size, /** * @internal * Sequential merge algorithm that merges two sorted ranges of arbitrary - * size. `merge_fun` will be called for each merged element. + * size. `merge_fn` will be called for each merged element. 
* * @param a the first range * @param a_size the size of the first range diff --git a/common/components/searching.hpp.inc b/common/components/searching.hpp.inc index ccf4bdc6f80..51e54525810 100644 --- a/common/components/searching.hpp.inc +++ b/common/components/searching.hpp.inc @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * into two subranges [offset, middle), [middle, offset + length) such that * the predicate is `false` for all elements in the first range and `true` for * all elements in the second range. `middle` is called the partition point. + * If the predicate is `false` everywhere, `middle` equals `offset + length`. * The implementation is based on Stepanov & McJones, "Elements of Programming". * * @param offset the starting index of the partitioned range @@ -69,6 +70,7 @@ __device__ IndexType binary_search(IndexType offset, IndexType length, * into two subranges [offset, middle), [middle, offset + length) such that * the predicate is `false` for all elements in the first range and `true` for * all elements in the second range. `middle` is called the partition point. + * If the predicate is `false` everywhere, `middle` equals `offset + length`. * * @tparam size the length of the partitioned range - must be a power of two * @param p the predicate to be evaluated on the range - it should not have @@ -81,7 +83,7 @@ __device__ int synchronous_fixed_binary_search(Predicate p) if (size == 0) { return 0; } - auto begin = 0; + int begin{}; static_assert(size > 0, "size must be positive"); static_assert(!(size & (size - 1)), "size must be a power of two"); #pragma unroll @@ -105,6 +107,7 @@ __device__ int synchronous_fixed_binary_search(Predicate p) * into two subranges [offset, middle), [middle, offset + length) such that * the predicate is `false` for all elements in the first range and `true` for * all elements in the second range. `middle` is called the partition point. 
+ * If the predicate is `false` everywhere, `middle` equals `offset + length`. * * @param size the length of the partitioned range - must be a power of two * @param p the predicate to be evaluated on the range - it should not have @@ -117,7 +120,7 @@ __device__ int synchronous_binary_search(int size, Predicate p) if (size == 0) { return 0; } - auto begin = 0; + int begin{}; for (auto cur_size = size; cur_size > 1; cur_size /= 2) { auto half_size = cur_size / 2; auto mid = begin + half_size; @@ -136,6 +139,7 @@ __device__ int synchronous_binary_search(int size, Predicate p) * into two subranges [offset, middle), [middle, offset + length) such that * the predicate is `false` for all elements in the first range and `true` for * all elements in the second range. `middle` is called the partition point. + * If the predicate is `false` everywhere, `middle` equals `offset + length`. * * It executes `log2(length / group.size())` coalescing calls to `p`. * @@ -189,6 +193,7 @@ __device__ IndexType group_wide_search(IndexType offset, IndexType length, * into two subranges [offset, middle), [middle, offset + length) such that * the predicate is `false` for all elements in the first range and `true` for * all elements in the second range. `middle` is called the partition point. + * If the predicate is `false` everywhere, `middle` equals `offset + length`. * * It executes `log2(length) / log2(group.size())` calls to `p` that effectively * follow a random-access pattern. @@ -227,4 +232,4 @@ __device__ IndexType group_ary_search(IndexType offset, IndexType length, auto mask = group.ballot(idx >= end || p(idx)); auto pos = mask == 0 ? group.size() : ffs(mask) - 1; return offset + pos; -} \ No newline at end of file +} diff --git a/cuda/components/merging.cuh b/cuda/components/merging.cuh index b1a2ffa0f21..c84ed260d4f 100644 --- a/cuda/components/merging.cuh +++ b/cuda/components/merging.cuh @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GKO_CUDA_COMPONENTS_MERGING_CUH_ +#include "cuda/base/math.hpp" #include "cuda/components/intrinsics.cuh" #include "cuda/components/searching.cuh" diff --git a/cuda/components/searching.cuh b/cuda/components/searching.cuh index 4ebeceba720..186123e04f3 100644 --- a/cuda/components/searching.cuh +++ b/cuda/components/searching.cuh @@ -35,7 +35,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/base/config.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/intrinsics.cuh" diff --git a/cuda/test/components/merging.cu b/cuda/test/components/merging.cu index 1bd14ca584c..28c09510cc9 100644 --- a/cuda/test/components/merging.cu +++ b/cuda/test/components/merging.cu @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include @@ -62,7 +63,6 @@ protected: cuda(gko::CudaExecutor::create(0, ref)), rng(123456), rng_runs{100}, - rng_run{}, max_size{1637}, sizes{0, 1, 2, 3, 4, 10, 15, 16, 31, 34, 102, 242, 534, 956, 1239, 1637}, @@ -87,17 +87,17 @@ protected: doutdata(cuda, 2 * max_size) {} - void init_data() + void init_data(int rng_run) { std::uniform_int_distribution dist(0, max_size); std::fill_n(data1.get_data(), max_size, 0); std::fill_n(data2.get_data(), max_size, 0); - for (auto i = 0; i < max_size; ++i) { + for (int i = 0; i < max_size; ++i) { // here we also want to test some corner cases // first two runs: zero data1 - if (rng_run > 0) data1.get_data()[i] = dist(rng); + if (rng_run > 1) data1.get_data()[i] = dist(rng); // first and third run: zero data2 - if (rng_run > 3 || rng_run == 1) data2.get_data()[i] = dist(rng); + if (rng_run > 2 || rng_run == 1) data2.get_data()[i] = dist(rng); } std::sort(data1.get_data(), data1.get_data() + max_size); std::sort(data2.get_data(), data2.get_data() + max_size); @@ -124,7 +124,6 @@ protected: std::default_random_engine rng; int rng_runs; - int rng_run; int max_size; std::vector 
sizes; gko::Array data1; @@ -160,8 +159,8 @@ __global__ void test_merge_step(const gko::int32 *a, const gko::int32 *b, TEST_F(Merging, MergeStep) { - for (auto i = 0; i < rng_runs; ++i) { - init_data(); + for (int i = 0; i < rng_runs; ++i) { + init_data(i); test_merge_step<<<1, config::warp_size>>>(ddata1.get_const_data(), ddata2.get_const_data(), doutdata.get_data()); @@ -184,8 +183,8 @@ __global__ void test_merge(const gko::int32 *a, const gko::int32 *b, int size, TEST_F(Merging, FullMerge) { - for (auto i = 0; i < rng_runs; ++i) { - init_data(); + for (int i = 0; i < rng_runs; ++i) { + init_data(i); for (auto size : sizes) { test_merge<<<1, config::warp_size>>>(ddata1.get_const_data(), ddata2.get_const_data(), size, @@ -209,8 +208,8 @@ __global__ void test_sequential_merge(const gko::int32 *a, const gko::int32 *b, TEST_F(Merging, SequentialFullMerge) { - for (auto i = 0; i < rng_runs; ++i) { - init_data(); + for (int i = 0; i < rng_runs; ++i) { + init_data(i); for (auto size : sizes) { test_sequential_merge<<<1, 1>>>(ddata1.get_const_data(), ddata2.get_const_data(), size, @@ -250,8 +249,8 @@ __global__ void test_merge_idxs(const gko::int32 *a, const gko::int32 *b, TEST_F(Merging, FullMergeIdxs) { - for (auto i = 0; i < rng_runs; ++i) { - init_data(); + for (int i = 0; i < rng_runs; ++i) { + init_data(i); for (auto size : sizes) { test_merge_idxs<<<1, config::warp_size>>>( ddata1.get_const_data(), ddata2.get_const_data(), size, diff --git a/cuda/test/components/searching.cu b/cuda/test/components/searching.cu index da9f89f48ee..80bba023655 100644 --- a/cuda/test/components/searching.cu +++ b/cuda/test/components/searching.cu @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include #include @@ -97,7 +98,7 @@ __device__ void test_assert(bool *success, bool predicate) __global__ void test_binary_search(bool *success, int offset, int size) { - // test binary search on [0, size) + // test binary search on [offset, offset + size) // for all possible partition points auto result = binary_search(offset, size, [&](int i) { // don't access out-of-bounds! @@ -147,7 +148,7 @@ TEST_F(Searching, BinaryEmptyOffset) __global__ void test_sync_binary_search(bool *success, int, int size) { - // test binary search on [0, warp_size) + // test binary search on [0, size) // for all possible partition points auto result = synchronous_binary_search(size, [&](int i) { // don't access out-of-bounds! @@ -187,7 +188,7 @@ TEST_F(Searching, EmptySyncBinary) __global__ void test_warp_ary_search(bool *success, int offset, int size) { - // test binary search on [0, length) + // test binary search on [offset, offset + size) // for all possible partition points auto warp = group::tiled_partition(this_thread_block()); auto result = group_ary_search(offset, size, warp, [&](int i) { @@ -215,7 +216,7 @@ TEST_F(Searching, WarpAryOffset) __global__ void test_warp_wide_search(bool *success, int offset, int size) { - // test binary search on [0, length) + // test binary search on [offset, offset + size) // for all possible partition points auto warp = group::tiled_partition(this_thread_block()); auto result = group_wide_search(offset, size, warp, [&](int i) { diff --git a/hip/components/merging.hip.hpp b/hip/components/merging.hip.hpp index a236784d389..dde43a64885 100644 --- a/hip/components/merging.hip.hpp +++ b/hip/components/merging.hip.hpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GKO_HIP_COMPONENTS_MERGING_CUH_ +#include "hip/base/math.hip.hpp" #include "hip/components/intrinsics.hip.hpp" #include "hip/components/searching.hip.hpp" diff --git a/hip/components/searching.hip.hpp b/hip/components/searching.hip.hpp index 8211121afa4..d2d7e909a12 100644 --- a/hip/components/searching.hip.hpp +++ b/hip/components/searching.hip.hpp @@ -35,7 +35,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "hip/base/config.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/intrinsics.hip.hpp" diff --git a/hip/test/components/merging.hip.cpp b/hip/test/components/merging.hip.cpp index 886a6713d82..fe3650fb504 100644 --- a/hip/test/components/merging.hip.cpp +++ b/hip/test/components/merging.hip.cpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include @@ -66,7 +67,6 @@ class Merging : public ::testing::Test { hip(gko::HipExecutor::create(0, ref)), rng(123456), rng_runs{100}, - rng_run{}, max_size{1637}, sizes{0, 1, 2, 3, 4, 10, 15, 16, 31, 34, 102, 242, 534, 956, 1239, 1637}, @@ -91,17 +91,17 @@ class Merging : public ::testing::Test { doutdata(hip, 2 * max_size) {} - void init_data() + void init_data(int rng_run) { std::uniform_int_distribution dist(0, max_size); std::fill_n(data1.get_data(), max_size, 0); std::fill_n(data2.get_data(), max_size, 0); - for (auto i = 0; i < max_size; ++i) { + for (int i = 0; i < max_size; ++i) { // here we also want to test some corner cases // first two runs: zero data1 - if (rng_run > 0) data1.get_data()[i] = dist(rng); + if (rng_run > 1) data1.get_data()[i] = dist(rng); // first and third run: zero data2 - if (rng_run > 3 || rng_run == 1) data2.get_data()[i] = dist(rng); + if (rng_run > 2 || rng_run == 1) data2.get_data()[i] = dist(rng); } std::sort(data1.get_data(), data1.get_data() + max_size); std::sort(data2.get_data(), data2.get_data() + max_size); @@ -128,7 +128,6 
@@ class Merging : public ::testing::Test { std::default_random_engine rng; int rng_runs; - int rng_run; int max_size; std::vector sizes; gko::Array data1; @@ -164,8 +163,8 @@ __global__ void test_merge_step(const gko::int32 *a, const gko::int32 *b, TEST_F(Merging, MergeStep) { - for (auto i = 0; i < rng_runs; ++i) { - init_data(); + for (int i = 0; i < rng_runs; ++i) { + init_data(i); hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge_step), dim3(1), dim3(config::warp_size), 0, 0, ddata1.get_const_data(), ddata2.get_const_data(), @@ -189,8 +188,8 @@ __global__ void test_merge(const gko::int32 *a, const gko::int32 *b, int size, TEST_F(Merging, FullMerge) { - for (auto i = 0; i < rng_runs; ++i) { - init_data(); + for (int i = 0; i < rng_runs; ++i) { + init_data(i); for (auto size : sizes) { hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge), dim3(1), dim3(config::warp_size), 0, 0, @@ -215,8 +214,8 @@ __global__ void test_sequential_merge(const gko::int32 *a, const gko::int32 *b, TEST_F(Merging, SequentialFullMerge) { - for (auto i = 0; i < rng_runs; ++i) { - init_data(); + for (int i = 0; i < rng_runs; ++i) { + init_data(i); for (auto size : sizes) { hipLaunchKernelGGL(HIP_KERNEL_NAME(test_sequential_merge), dim3(1), dim3(1), 0, 0, ddata1.get_const_data(), @@ -257,8 +256,8 @@ __global__ void test_merge_idxs(const gko::int32 *a, const gko::int32 *b, TEST_F(Merging, FullMergeIdxs) { - for (auto i = 0; i < rng_runs; ++i) { - init_data(); + for (int i = 0; i < rng_runs; ++i) { + init_data(i); for (auto size : sizes) { hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge_idxs), dim3(1), dim3(config::warp_size), 0, 0, diff --git a/hip/test/components/searching.hip.cpp b/hip/test/components/searching.hip.cpp index 53cf4fc6b53..4797a7d6726 100644 --- a/hip/test/components/searching.hip.cpp +++ b/hip/test/components/searching.hip.cpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include #include @@ -102,7 +103,7 @@ __device__ void test_assert(bool *success, bool predicate) __global__ void test_binary_search(bool *success, int offset, int size) { - // test binary search on [0, size) + // test binary search on [offset, offset + size) // for all possible partition points auto result = binary_search(offset, size, [&](int i) { // don't access out-of-bounds! @@ -152,7 +153,7 @@ TEST_F(Searching, BinaryEmptyOffset) __global__ void test_sync_binary_search(bool *success, int, int size) { - // test binary search on [0, warp_size) + // test binary search on [0, size) // for all possible partition points auto result = synchronous_binary_search(size, [&](int i) { // don't access out-of-bounds! @@ -192,7 +193,7 @@ TEST_F(Searching, EmptySyncBinary) __global__ void test_warp_ary_search(bool *success, int offset, int size) { - // test binary search on [0, length) + // test binary search on [offset, offset + size) // for all possible partition points auto warp = tiled_partition(this_thread_block()); auto result = group_ary_search(offset, size, warp, [&](int i) { @@ -220,7 +221,7 @@ TEST_F(Searching, WarpAryOffset) __global__ void test_warp_wide_search(bool *success, int offset, int size) { - // test binary search on [0, length) + // test binary search on [offset, offset + size) // for all possible partition points auto warp = tiled_partition(this_thread_block()); auto result = group_wide_search(offset, size, warp, [&](int i) { From 27cfdb8571ae8ab4396eee2bcf193d768e925850 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 28 Jan 2020 14:24:32 +0100 Subject: [PATCH 7/8] improve merge callback interface --- common/components/intrinsics.hpp.inc | 12 +++++------ common/components/merging.hpp.inc | 20 +++++++++---------- cuda/test/components/merging.cu | 30 ++++++++++++++++------------ hip/test/components/merging.hip.cpp | 30 ++++++++++++++++------------ 4 files changed, 50 insertions(+), 42 deletions(-) diff --git 
a/common/components/intrinsics.hpp.inc b/common/components/intrinsics.hpp.inc index ac88ab02c7f..5fc8b07e1d0 100644 --- a/common/components/intrinsics.hpp.inc +++ b/common/components/intrinsics.hpp.inc @@ -34,10 +34,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * @internal * Returns the number of set bits in the given mask. */ -__device__ int popcnt(uint32 mask) { return __popc(mask); } +inline __device__ int popcnt(uint32 mask) { return __popc(mask); } /** @copydoc popcnt */ -__device__ int popcnt(uint64 mask) { return __popcll(mask); } +inline __device__ int popcnt(uint64 mask) { return __popcll(mask); } /** @@ -45,10 +45,10 @@ __device__ int popcnt(uint64 mask) { return __popcll(mask); } * Returns the (1-based!) index of the first set bit in the given mask, * starting from the least significant bit. */ -__device__ int ffs(uint32 mask) { return __ffs(mask); } +inline __device__ int ffs(uint32 mask) { return __ffs(mask); } /** @copydoc ffs */ -__device__ int ffs(uint64 mask) +inline __device__ int ffs(uint64 mask) { // the cast is necessary, as the overloads defined by HIP are ambiguous return __ffsll(static_cast(mask)); @@ -60,7 +60,7 @@ __device__ int ffs(uint64 mask) * Returns the number of zero bits before the first set bit in the given mask, * starting from the most significant bit. 
*/ -__device__ int clz(uint32 mask) { return __clz(mask); } +inline __device__ int clz(uint32 mask) { return __clz(mask); } /** @copydoc clz */ -__device__ int clz(uint64 mask) { return __clzll(mask); } +inline __device__ int clz(uint64 mask) { return __clzll(mask); } diff --git a/common/components/merging.hpp.inc b/common/components/merging.hpp.inc index adbd60a4873..fd27f9e61b6 100644 --- a/common/components/merging.hpp.inc +++ b/common/components/merging.hpp.inc @@ -133,11 +133,13 @@ __device__ detail::merge_result group_merge_step(ValueType a, * @param b_size the size of the second range * @param group the group that executes the merge * @param merge_fn the callback that is being called for each merged element. - * It takes five parameters: - * `IndexType a_idx, ValueType a_val, - * IndexType b_idx, ValueType b_val, IndexType c_index`. - * `*_val` and `*_idx` are the values resp. the indices of the - * values from a/b being compared at output index `c_index`. + * It takes six parameters: + * `IndexType a_idx, ValueType a_val, IndexType b_idx, + * ValueType b_val, IndexType c_index, bool valid`. + * `*_val` and `*_idx` are the values resp. the indices of the + * values from a/b being compared at output index `c_index`. + * `valid` specifies if the current thread has to merge an + * element (this is necessary for shfl and ballot operations). 
*/ template @@ -154,11 +156,9 @@ __device__ void group_merge(const ValueType *a, IndexType a_size, auto b_cur = detail::checked_load(b, b_begin + lane, a_size); while (c_begin < c_size) { auto merge_result = group_merge_step(a_cur, b_cur, group); - if (c_begin + lane < c_size) { - merge_fn(merge_result.a_idx + a_begin, merge_result.a_val, - merge_result.b_idx + b_begin, merge_result.b_val, - c_begin + lane); - } + merge_fn(merge_result.a_idx + a_begin, merge_result.a_val, + merge_result.b_idx + b_begin, merge_result.b_val, + c_begin + lane, c_begin + lane < c_size); auto a_advance = merge_result.a_advance; auto b_advance = merge_result.b_advance; a_begin += a_advance; diff --git a/cuda/test/components/merging.cu b/cuda/test/components/merging.cu index 28c09510cc9..450785325fe 100644 --- a/cuda/test/components/merging.cu +++ b/cuda/test/components/merging.cu @@ -174,11 +174,13 @@ __global__ void test_merge(const gko::int32 *a, const gko::int32 *b, int size, gko::int32 *c) { auto warp = tiled_partition(this_thread_block()); - group_merge( - a, size, b, size, warp, - [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { - c[i] = min(a_val, b_val); - }); + group_merge(a, size, b, size, warp, + [&](int a_idx, gko::int32 a_val, int b_idx, + gko::int32 b_val, int i, bool valid) { + if (valid) { + c[i] = min(a_val, b_val); + } + }); } TEST_F(Merging, FullMerge) @@ -237,14 +239,16 @@ __global__ void test_merge_idxs(const gko::int32 *a, const gko::int32 *b, }); } auto warp = tiled_partition(this_thread_block()); - group_merge( - a, size, b, size, warp, - [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { - aidxs[i] = a_idx; - bidxs[i] = b_idx; - cidxs[i] = i; - c[i] = min(a_val, b_val); - }); + group_merge(a, size, b, size, warp, + [&](int a_idx, gko::int32 a_val, int b_idx, + gko::int32 b_val, int i, bool valid) { + if (valid) { + aidxs[i] = a_idx; + bidxs[i] = b_idx; + cidxs[i] = i; + c[i] = min(a_val, b_val); + } + }); } 
TEST_F(Merging, FullMergeIdxs) diff --git a/hip/test/components/merging.hip.cpp b/hip/test/components/merging.hip.cpp index fe3650fb504..cde8a89c218 100644 --- a/hip/test/components/merging.hip.cpp +++ b/hip/test/components/merging.hip.cpp @@ -179,11 +179,13 @@ __global__ void test_merge(const gko::int32 *a, const gko::int32 *b, int size, gko::int32 *c) { auto warp = tiled_partition(this_thread_block()); - group_merge( - a, size, b, size, warp, - [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { - c[i] = min(a_val, b_val); - }); + group_merge(a, size, b, size, warp, + [&](int a_idx, gko::int32 a_val, int b_idx, + gko::int32 b_val, int i, bool valid) { + if (valid) { + c[i] = min(a_val, b_val); + } + }); } TEST_F(Merging, FullMerge) @@ -244,14 +246,16 @@ __global__ void test_merge_idxs(const gko::int32 *a, const gko::int32 *b, }); } auto warp = tiled_partition(this_thread_block()); - group_merge( - a, size, b, size, warp, - [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { - aidxs[i] = a_idx; - bidxs[i] = b_idx; - cidxs[i] = i; - c[i] = min(a_val, b_val); - }); + group_merge(a, size, b, size, warp, + [&](int a_idx, gko::int32 a_val, int b_idx, + gko::int32 b_val, int i, bool valid) { + if (valid) { + aidxs[i] = a_idx; + bidxs[i] = b_idx; + cidxs[i] = i; + c[i] = min(a_val, b_val); + } + }); } TEST_F(Merging, FullMergeIdxs) From 142d8917fd5f3efc1ae01d42ab53219aed04f142 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 28 Jan 2020 15:25:38 +0100 Subject: [PATCH 8/8] replace inline by __forceinline__ for intrinsics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Thomas Grützmacher --- common/components/intrinsics.hpp.inc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/common/components/intrinsics.hpp.inc b/common/components/intrinsics.hpp.inc index 5fc8b07e1d0..f89fa434eb4 100644 --- a/common/components/intrinsics.hpp.inc +++ 
b/common/components/intrinsics.hpp.inc @@ -34,10 +34,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * @internal * Returns the number of set bits in the given mask. */ -inline __device__ int popcnt(uint32 mask) { return __popc(mask); } +__forceinline__ __device__ int popcnt(uint32 mask) { return __popc(mask); } /** @copydoc popcnt */ -inline __device__ int popcnt(uint64 mask) { return __popcll(mask); } +__forceinline__ __device__ int popcnt(uint64 mask) { return __popcll(mask); } /** @@ -45,10 +45,10 @@ inline __device__ int popcnt(uint64 mask) { return __popcll(mask); } * Returns the (1-based!) index of the first set bit in the given mask, * starting from the least significant bit. */ -inline __device__ int ffs(uint32 mask) { return __ffs(mask); } +__forceinline__ __device__ int ffs(uint32 mask) { return __ffs(mask); } /** @copydoc ffs */ -inline __device__ int ffs(uint64 mask) +__forceinline__ __device__ int ffs(uint64 mask) { // the cast is necessary, as the overloads defined by HIP are ambiguous return __ffsll(static_cast(mask)); @@ -60,7 +60,7 @@ inline __device__ int ffs(uint64 mask) * Returns the number of zero bits before the first set bit in the given mask, * starting from the most significant bit. */ -inline __device__ int clz(uint32 mask) { return __clz(mask); } +__forceinline__ __device__ int clz(uint32 mask) { return __clz(mask); } /** @copydoc clz */ -inline __device__ int clz(uint64 mask) { return __clzll(mask); } +__forceinline__ __device__ int clz(uint64 mask) { return __clzll(mask); }