ginkgo-project · yhmtsai · Nov 16, 2019 · Oct 28, 2019 · Oct 28, 2019 · Oct 28, 2019
diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake
@@ -53,8 +53,10 @@ function(ginkgo_create_hip_test test_name)
     target_include_directories("${TEST_TARGET_NAME}"
         PRIVATE
             "$<BUILD_INTERFACE:${Ginkgo_BINARY_DIR}>"
-
-            # Only `exception_helpers` requires these so far, but it's much easier
+            # Only `math` requires it so far, but it's much easier
+            # to put these this way.
+            ${GINKGO_HIP_THRUST_PATH}
+            # Only `exception_helpers` requires thess so far, but it's much easier
             # to put these this way.
             ${HIPBLAS_INCLUDE_DIRS}
             ${HIPSPARSE_INCLUDE_DIRS}

diff --git a/common/components/prefix_sum.hpp.inc b/common/components/prefix_sum.hpp.inc
@@ -0,0 +1,125 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2019, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+/**
+ * @internal
+ * First step of the calculation of a prefix sum. Calculates the prefix sum
+ * in-place on parts of the array `elements`.
+ *
+ * @param block_size  thread block size for this kernel, also size of blocks on
+ *                    which this kernel calculates the prefix sum in-place
+ * @param elements  array on which the prefix sum is to be calculated
+ * @param block_sum  array which stores the total sum of each block, requires at
+ *                   least `ceildiv(num_elements, block_size)` elements
+ * @param num_elements  total number of entries in `elements`
+ *
+ * @note To calculate the prefix sum over an array of size bigger than
+ *       `block_size`, `finalize_prefix_sum` has to be used as well.
+ */
+template <int block_size, typename ValueType>
+__global__ __launch_bounds__(block_size) void start_prefix_sum(
+    size_type num_elements, ValueType *__restrict__ elements,
+    ValueType *__restrict__ block_sum)
+{
+    const auto tidx = threadIdx.x + blockDim.x * blockIdx.x;
+    const auto element_id = threadIdx.x;
+    __shared__ size_type prefix_helper[block_size];
+    prefix_helper[element_id] =
+        (tidx < num_elements) ? elements[tidx] : zero<ValueType>();
+    auto this_block = group::this_thread_block();
+    this_block.sync();
+
+    // Do a normal reduction
+#pragma unroll
+    for (int i = 1; i < block_size; i <<= 1) {
+        const auto ai = i * (2 * element_id + 1) - 1;
+        const auto bi = i * (2 * element_id + 2) - 1;
+        if (bi < block_size) {
+            prefix_helper[bi] += prefix_helper[ai];
+        }
+        this_block.sync();
+    }
+
+    if (element_id == 0) {
+        // Store the total sum
+        block_sum[blockIdx.x] = prefix_helper[block_size - 1];
+        prefix_helper[block_size - 1] = zero<ValueType>();
+    }
+
+    this_block.sync();
+
+    // Perform the down-sweep phase to get the true prefix sum
+#pragma unroll
+    for (int i = block_size >> 1; i > 0; i >>= 1) {
+        const auto ai = i * (2 * element_id + 1) - 1;
+        const auto bi = i * (2 * element_id + 2) - 1;
+        if (bi < block_size) {
+            auto tmp = prefix_helper[ai];
+            prefix_helper[ai] = prefix_helper[bi];
+            prefix_helper[bi] += tmp;
+        }
+        this_block.sync();
+    }
+    if (tidx < num_elements) {
+        elements[tidx] = prefix_helper[element_id];
+    }
+}
+
+
+/**
+ * @internal
+ * Second step of the calculation of a prefix sum. Increases the value of each
+ * entry of `elements` by the total sum of all preceding blocks.
+ *
+ * @param block_size  thread block size for this kernel, has to be the same as
+ *                    for `start_prefix_sum`
+ * @param elements  array on which the prefix sum is to be calculated
+ * @param block_sum  array storing the total sum of each block
+ * @param num_elements  total number of entries in `elements`
+ *
+ * @note To calculate a prefix sum, first `start_prefix_sum` has to be called.
+ */
+template <int block_size, typename ValueType>
+__global__ __launch_bounds__(block_size) void finalize_prefix_sum(
+    size_type num_elements, ValueType *__restrict__ elements,
+    const ValueType *__restrict__ block_sum)
+{
+    const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    if (tidx < num_elements) {
+        ValueType prefix_block_sum = zero<ValueType>();
+        for (size_type i = 0; i < blockIdx.x; i++) {
+            prefix_block_sum += block_sum[i];
+        }
+        elements[tidx] += prefix_block_sum;
+    }
+}
diff --git a/common/components/reduction.hpp.inc b/common/components/reduction.hpp.inc
@@ -0,0 +1,178 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2019, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+/**
+ * @internal
+ *
+ * Computes a reduction using the binary operation `reduce_op` on a group
+ * `group`. Each thread contributes with one element `local_data`. The local
+ * thread element is always passed as the first parameter to the `reduce_op`.
+ * The function returns the result of the reduction on all threads.
+ *
+ * @note The function is guaranteed to return the correct value on all threads
+ *       only if `reduce_op` is commutative (in addition to being associative).
+ *       Otherwise, the correct value is returned only to the thread with
+ *       subwarp index 0.
+ */
+template <
+    typename Group, typename ValueType, typename Operator,
+    typename = xstd::enable_if_t<group::is_communicator_group<Group>::value>>
+__device__ __forceinline__ ValueType reduce(const Group &group,
+                                            ValueType local_data,
+                                            Operator reduce_op = Operator{})
+{
+#pragma unroll
+    for (int32 bitmask = 1; bitmask < group.size(); bitmask <<= 1) {
+        const auto remote_data = group.shfl_xor(local_data, bitmask);
+        local_data = reduce_op(local_data, remote_data);
+    }
+    return local_data;
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the index of the thread that has the element with the largest
+ * magnitude among all the threads in the group.
+ * Only the values from threads which set `is_pivoted` to `false` will be
+ * considered.
+ */
+template <
+    typename Group, typename ValueType,
+    typename = xstd::enable_if_t<group::is_communicator_group<Group>::value>>
+__device__ __forceinline__ int choose_pivot(const Group &group,
+                                            ValueType local_data,
+                                            bool is_pivoted)
+{
+    using real = remove_complex<ValueType>;
+    real lmag = is_pivoted ? -one<real>() : abs(local_data);
+    const auto pivot =
+        reduce(group, group.thread_rank(), [&](int lidx, int ridx) {
+            const auto rmag = group.shfl(lmag, ridx);
+            if (rmag > lmag) {
+                lmag = rmag;
+                lidx = ridx;
+            }
+            return lidx;
+        });
+    // pivot operator not commutative, make sure everyone has the same pivot
+    return group.shfl(pivot, 0);
+}
+
+
+/**
+ * @internal
+ *
+ * Computes a reduction using the binary operation `reduce_op` on entire block.
+ * The data for the reduction is taken from the `data` array which has to be of
+ * size `block_size` and accessible from all threads. The `data` array is also
+ * used as work space (so its content will be destroyed in the process), as well
+ * as to store the return value - which is stored in the 0-th position of the
+ * array.
+ */
+template <
+    typename Group, typename ValueType, typename Operator,
+    typename = xstd::enable_if_t<group::is_synchronizable_group<Group>::value>>
+__device__ void reduce(const Group &__restrict__ group,
+                       ValueType *__restrict__ data,
+                       Operator reduce_op = Operator{})
+{
+    const auto local_id = group.thread_rank();
+
+#pragma unroll
+    for (int k = group.size() / 2; k >= config::warp_size; k /= 2) {
+        group.sync();
+        if (local_id < k) {
+            data[local_id] = reduce_op(data[local_id], data[local_id + k]);
+        }
+    }
+
+    const auto warp = group::tiled_partition<config::warp_size>(group);
+    const auto warp_id = group.thread_rank() / warp.size();
+    if (warp_id > 0) {
+        return;
+    }
+    auto result = reduce(warp, data[warp.thread_rank()], reduce_op);
+    if (warp.thread_rank() == 0) {
+        data[0] = result;
+    }
+}
+
+
+/**
+ * @internal
+ *
+ * Computes a reduction using the binary operation `reduce_op` on an array
+ * `source` of any size. Has to be called a second time on `result` to reduce
+ * an array larger than `block_size`.
+ */
+template <typename Operator, typename ValueType>
+__device__ void reduce_array(size_type size,
+                             const ValueType *__restrict__ source,
+                             ValueType *__restrict__ result,
+                             Operator reduce_op = Operator{})
+{
+    const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;
+    auto thread_result = zero<ValueType>();
+    for (auto i = tidx; i < size; i += blockDim.x * gridDim.x) {
+        thread_result = reduce_op(thread_result, source[i]);
+    }
+    result[threadIdx.x] = thread_result;
+
+    group::this_thread_block().sync();
+
+    // Stores the result of the reduction inside `result[0]`
+    reduce(group::this_thread_block(), result, reduce_op);
+}
+
+
+/**
+ * @internal
+ *
+ * Computes a reduction using the add operation (+) on an array
+ * `source` of any size. Has to be called a second time on `result` to reduce
+ * an array larger than `default_block_size`.
+ */
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void reduce_add_array(
+    size_type size, const ValueType *__restrict__ source,
+    ValueType *__restrict__ result)
+{
+    __shared__ UninitializedArray<ValueType, default_block_size> block_sum;
+    reduce_array(size, source, static_cast<ValueType *>(block_sum),
+                 [](const ValueType &x, const ValueType &y) { return x + y; });
+
+    if (threadIdx.x == 0) {
+        result[blockIdx.x] = block_sum[0];
+    }
+}