Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

port cuda matrix into hip #373

Merged
merged 17 commits into from
Nov 16, 2019
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions cmake/create_test.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,10 @@ function(ginkgo_create_hip_test test_name)
target_include_directories("${TEST_TARGET_NAME}"
PRIVATE
"$<BUILD_INTERFACE:${Ginkgo_BINARY_DIR}>"

# Only `exception_helpers` requires these so far, but it's much easier
# Only `math` requires it so far, but it's much easier
# to put these this way.
${GINKGO_HIP_THRUST_PATH}
# Only `exception_helpers` requires thess so far, but it's much easier
# to put these this way.
${HIPBLAS_INCLUDE_DIRS}
${HIPSPARSE_INCLUDE_DIRS}
Expand Down
125 changes: 125 additions & 0 deletions common/components/prefix_sum.hpp.inc
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
/*******************************<GINKGO LICENSE>******************************
Copyright (c) 2017-2019, the Ginkgo authors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
******************************<GINKGO LICENSE>*******************************/

/**
* @internal
* First step of the calculation of a prefix sum. Calculates the prefix sum
* in-place on parts of the array `elements`.
*
* @param block_size thread block size for this kernel, also size of blocks on
* which this kernel calculates the prefix sum in-place
* @param elements array on which the prefix sum is to be calculated
* @param block_sum array which stores the total sum of each block, requires at
* least `ceildiv(num_elements, block_size)` elements
* @param num_elements total number of entries in `elements`
*
* @note To calculate the prefix sum over an array of size bigger than
* `block_size`, `finalize_prefix_sum` has to be used as well.
*/
template <int block_size, typename ValueType>
__global__ __launch_bounds__(block_size) void start_prefix_sum(
size_type num_elements, ValueType *__restrict__ elements,
ValueType *__restrict__ block_sum)
{
const auto tidx = threadIdx.x + blockDim.x * blockIdx.x;
const auto element_id = threadIdx.x;
__shared__ size_type prefix_helper[block_size];
prefix_helper[element_id] =
(tidx < num_elements) ? elements[tidx] : zero<ValueType>();
auto this_block = group::this_thread_block();
this_block.sync();

// Do a normal reduction
#pragma unroll
for (int i = 1; i < block_size; i <<= 1) {
const auto ai = i * (2 * element_id + 1) - 1;
const auto bi = i * (2 * element_id + 2) - 1;
if (bi < block_size) {
prefix_helper[bi] += prefix_helper[ai];
}
this_block.sync();
}

if (element_id == 0) {
// Store the total sum
block_sum[blockIdx.x] = prefix_helper[block_size - 1];
prefix_helper[block_size - 1] = zero<ValueType>();
}

this_block.sync();

// Perform the down-sweep phase to get the true prefix sum
#pragma unroll
for (int i = block_size >> 1; i > 0; i >>= 1) {
const auto ai = i * (2 * element_id + 1) - 1;
const auto bi = i * (2 * element_id + 2) - 1;
if (bi < block_size) {
auto tmp = prefix_helper[ai];
prefix_helper[ai] = prefix_helper[bi];
prefix_helper[bi] += tmp;
}
this_block.sync();
}
if (tidx < num_elements) {
elements[tidx] = prefix_helper[element_id];
}
}


/**
* @internal
* Second step of the calculation of a prefix sum. Increases the value of each
* entry of `elements` by the total sum of all preceding blocks.
*
* @param block_size thread block size for this kernel, has to be the same as
* for `start_prefix_sum`
* @param elements array on which the prefix sum is to be calculated
* @param block_sum array storing the total sum of each block
* @param num_elements total number of entries in `elements`
*
* @note To calculate a prefix sum, first `start_prefix_sum` has to be called.
*/
template <int block_size, typename ValueType>
__global__ __launch_bounds__(block_size) void finalize_prefix_sum(
size_type num_elements, ValueType *__restrict__ elements,
const ValueType *__restrict__ block_sum)
{
const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;

if (tidx < num_elements) {
ValueType prefix_block_sum = zero<ValueType>();
for (size_type i = 0; i < blockIdx.x; i++) {
prefix_block_sum += block_sum[i];
}
elements[tidx] += prefix_block_sum;
}
}
178 changes: 178 additions & 0 deletions common/components/reduction.hpp.inc
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
/*******************************<GINKGO LICENSE>******************************
Copyright (c) 2017-2019, the Ginkgo authors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
******************************<GINKGO LICENSE>*******************************/

/**
* @internal
*
* Computes a reduction using the binary operation `reduce_op` on a group
* `group`. Each thread contributes with one element `local_data`. The local
* thread element is always passed as the first parameter to the `reduce_op`.
* The function returns the result of the reduction on all threads.
*
* @note The function is guaranteed to return the correct value on all threads
* only if `reduce_op` is commutative (in addition to being associative).
* Otherwise, the correct value is returned only to the thread with
* subwarp index 0.
*/
template <
typename Group, typename ValueType, typename Operator,
typename = xstd::enable_if_t<group::is_communicator_group<Group>::value>>
__device__ __forceinline__ ValueType reduce(const Group &group,
ValueType local_data,
Operator reduce_op = Operator{})
{
#pragma unroll
for (int32 bitmask = 1; bitmask < group.size(); bitmask <<= 1) {
const auto remote_data = group.shfl_xor(local_data, bitmask);
local_data = reduce_op(local_data, remote_data);
}
return local_data;
}


/**
* @internal
*
* Returns the index of the thread that has the element with the largest
* magnitude among all the threads in the group.
* Only the values from threads which set `is_pivoted` to `false` will be
* considered.
*/
template <
typename Group, typename ValueType,
typename = xstd::enable_if_t<group::is_communicator_group<Group>::value>>
__device__ __forceinline__ int choose_pivot(const Group &group,
ValueType local_data,
bool is_pivoted)
{
using real = remove_complex<ValueType>;
real lmag = is_pivoted ? -one<real>() : abs(local_data);
const auto pivot =
reduce(group, group.thread_rank(), [&](int lidx, int ridx) {
const auto rmag = group.shfl(lmag, ridx);
if (rmag > lmag) {
lmag = rmag;
lidx = ridx;
}
return lidx;
});
// pivot operator not commutative, make sure everyone has the same pivot
return group.shfl(pivot, 0);
}


/**
* @internal
*
* Computes a reduction using the binary operation `reduce_op` on entire block.
* The data for the reduction is taken from the `data` array which has to be of
* size `block_size` and accessible from all threads. The `data` array is also
* used as work space (so its content will be destroyed in the process), as well
* as to store the return value - which is stored in the 0-th position of the
* array.
*/
template <
typename Group, typename ValueType, typename Operator,
typename = xstd::enable_if_t<group::is_synchronizable_group<Group>::value>>
__device__ void reduce(const Group &__restrict__ group,
ValueType *__restrict__ data,
Operator reduce_op = Operator{})
{
const auto local_id = group.thread_rank();

#pragma unroll
for (int k = group.size() / 2; k >= config::warp_size; k /= 2) {
group.sync();
if (local_id < k) {
data[local_id] = reduce_op(data[local_id], data[local_id + k]);
}
}

const auto warp = group::tiled_partition<config::warp_size>(group);
const auto warp_id = group.thread_rank() / warp.size();
if (warp_id > 0) {
return;
}
auto result = reduce(warp, data[warp.thread_rank()], reduce_op);
if (warp.thread_rank() == 0) {
data[0] = result;
}
}


/**
* @internal
*
* Computes a reduction using the binary operation `reduce_op` on an array
* `source` of any size. Has to be called a second time on `result` to reduce
* an array larger than `block_size`.
*/
template <typename Operator, typename ValueType>
__device__ void reduce_array(size_type size,
const ValueType *__restrict__ source,
ValueType *__restrict__ result,
Operator reduce_op = Operator{})
{
const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;
auto thread_result = zero<ValueType>();
for (auto i = tidx; i < size; i += blockDim.x * gridDim.x) {
thread_result = reduce_op(thread_result, source[i]);
}
result[threadIdx.x] = thread_result;

group::this_thread_block().sync();

// Stores the result of the reduction inside `result[0]`
reduce(group::this_thread_block(), result, reduce_op);
}


/**
* @internal
*
* Computes a reduction using the add operation (+) on an array
* `source` of any size. Has to be called a second time on `result` to reduce
* an array larger than `default_block_size`.
*/
template <typename ValueType>
__global__ __launch_bounds__(default_block_size) void reduce_add_array(
size_type size, const ValueType *__restrict__ source,
ValueType *__restrict__ result)
{
__shared__ UninitializedArray<ValueType, default_block_size> block_sum;
reduce_array(size, source, static_cast<ValueType *>(block_sum),
[](const ValueType &x, const ValueType &y) { return x + y; });

if (threadIdx.x == 0) {
result[blockIdx.x] = block_sum[0];
}
}
Loading