Merge add CUDA, HIP and OMP kernels for ISAI
This PR adds ISAI kernels for the CUDA, HIP and OpenMP executors.


Related PR: #502
Thomas Grützmacher authored Apr 24, 2020
2 parents 8a7ece4 + 845853d commit 90db75d
Showing 12 changed files with 932 additions and 6 deletions.
3 changes: 3 additions & 0 deletions common/components/uninitialized_array.hpp.inc
@@ -34,6 +34,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/**
* Stores an array with uninitialized contents.
*
* This class is needed for data types that have a non-empty constructor when
* using them as shared memory, for example `thrust::complex<float>`.
*
* @tparam ValueType the type of values
* @tparam size the size of the array
*/
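For illustration only (this sketch is not part of the diff): the idea behind such a wrapper is to back the array with raw, suitably aligned bytes, so that placing it in `__shared__` memory never runs the `ValueType` constructor. The class name and members below are hypothetical, not the actual Ginkgo implementation.

template <typename ValueType, int size>
class uninitialized_array_sketch {
public:
    // Reinterpret the raw storage as ValueType; no constructor has run.
    __device__ ValueType *data() noexcept
    {
        return reinterpret_cast<ValueType *>(bytes_);
    }
    __device__ ValueType &operator[](int i) noexcept { return data()[i]; }

private:
    // Raw bytes with the alignment of ValueType, so a __shared__ instance
    // needs no default construction of its elements.
    alignas(ValueType) unsigned char bytes_[sizeof(ValueType) * size];
};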
182 changes: 182 additions & 0 deletions common/preconditioner/isai_kernels.hpp.inc
@@ -29,3 +29,185 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
******************************<GINKGO LICENSE>*******************************/

namespace kernel {


/**
* @internal
*
* This kernel supports at most `subwarp_size` (<= `warp_size`) elements per row.
* If there are more elements, they are simply ignored. Only the first
* `subwarp_size` elements are considered both for the values and for the
* sparsity pattern.
*/
template <int subwarp_size, int subwarps_per_block, typename ValueType,
typename IndexType, typename Callable>
__forceinline__ __device__ void generic_generate(
IndexType num_rows, const IndexType *__restrict__ m_row_ptrs,
const IndexType *__restrict__ m_col_idxs,
const ValueType *__restrict__ m_values, IndexType *__restrict__ i_row_ptrs,
IndexType *__restrict__ i_col_idxs, ValueType *__restrict__ i_values,
Callable trs_solve)
{
static_assert((subwarp_size & (subwarp_size - 1)) == 0,
"subwarp_size must be a power of 2");

const auto gidx = thread::get_thread_id_flat<IndexType>();
const auto row = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
const int tidx = threadIdx.x;

auto this_block = group::this_thread_block();
auto subwarp = group::tiled_partition<subwarp_size>(this_block);
const int local_id = subwarp.thread_rank();

__shared__ UninitializedArray<ValueType, subwarp_size * subwarp_size *
subwarps_per_block>
trisystem;

const int end_trisystem = blockDim.x * subwarp_size;
const int trisystem_start =
(tidx / subwarp_size) * subwarp_size * subwarp_size;

// Allow at most subwarp_size * subwarp_size elements in a subwarp trisystem
for (int i = 0; i < subwarp_size; ++i) {
trisystem[trisystem_start + local_id + i * subwarp_size] =
zero<ValueType>();
}
this_block.sync();

if (row >= num_rows) {
return;
}

const auto i_row_begin = i_row_ptrs[row];
const auto i_row_end = i_row_ptrs[row + 1] - i_row_begin < subwarp_size
? i_row_ptrs[row + 1]
: i_row_begin + subwarp_size;
const auto num_row_elems = i_row_end - i_row_begin;

if (local_id < num_row_elems) {
const auto col = i_col_idxs[i_row_begin + local_id];
const auto m_row_end = m_row_ptrs[col + 1];
auto m_row_ptr = m_row_ptrs[col];
auto i_row_ptr = i_row_begin;
int idx{trisystem_start + local_id}; // write to trisystem in row major

while (m_row_ptr < m_row_end && i_row_ptr < i_row_end &&
idx < end_trisystem) {
const auto sparsity_col = i_col_idxs[i_row_ptr];
const auto m_col = m_col_idxs[m_row_ptr];
if (sparsity_col == m_col) {
trisystem[idx] = m_values[m_row_ptr];
idx += subwarp_size;
++m_row_ptr;
++i_row_ptr;
} else if (m_col < sparsity_col) {
++m_row_ptr;
} else {
++i_row_ptr;
idx += subwarp_size;
}
}
}

this_block.sync();

// Now, read a full col of `trisystem` into local registers, which will be
// row elements after this (implicit) transpose
ValueType local_row[subwarp_size];
const auto trisystem_col_start = trisystem_start + local_id * subwarp_size;
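    // Note (assumption, not stated in the source): the (local_id + i) %
    // subwarp_size staggering below starts each thread at a different offset,
    // presumably to avoid shared-memory bank conflicts; every thread still
    // reads the same subwarp_size values of its part of `trisystem`.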
for (int i = 0; i < subwarp_size; ++i) {
const auto intermediate_idx = local_id + i;
const auto coalesced_idx = intermediate_idx % subwarp_size;
local_row[coalesced_idx] =
trisystem[trisystem_col_start + coalesced_idx];
}

const auto rhs = trs_solve(num_row_elems, local_row, subwarp);

// Write back:
if (local_id < num_row_elems) {
const auto idx = i_row_begin + local_id;
if (gko::isfinite(rhs)) {
i_values[idx] = rhs;
} else {
i_values[idx] =
i_col_idxs[idx] == row ? one<ValueType>() : zero<ValueType>();
}
}
}


template <int subwarp_size, int subwarps_per_block, typename ValueType,
typename IndexType>
__global__
__launch_bounds__(subwarp_size *subwarps_per_block) void generate_l_inverse(
IndexType num_rows, const IndexType *__restrict__ m_row_ptrs,
const IndexType *__restrict__ m_col_idxs,
const ValueType *__restrict__ m_values, IndexType *__restrict__ i_row_ptrs,
IndexType *__restrict__ i_col_idxs, ValueType *__restrict__ i_values)
{
auto trs_solve = [](IndexType num_elems,
const ValueType *__restrict__ local_row,
group::thread_block_tile<subwarp_size> &subwarp) {
const int local_id = subwarp.thread_rank();
ValueType rhs =
local_id == num_elems - 1 ? one<ValueType>() : zero<ValueType>();
// Solve Triangular system
for (int d_col = num_elems - 1; d_col >= 0; --d_col) {
const auto elem = local_row[d_col];
if (d_col == local_id) {
rhs /= elem;
}

const ValueType bot = subwarp.shfl(rhs, d_col);
if (local_id < d_col) {
rhs -= bot * elem;
}
}

return rhs;
};
generic_generate<subwarp_size, subwarps_per_block>(
num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs, i_col_idxs,
i_values, trs_solve);
}


template <int subwarp_size, int subwarps_per_block, typename ValueType,
typename IndexType>
__global__
__launch_bounds__(subwarp_size *subwarps_per_block) void generate_u_inverse(
IndexType num_rows, const IndexType *__restrict__ m_row_ptrs,
const IndexType *__restrict__ m_col_idxs,
const ValueType *__restrict__ m_values, IndexType *__restrict__ i_row_ptrs,
IndexType *__restrict__ i_col_idxs, ValueType *__restrict__ i_values)
{
auto trs_solve = [](IndexType num_elems,
const ValueType *__restrict__ local_row,
group::thread_block_tile<subwarp_size> &subwarp) {
const int local_id = subwarp.thread_rank();
ValueType rhs = local_id == 0 ? one<ValueType>() : zero<ValueType>();
// Solve Triangular system
for (int d_col = 0; d_col < num_elems; ++d_col) {
const auto elem = local_row[d_col];
if (d_col == local_id) {
rhs /= elem;
}

const ValueType top = subwarp.shfl(rhs, d_col);
if (d_col < local_id) {
rhs -= top * elem;
}
}

return rhs;
};
generic_generate<subwarp_size, subwarps_per_block>(
num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs, i_col_idxs,
i_values, trs_solve);
}


} // namespace kernel
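To make the arithmetic inside `trs_solve` easier to follow, here is a sequential sketch (illustrative only, not part of the diff; the helper name and the use of `double` are placeholders). For one row of the lower ISAI with sparsity pattern S, the subwarp effectively gathers the dense submatrix T = M(S, S) into `trisystem` and solves T^T x = e_k, where k is the position of the diagonal entry, i.e. the last pattern entry of a lower-triangular row; the `subwarp.shfl(rhs, d_col)` broadcast plays the role of the inner update loop below.

#include <vector>

// Hypothetical sequential reference for one row of the lower ISAI.
// T is the dense, row-major submatrix M(S, S); the returned x holds the
// values of that row of the approximate inverse, ordered like the pattern S.
std::vector<double> solve_lower_isai_row(const std::vector<std::vector<double>> &T)
{
    const int n = static_cast<int>(T.size());
    std::vector<double> x(n, 0.0);
    x[n - 1] = 1.0;  // right-hand side e_k with k = n - 1 (diagonal position)
    // T^T is upper triangular, so eliminate from the last column backwards;
    // this mirrors the d_col loop in the kernel's trs_solve lambda.
    for (int col = n - 1; col >= 0; --col) {
        x[col] /= T[col][col];
        for (int row = 0; row < col; ++row) {
            x[row] -= T[col][row] * x[col];  // (T^T)(row, col) = T(col, row)
        }
    }
    return x;
}

The upper variant in `generate_u_inverse` mirrors this with a right-hand side of e_0 and a forward sweep over the columns.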
55 changes: 53 additions & 2 deletions cuda/preconditioner/isai_kernels.cu
@@ -34,11 +34,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include <ginkgo/core/base/exception_helpers.hpp>
#include <ginkgo/core/base/executor.hpp>
#include <ginkgo/core/matrix/csr.hpp>


#include "core/matrix/csr_builder.hpp"
#include "cuda/base/config.hpp"
#include "cuda/base/math.hpp"
#include "cuda/base/types.hpp"
#include "cuda/components/cooperative_groups.cuh"
#include "cuda/components/thread_ids.cuh"
#include "cuda/components/uninitialized_array.hpp"


namespace gko {
@@ -59,11 +65,36 @@ namespace {
} // namespace


constexpr int subwarp_size{config::warp_size};
constexpr int subwarps_per_block{2};
constexpr int default_block_size{subwarps_per_block * subwarp_size};


template <typename ValueType, typename IndexType>
void generate_l_inverse(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType> *l_csr,
matrix::Csr<ValueType, IndexType> *inverse_l)
GKO_NOT_IMPLEMENTED;
{
const auto nnz = l_csr->get_num_stored_elements();
const auto num_rows = l_csr->get_size()[0];

exec->copy_from(exec.get(), nnz, l_csr->get_const_col_idxs(),
inverse_l->get_col_idxs());
exec->copy_from(exec.get(), num_rows + 1, l_csr->get_const_row_ptrs(),
inverse_l->get_row_ptrs());


const dim3 block(default_block_size, 1, 1);
const dim3 grid(ceildiv(num_rows, block.x / config::warp_size), 1, 1);
kernel::generate_l_inverse<subwarp_size, subwarps_per_block>
<<<grid, block>>>(
static_cast<IndexType>(num_rows), l_csr->get_const_row_ptrs(),
l_csr->get_const_col_idxs(),
as_cuda_type(l_csr->get_const_values()), inverse_l->get_row_ptrs(),
inverse_l->get_col_idxs(), as_cuda_type(inverse_l->get_values()));
// Call make_srow()
matrix::CsrBuilder<ValueType, IndexType> builder(inverse_l);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_ISAI_GENERATE_L_INVERSE_KERNEL);
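As a quick sanity check on the launch configuration above, here is the arithmetic with concrete numbers (a sketch assuming `config::warp_size == 32` on CUDA; the `example_*` names are illustrative only):

constexpr int example_warp_size = 32;                      // config::warp_size
constexpr int example_block_size = 2 * example_warp_size;  // 64 threads per block
constexpr int example_rows_per_block =
    example_block_size / example_warp_size;                 // one warp per row -> 2 rows
static_assert(example_rows_per_block == 2,
              "each block processes two rows of the system matrix");
// For a matrix with 1000 rows: grid.x = ceildiv(1000, 2) = 500 blocks.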
@@ -73,7 +104,27 @@ template <typename ValueType, typename IndexType>
void generate_u_inverse(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType> *u_csr,
matrix::Csr<ValueType, IndexType> *inverse_u)
GKO_NOT_IMPLEMENTED;
{
const auto nnz = u_csr->get_num_stored_elements();
const auto num_rows = u_csr->get_size()[0];

exec->copy_from(exec.get(), nnz, u_csr->get_const_col_idxs(),
inverse_u->get_col_idxs());
exec->copy_from(exec.get(), num_rows + 1, u_csr->get_const_row_ptrs(),
inverse_u->get_row_ptrs());


const dim3 block(default_block_size, 1, 1);
const dim3 grid(ceildiv(num_rows, block.x / config::warp_size), 1, 1);
kernel::generate_u_inverse<subwarp_size, subwarps_per_block>
<<<grid, block>>>(
static_cast<IndexType>(num_rows), u_csr->get_const_row_ptrs(),
u_csr->get_const_col_idxs(),
as_cuda_type(u_csr->get_const_values()), inverse_u->get_row_ptrs(),
inverse_u->get_col_idxs(), as_cuda_type(inverse_u->get_values()));
// Call make_srow()
matrix::CsrBuilder<ValueType, IndexType> builder(inverse_u);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_ISAI_GENERATE_U_INVERSE_KERNEL);
1 change: 1 addition & 0 deletions cuda/test/preconditioner/CMakeLists.txt
@@ -1 +1,2 @@
ginkgo_create_test(jacobi_kernels)
ginkgo_create_test(isai_kernels)
