Skip to content

Commit

Permalink
add GPU permutation kernels
Browse files Browse the repository at this point in the history
  • Loading branch information
upsj committed Oct 25, 2020
1 parent b854c30 commit 0483978
Show file tree
Hide file tree
Showing 11 changed files with 580 additions and 254 deletions.
117 changes: 117 additions & 0 deletions common/matrix/csr_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -946,3 +946,120 @@ __global__ __launch_bounds__(default_block_size) void conjugate_kernel(


} // namespace


template <typename IndexType>
__global__ __launch_bounds__(default_block_size) void inv_permutation_kernel(
size_type size, const IndexType *__restrict__ permutation,
IndexType *__restrict__ inv_permutation)
{
auto tid = thread::get_thread_id_flat();
if (tid >= size) {
return;
}
inv_permutation[permutation[tid]] = tid;
}


template <typename ValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void col_permute_kernel(
size_type num_rows, size_type num_nonzeros,
const IndexType *__restrict__ permutation,
const IndexType *__restrict__ in_row_ptrs,
const IndexType *__restrict__ in_cols,
const ValueType *__restrict__ in_vals, IndexType *__restrict__ out_row_ptrs,
IndexType *__restrict__ out_cols, ValueType *__restrict__ out_vals)
{
auto tid = thread::get_thread_id_flat();
if (tid < num_nonzeros) {
out_cols[tid] = permutation[in_cols[tid]];
out_vals[tid] = in_vals[tid];
}
if (tid <= num_rows) {
out_row_ptrs[tid] = in_row_ptrs[tid];
}
}


template <typename IndexType>
__global__ __launch_bounds__(default_block_size) void row_ptr_permute_kernel(
size_type num_rows, const IndexType *__restrict__ permutation,
const IndexType *__restrict__ in_row_ptrs, IndexType *__restrict__ out_nnz)
{
auto tid = thread::get_thread_id_flat();
if (tid >= num_rows) {
return;
}
auto in_row = permutation[tid];
auto out_row = tid;
out_nnz[out_row] = in_row_ptrs[in_row + 1] - in_row_ptrs[in_row];
}


template <typename IndexType>
__global__
__launch_bounds__(default_block_size) void inv_row_ptr_permute_kernel(
size_type num_rows, const IndexType *__restrict__ permutation,
const IndexType *__restrict__ in_row_ptrs,
IndexType *__restrict__ out_nnz)
{
auto tid = thread::get_thread_id_flat();
if (tid >= num_rows) {
return;
}
auto in_row = tid;
auto out_row = permutation[tid];
out_nnz[out_row] = in_row_ptrs[in_row + 1] - in_row_ptrs[in_row];
}


template <int subwarp_size, typename ValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void row_permute_kernel(
size_type num_rows, const IndexType *__restrict__ permutation,
const IndexType *__restrict__ in_row_ptrs,
const IndexType *__restrict__ in_cols,
const ValueType *__restrict__ in_vals,
const IndexType *__restrict__ out_row_ptrs,
IndexType *__restrict__ out_cols, ValueType *__restrict__ out_vals)
{
auto tid = thread::get_subwarp_id_flat<subwarp_size>();
if (tid >= num_rows) {
return;
}
auto lane = threadIdx.x % subwarp_size;
auto in_row = permutation[tid];
auto out_row = tid;
auto in_begin = in_row_ptrs[in_row];
auto in_size = in_row_ptrs[in_row + 1] - in_begin;
auto out_begin = out_row_ptrs[out_row];
for (IndexType i = lane; i < in_size; i += subwarp_size) {
out_cols[out_begin + i] = in_cols[in_begin + i];
out_vals[out_begin + i] = in_vals[in_begin + i];
}
}


template <int subwarp_size, typename ValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void inv_row_permute_kernel(
size_type num_rows, const IndexType *__restrict__ permutation,
const IndexType *__restrict__ in_row_ptrs,
const IndexType *__restrict__ in_cols,
const ValueType *__restrict__ in_vals,
const IndexType *__restrict__ out_row_ptrs,
IndexType *__restrict__ out_cols, ValueType *__restrict__ out_vals)
{
auto tid = thread::get_subwarp_id_flat<subwarp_size>();
if (tid >= num_rows) {
return;
}
auto lane = threadIdx.x % subwarp_size;
auto in_row = tid;
auto out_row = permutation[tid];
auto in_begin = in_row_ptrs[in_row];
auto in_size = in_row_ptrs[in_row + 1] - in_begin;
auto out_begin = out_row_ptrs[out_row];
for (IndexType i = lane; i < in_size; i += subwarp_size) {
out_cols[out_begin + i] = in_cols[in_begin + i];
out_vals[out_begin + i] = in_vals[in_begin + i];
}
}
11 changes: 5 additions & 6 deletions core/device_hooks/common_kernels.inc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -647,22 +647,21 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);

template <typename ValueType, typename IndexType>
GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL(ValueType, IndexType)
GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType)
GKO_NOT_COMPILED(GKO_HOOK_MODULE);
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL);
GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL);

template <typename ValueType, typename IndexType>
GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL(ValueType, IndexType)
GKO_NOT_COMPILED(GKO_HOOK_MODULE);
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL);

template <typename ValueType, typename IndexType>
GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType)
template <typename IndexType>
GKO_DECLARE_INVERT_PERMUTATION_KERNEL(IndexType)
GKO_NOT_COMPILED(GKO_HOOK_MODULE);
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL);
GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_INVERT_PERMUTATION_KERNEL);

template <typename ValueType, typename IndexType>
GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL(ValueType, IndexType)
Expand Down
24 changes: 16 additions & 8 deletions core/matrix/csr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,9 @@ GKO_REGISTER_OPERATION(convert_to_hybrid, csr::convert_to_hybrid);
GKO_REGISTER_OPERATION(transpose, csr::transpose);
GKO_REGISTER_OPERATION(conj_transpose, csr::conj_transpose);
GKO_REGISTER_OPERATION(row_permute, csr::row_permute);
GKO_REGISTER_OPERATION(column_permute, csr::column_permute);
GKO_REGISTER_OPERATION(inverse_row_permute, csr::inverse_row_permute);
GKO_REGISTER_OPERATION(inverse_column_permute, csr::inverse_column_permute);
GKO_REGISTER_OPERATION(invert_permutation, csr::invert_permutation);
GKO_REGISTER_OPERATION(calculate_max_nnz_per_row,
csr::calculate_max_nnz_per_row);
GKO_REGISTER_OPERATION(calculate_nonzeros_per_row,
Expand Down Expand Up @@ -394,8 +394,8 @@ std::unique_ptr<LinOp> Csr<ValueType, IndexType>::row_permute(
Csr::create(exec, this->get_size(), this->get_num_stored_elements(),
this->get_strategy());

exec->run(
csr::make_row_permute(permutation_indices, this, permute_cpy.get()));
exec->run(csr::make_row_permute(permutation_indices->get_const_data(), this,
permute_cpy.get()));
permute_cpy->make_srow();
return std::move(permute_cpy);
}
Expand All @@ -410,10 +410,15 @@ std::unique_ptr<LinOp> Csr<ValueType, IndexType>::column_permute(
auto permute_cpy =
Csr::create(exec, this->get_size(), this->get_num_stored_elements(),
this->get_strategy());
Array<IndexType> inv_permutation(exec, this->get_size()[1]);

exec->run(
csr::make_column_permute(permutation_indices, this, permute_cpy.get()));
exec->run(csr::make_invert_permutation(
this->get_size()[1], permutation_indices->get_const_data(),
inv_permutation.get_data()));
exec->run(csr::make_inverse_column_permute(inv_permutation.get_const_data(),
this, permute_cpy.get()));
permute_cpy->make_srow();
permute_cpy->sort_by_column_index();
return std::move(permute_cpy);
}

Expand All @@ -429,8 +434,9 @@ std::unique_ptr<LinOp> Csr<ValueType, IndexType>::inverse_row_permute(
Csr::create(exec, this->get_size(), this->get_num_stored_elements(),
this->get_strategy());

exec->run(csr::make_inverse_row_permute(inverse_permutation_indices, this,
inverse_permute_cpy.get()));
exec->run(csr::make_inverse_row_permute(
inverse_permutation_indices->get_const_data(), this,
inverse_permute_cpy.get()));
inverse_permute_cpy->make_srow();
return std::move(inverse_permute_cpy);
}
Expand All @@ -448,8 +454,10 @@ std::unique_ptr<LinOp> Csr<ValueType, IndexType>::inverse_column_permute(
this->get_strategy());

exec->run(csr::make_inverse_column_permute(
inverse_permutation_indices, this, inverse_permute_cpy.get()));
inverse_permutation_indices->get_const_data(), this,
inverse_permute_cpy.get()));
inverse_permute_cpy->make_srow();
inverse_permute_cpy->sort_by_column_index();
return std::move(inverse_permute_cpy);
}

Expand Down
21 changes: 10 additions & 11 deletions core/matrix/csr_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,29 +131,28 @@ namespace kernels {

#define GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL(ValueType, IndexType) \
void row_permute(std::shared_ptr<const DefaultExecutor> exec, \
const Array<IndexType> *permutation_indices, \
const IndexType *permutation_indices, \
const matrix::Csr<ValueType, IndexType> *orig, \
matrix::Csr<ValueType, IndexType> *row_permuted)

#define GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL(ValueType, IndexType) \
void column_permute(std::shared_ptr<const DefaultExecutor> exec, \
const Array<IndexType> *permutation_indices, \
const matrix::Csr<ValueType, IndexType> *orig, \
matrix::Csr<ValueType, IndexType> *column_permuted)

#define GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL(ValueType, IndexType) \
void inverse_row_permute(std::shared_ptr<const DefaultExecutor> exec, \
const Array<IndexType> *permutation_indices, \
const IndexType *permutation_indices, \
const matrix::Csr<ValueType, IndexType> *orig, \
matrix::Csr<ValueType, IndexType> *row_permuted)

#define GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType) \
void inverse_column_permute( \
std::shared_ptr<const DefaultExecutor> exec, \
const Array<IndexType> *permutation_indices, \
const IndexType *permutation_indices, \
const matrix::Csr<ValueType, IndexType> *orig, \
matrix::Csr<ValueType, IndexType> *column_permuted)

#define GKO_DECLARE_INVERT_PERMUTATION_KERNEL(IndexType) \
void invert_permutation( \
std::shared_ptr<const DefaultExecutor> exec, size_type size, \
const IndexType *permutation_indices, IndexType *inv_permutation)

#define GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL(ValueType, IndexType) \
void calculate_max_nnz_per_row( \
std::shared_ptr<const DefaultExecutor> exec, \
Expand Down Expand Up @@ -210,11 +209,11 @@ namespace kernels {
template <typename ValueType, typename IndexType> \
GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType); \
template <typename IndexType> \
GKO_DECLARE_INVERT_PERMUTATION_KERNEL(IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
Expand Down
75 changes: 58 additions & 17 deletions cuda/matrix/csr_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1143,45 +1143,86 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL);


template <typename IndexType>
void invert_permutation(std::shared_ptr<const DefaultExecutor> exec,
size_type size, const IndexType *permutation_indices,
IndexType *inv_permutation)
{
auto num_blocks = ceildiv(size, default_block_size);
inv_permutation_kernel<<<num_blocks, default_block_size>>>(
size, permutation_indices, inv_permutation);
}

GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_INVERT_PERMUTATION_KERNEL);


template <typename ValueType, typename IndexType>
void row_permute(std::shared_ptr<const CudaExecutor> exec,
const Array<IndexType> *permutation_indices,
const IndexType *perm,
const matrix::Csr<ValueType, IndexType> *orig,
matrix::Csr<ValueType, IndexType> *row_permuted)
GKO_NOT_IMPLEMENTED;
{
auto num_rows = orig->get_size()[0];
auto count_num_blocks = ceildiv(num_rows, default_block_size);
row_ptr_permute_kernel<<<count_num_blocks, default_block_size>>>(
num_rows, perm, orig->get_const_row_ptrs(),
row_permuted->get_row_ptrs());
components::prefix_sum(exec, row_permuted->get_row_ptrs(), num_rows + 1);
auto copy_num_blocks =
ceildiv(num_rows, default_block_size / config::warp_size);
row_permute_kernel<config::warp_size>
<<<copy_num_blocks, default_block_size>>>(
num_rows, perm, orig->get_const_row_ptrs(),
orig->get_const_col_idxs(), orig->get_const_values(),
row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(),
row_permuted->get_values());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);


template <typename ValueType, typename IndexType>
void column_permute(std::shared_ptr<const CudaExecutor> exec,
const Array<IndexType> *permutation_indices,
const matrix::Csr<ValueType, IndexType> *orig,
matrix::Csr<ValueType, IndexType> *column_permuted)
GKO_NOT_IMPLEMENTED;

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL);


template <typename ValueType, typename IndexType>
void inverse_row_permute(std::shared_ptr<const CudaExecutor> exec,
const Array<IndexType> *permutation_indices,
const IndexType *perm,
const matrix::Csr<ValueType, IndexType> *orig,
matrix::Csr<ValueType, IndexType> *row_permuted)
GKO_NOT_IMPLEMENTED;
{
auto num_rows = orig->get_size()[0];
auto count_num_blocks = ceildiv(num_rows, default_block_size);
inv_row_ptr_permute_kernel<<<count_num_blocks, default_block_size>>>(
num_rows, perm, orig->get_const_row_ptrs(),
row_permuted->get_row_ptrs());
components::prefix_sum(exec, row_permuted->get_row_ptrs(), num_rows + 1);
auto copy_num_blocks =
ceildiv(num_rows, default_block_size / config::warp_size);
inv_row_permute_kernel<config::warp_size>
<<<copy_num_blocks, default_block_size>>>(
num_rows, perm, orig->get_const_row_ptrs(),
orig->get_const_col_idxs(), orig->get_const_values(),
row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(),
row_permuted->get_values());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL);


template <typename ValueType, typename IndexType>
void inverse_column_permute(std::shared_ptr<const CudaExecutor> exec,
const Array<IndexType> *permutation_indices,
const IndexType *perm,
const matrix::Csr<ValueType, IndexType> *orig,
matrix::Csr<ValueType, IndexType> *column_permuted)
GKO_NOT_IMPLEMENTED;
{
auto num_rows = orig->get_size()[0];
auto nnz = orig->get_num_stored_elements();
auto num_blocks = ceildiv(std::max(num_rows, nnz), default_block_size);
col_permute_kernel<<<num_blocks, default_block_size>>>(
num_rows, nnz, perm, orig->get_const_row_ptrs(),
orig->get_const_col_idxs(), as_cuda_type(orig->get_const_values()),
column_permuted->get_row_ptrs(), column_permuted->get_col_idxs(),
as_cuda_type(column_permuted->get_values()));
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL);
Expand Down
Loading

0 comments on commit 0483978

Please sign in to comment.