Skip to content

Commit

Permalink
add approx selection, reference kernels
Browse files Browse the repository at this point in the history
  • Loading branch information
upsj committed Feb 9, 2020
1 parent da56899 commit b5ab36d
Show file tree
Hide file tree
Showing 16 changed files with 865 additions and 64 deletions.
2 changes: 1 addition & 1 deletion common/components/merging.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ __device__ void group_merge(const ValueType *a, IndexType a_size,
IndexType a_begin{};
IndexType b_begin{};
IndexType c_begin{};
auto lane = IndexType(group.thread_rank());
auto lane = static_cast<IndexType>(group.thread_rank());
auto a_cur = detail::checked_load(a, a_begin + lane, a_size);
auto b_cur = detail::checked_load(b, b_begin + lane, a_size);
while (c_begin < c_size) {
Expand Down
124 changes: 92 additions & 32 deletions common/factorization/par_ilut_filter_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
namespace kernel {


template <int subwarp_size, typename IndexType, typename ValueType,
template <int subwarp_size, typename IndexType, typename Predicate,
typename BeginCallback, typename StepCallback,
typename FinishCallback>
__device__ void abstract_threshold_filter(
const IndexType *row_ptrs, const ValueType *vals, IndexType num_rows,
remove_complex<ValueType> threshold, BeginCallback begin_cb,
StepCallback step_cb, FinishCallback finish_cb)
__device__ void abstract_filter_impl(const IndexType *row_ptrs,
IndexType num_rows, Predicate pred,
BeginCallback begin_cb,
StepCallback step_cb,
FinishCallback finish_cb)
{
auto tidx = threadIdx.x + blockDim.x * blockIdx.x;
auto subwarp = group::thread_block_tile<subwarp_size>();
Expand All @@ -50,36 +51,31 @@ __device__ void abstract_threshold_filter(
return;
}

ValueType val{};
auto begin = row_ptrs[row];
auto end = row_ptrs[row + 1];
begin_cb(row);
auto diag_idx = end - 1;
auto num_steps = ceildiv(end - begin, config::warp_size);
for (auto step = 0; step < num_steps; ++step) {
auto idx = begin + lane + step * config::warp_size;
if (idx < end) {
val = vals[idx];
}
auto keep = idx < end && (abs(val) >= threshold || idx == diag_idx);
auto keep = idx < end && pred(idx, begin, end);
auto mask = subwarp.ballot(keep);
step_cb(row, idx, val, keep, popcnt(mask),
popcnt(mask & lane_prefix_mask));
step_cb(row, idx, keep, popcnt(mask), popcnt(mask & lane_prefix_mask));
}
finish_cb(row, lane);
}


template <int subwarp_size, typename IndexType, typename ValueType>
__global__ __launch_bounds__(default_block_size) void threshold_filter_nnz(
const IndexType *row_ptrs, const ValueType *vals, IndexType num_rows,
remove_complex<ValueType> threshold, IndexType *nnz)
template <int subwarp_size, typename Predicate, typename IndexType>
__device__ void abstract_filter_nnz(const IndexType *row_ptrs,
IndexType num_rows, Predicate pred,
IndexType *nnz)
{
IndexType count{};
abstract_threshold_filter<subwarp_size>(
row_ptrs, vals, num_rows, threshold, [&](IndexType) { count = 0; },
[&](IndexType, IndexType, ValueType, bool, IndexType warp_count,
IndexType) { count += warp_count; },
abstract_filter_impl<subwarp_size>(
row_ptrs, num_rows, pred, [&](IndexType) { count = 0; },
[&](IndexType, IndexType, bool, IndexType warp_count, IndexType) {
count += warp_count;
},
[&](IndexType row, IndexType lane) {
if (row < num_rows && lane == 0) {
nnz[row] = count;
Expand All @@ -88,32 +84,96 @@ __global__ __launch_bounds__(default_block_size) void threshold_filter_nnz(
}


template <int subwarp_size, typename IndexType, typename ValueType>
__global__ __launch_bounds__(default_block_size) void threshold_filter(
const IndexType *old_row_ptrs, const IndexType *old_col_idxs,
const ValueType *old_vals, IndexType num_rows,
remove_complex<ValueType> threshold, const IndexType *new_row_ptrs,
IndexType *new_row_idxs, IndexType *new_col_idxs, ValueType *new_vals)
template <int subwarp_size, typename Predicate, typename IndexType,
typename ValueType>
__device__ void abstract_filter(const IndexType *old_row_ptrs,
const IndexType *old_col_idxs,
const ValueType *old_vals, IndexType num_rows,
Predicate pred, const IndexType *new_row_ptrs,
IndexType *new_row_idxs,
IndexType *new_col_idxs, ValueType *new_vals)
{
IndexType count{};
IndexType new_offset{};
abstract_threshold_filter<subwarp_size>(
old_row_ptrs, old_vals, num_rows, threshold,
abstract_filter_impl<subwarp_size>(
old_row_ptrs, num_rows, pred,
[&](IndexType row) {
new_offset = new_row_ptrs[row];
count = 0;
},
[&](IndexType row, IndexType idx, ValueType val, bool keep,
IndexType warp_count, IndexType warp_prefix_sum) {
[&](IndexType row, IndexType idx, bool keep, IndexType warp_count,
IndexType warp_prefix_sum) {
if (keep) {
auto new_idx = new_offset + warp_prefix_sum + count;
new_row_idxs[new_idx] = row;
new_col_idxs[new_idx] = old_col_idxs[idx];
new_vals[new_idx] = val;
new_vals[new_idx] = old_vals[idx];
}
count += warp_count;
},
[](IndexType, IndexType) {});
}


template <int subwarp_size, typename IndexType, typename ValueType>
__global__ __launch_bounds__(default_block_size) void threshold_filter_nnz(
const IndexType *row_ptrs, const ValueType *vals, IndexType num_rows,
remove_complex<ValueType> threshold, IndexType *nnz)
{
abstract_filter_nnz<subwarp_size>(
row_ptrs, num_rows,
[&](IndexType idx, IndexType row_begin, IndexType row_end) {
return abs(vals[idx]) >= threshold || idx == row_end - 1;
},
nnz);
}


template <int subwarp_size, typename IndexType, typename ValueType>
__global__ __launch_bounds__(default_block_size) void threshold_filter(
const IndexType *old_row_ptrs, const IndexType *old_col_idxs,
const ValueType *old_vals, IndexType num_rows,
remove_complex<ValueType> threshold, const IndexType *new_row_ptrs,
IndexType *new_row_idxs, IndexType *new_col_idxs, ValueType *new_vals)
{
abstract_filter<subwarp_size>(
old_row_ptrs, old_col_idxs, old_vals, num_rows,
[&](IndexType idx, IndexType row_begin, IndexType row_end) {
return abs(old_vals[idx]) >= threshold || idx == row_end - 1;
},
new_row_ptrs, new_row_idxs, new_col_idxs, new_vals);
}


template <int subwarp_size, typename IndexType, typename BucketType>
__global__ __launch_bounds__(default_block_size) void bucket_filter_nnz(
const IndexType *row_ptrs, const BucketType *buckets, IndexType num_rows,
BucketType bucket, IndexType *nnz)
{
abstract_filter_nnz<subwarp_size>(
row_ptrs, num_rows,
[&](IndexType idx, IndexType row_begin, IndexType row_end) {
return buckets[idx] >= bucket || idx == row_end - 1;
},
nnz);
}


template <int subwarp_size, typename IndexType, typename ValueType,
typename BucketType>
__global__ __launch_bounds__(default_block_size) void bucket_filter(
const IndexType *old_row_ptrs, const IndexType *old_col_idxs,
const ValueType *old_vals, const BucketType *buckets, IndexType num_rows,
BucketType bucket, const IndexType *new_row_ptrs, IndexType *new_row_idxs,
IndexType *new_col_idxs, ValueType *new_vals)
{
abstract_filter<subwarp_size>(
old_row_ptrs, old_col_idxs, old_vals, num_rows,
[&](IndexType idx, IndexType row_begin, IndexType row_end) {
return buckets[idx] >= bucket || idx == row_end - 1;
},
new_row_ptrs, new_row_idxs, new_col_idxs, new_vals);
}


} // namespace kernel
5 changes: 3 additions & 2 deletions common/factorization/par_ilut_select_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,12 @@ __global__ __launch_bounds__(searchtree_width) void build_searchtree(
using AbsType = remove_complex<ValueType>;
auto idx = threadIdx.x;
AbsType samples[oversampling_factor];
auto stride = size / sample_size;
// assuming rounding towards zero
auto stride = double(size) / sample_size;
#pragma unroll
for (auto i = 0; i < oversampling_factor; ++i) {
auto lidx = idx * oversampling_factor + i;
auto val = input[IndexType(lidx * stride)];
auto val = input[static_cast<IndexType>(lidx * stride)];
samples[i] = abs(val);
}
__shared__ AbsType sh_samples[sample_size];
Expand Down
2 changes: 1 addition & 1 deletion common/factorization/par_ilut_spgeam_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ __global__ __launch_bounds__(default_block_size) void tri_spgeam_init(
return;
}

auto lane = IndexType(group.thread_rank());
auto lane = static_cast<IndexType>(group.thread_rank());
auto lanemask_eq = config::lane_mask_type{1} << lane;
auto lanemask_lt = lanemask_eq - 1;

Expand Down
7 changes: 4 additions & 3 deletions core/factorization/par_ilut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -277,10 +277,11 @@ void ParIlutState<ValueType, IndexType>::iterate()

if (use_approx_select) {
// remove approximately smallest candidates
exec->run(make_threshold_filter_approx(l_new.get(), l_nnz_limit,
l.get(), l_coo.get()));
exec->run(make_threshold_filter_approx(
u_new_csc.get(), u_nnz_limit, u_csc.get(), u_transp_coo.get()));
l_new.get(), l_nnz_limit, selection_tmp, l.get(), l_coo.get()));
exec->run(make_threshold_filter_approx(u_new_csc.get(), u_nnz_limit,
selection_tmp, u_csc.get(),
u_transp_coo.get()));
} else {
// select threshold to remove smallest candidates
remove_complex<ValueType> l_threshold{};
Expand Down
2 changes: 1 addition & 1 deletion core/factorization/par_ilut_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ namespace kernels {
IndexType) \
void threshold_filter_approx(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Csr<ValueType, IndexType> *m, \
IndexType rank, \
IndexType rank, Array<ValueType> &tmp, \
matrix::Csr<ValueType, IndexType> *m_out, \
matrix::Coo<ValueType, IndexType> *m_out_coo)

Expand Down
12 changes: 7 additions & 5 deletions cuda/factorization/par_ilut_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ void ssss_count(const ValueType *values, IndexType size,
{
constexpr auto bucket_count = kernel::searchtree_width;
auto num_threads_total = ceildiv(size, items_per_thread);
auto num_blocks = IndexType(ceildiv(num_threads_total, default_block_size));
auto num_blocks =
static_cast<IndexType>(ceildiv(num_threads_total, default_block_size));
// pick sample, build searchtree
kernel::build_searchtree<<<1, bucket_count>>>(as_cuda_type(values), size,
tree);
Expand All @@ -103,7 +104,8 @@ void ssss_filter(const ValueType *values, IndexType size,
IndexType bucket, remove_complex<ValueType> *out)
{
auto num_threads_total = ceildiv(size, items_per_thread);
auto num_blocks = IndexType(ceildiv(num_threads_total, default_block_size));
auto num_blocks =
static_cast<IndexType>(ceildiv(num_threads_total, default_block_size));
kernel::filter_bucket<<<num_blocks, default_block_size>>>(
as_cuda_type(values), size, bucket, oracles, partial_counts, out,
items_per_thread);
Expand Down Expand Up @@ -158,7 +160,7 @@ void threshold_select(std::shared_ptr<const CudaExecutor> exec,
auto splitter_ranks = splitter_ranks_array.get_const_data();
auto it = std::upper_bound(splitter_ranks,
splitter_ranks + bucket_count + 1, rank);
auto bucket = IndexType(std::distance(splitter_ranks + 1, it));
auto bucket = static_cast<IndexType>(std::distance(splitter_ranks + 1, it));
auto bucket_size = splitter_ranks[bucket + 1] - splitter_ranks[bucket];
rank -= splitter_ranks[bucket];

Expand All @@ -184,7 +186,7 @@ void threshold_select(std::shared_ptr<const CudaExecutor> exec,
splitter_ranks = splitter_ranks_array.get_const_data();
auto it = std::upper_bound(splitter_ranks,
splitter_ranks + bucket_count + 1, rank);
bucket = IndexType(std::distance(splitter_ranks + 1, it));
bucket = static_cast<IndexType>(std::distance(splitter_ranks + 1, it));
ssss_filter(tmp_in, bucket_size, oracles, partial_counts, bucket,
tmp_out);

Expand Down Expand Up @@ -229,7 +231,7 @@ void threshold_filter(std::shared_ptr<const CudaExecutor> exec,
auto old_col_idxs = a->get_const_col_idxs();
auto old_vals = a->get_const_values();
// compute nnz for each row
auto num_rows = IndexType(a->get_size()[0]);
auto num_rows = static_cast<IndexType>(a->get_size()[0]);
auto block_size = default_block_size / config::warp_size;
auto num_blocks = ceildiv(num_rows, block_size);
auto new_row_ptrs = m_out->get_row_ptrs();
Expand Down
1 change: 1 addition & 0 deletions hip/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ set(GINKGO_HIP_SOURCES
components/zero_array.hip.cpp
factorization/par_ilu_kernels.hip.cpp
factorization/par_ilut_select_kernel.hip.cpp
factorization/par_ilut_approx_filter_kernel.hip.cpp
factorization/par_ilut_filter_kernel.hip.cpp
factorization/par_ilut_spgeam_kernel.hip.cpp
factorization/par_ilut_sweep_kernel.hip.cpp
Expand Down
Loading

0 comments on commit b5ab36d

Please sign in to comment.