add approx selection, reference kernels

ginkgo-project · Feb 9, 2020 · b5ab36d · b5ab36d
1 parent da56899
commit b5ab36d
Show file tree

Hide file tree

Showing 16 changed files with 865 additions and 64 deletions.
diff --git a/common/components/merging.hpp.inc b/common/components/merging.hpp.inc
@@ -152,7 +152,7 @@ __device__ void group_merge(const ValueType *a, IndexType a_size,
     IndexType a_begin{};
     IndexType b_begin{};
     IndexType c_begin{};
-    auto lane = IndexType(group.thread_rank());
+    auto lane = static_cast<IndexType>(group.thread_rank());
     auto a_cur = detail::checked_load(a, a_begin + lane, a_size);
     auto b_cur = detail::checked_load(b, b_begin + lane, a_size);
     while (c_begin < c_size) {

diff --git a/common/factorization/par_ilut_filter_kernels.hpp.inc b/common/factorization/par_ilut_filter_kernels.hpp.inc
@@ -33,13 +33,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace kernel {
 
 
-template <int subwarp_size, typename IndexType, typename ValueType,
+template <int subwarp_size, typename IndexType, typename Predicate,
           typename BeginCallback, typename StepCallback,
           typename FinishCallback>
-__device__ void abstract_threshold_filter(
-    const IndexType *row_ptrs, const ValueType *vals, IndexType num_rows,
-    remove_complex<ValueType> threshold, BeginCallback begin_cb,
-    StepCallback step_cb, FinishCallback finish_cb)
+__device__ void abstract_filter_impl(const IndexType *row_ptrs,
+                                     IndexType num_rows, Predicate pred,
+                                     BeginCallback begin_cb,
+                                     StepCallback step_cb,
+                                     FinishCallback finish_cb)
 {
     auto tidx = threadIdx.x + blockDim.x * blockIdx.x;
     auto subwarp = group::thread_block_tile<subwarp_size>();
@@ -50,36 +51,31 @@ __device__ void abstract_threshold_filter(
         return;
     }
 
-    ValueType val{};
     auto begin = row_ptrs[row];
     auto end = row_ptrs[row + 1];
     begin_cb(row);
-    auto diag_idx = end - 1;
     auto num_steps = ceildiv(end - begin, config::warp_size);
     for (auto step = 0; step < num_steps; ++step) {
         auto idx = begin + lane + step * config::warp_size;
-        if (idx < end) {
-            val = vals[idx];
-        }
-        auto keep = idx < end && (abs(val) >= threshold || idx == diag_idx);
+        auto keep = idx < end && pred(idx, begin, end);
         auto mask = subwarp.ballot(keep);
-        step_cb(row, idx, val, keep, popcnt(mask),
-                popcnt(mask & lane_prefix_mask));
+        step_cb(row, idx, keep, popcnt(mask), popcnt(mask & lane_prefix_mask));
     }
     finish_cb(row, lane);
 }
 
 
-template <int subwarp_size, typename IndexType, typename ValueType>
-__global__ __launch_bounds__(default_block_size) void threshold_filter_nnz(
-    const IndexType *row_ptrs, const ValueType *vals, IndexType num_rows,
-    remove_complex<ValueType> threshold, IndexType *nnz)
+template <int subwarp_size, typename Predicate, typename IndexType>
+__device__ void abstract_filter_nnz(const IndexType *row_ptrs,
+                                    IndexType num_rows, Predicate pred,
+                                    IndexType *nnz)
 {
     IndexType count{};
-    abstract_threshold_filter<subwarp_size>(
-        row_ptrs, vals, num_rows, threshold, [&](IndexType) { count = 0; },
-        [&](IndexType, IndexType, ValueType, bool, IndexType warp_count,
-            IndexType) { count += warp_count; },
+    abstract_filter_impl<subwarp_size>(
+        row_ptrs, num_rows, pred, [&](IndexType) { count = 0; },
+        [&](IndexType, IndexType, bool, IndexType warp_count, IndexType) {
+            count += warp_count;
+        },
         [&](IndexType row, IndexType lane) {
             if (row < num_rows && lane == 0) {
                 nnz[row] = count;
@@ -88,32 +84,96 @@ __global__ __launch_bounds__(default_block_size) void threshold_filter_nnz(
 }
 
 
-template <int subwarp_size, typename IndexType, typename ValueType>
-__global__ __launch_bounds__(default_block_size) void threshold_filter(
-    const IndexType *old_row_ptrs, const IndexType *old_col_idxs,
-    const ValueType *old_vals, IndexType num_rows,
-    remove_complex<ValueType> threshold, const IndexType *new_row_ptrs,
-    IndexType *new_row_idxs, IndexType *new_col_idxs, ValueType *new_vals)
+template <int subwarp_size, typename Predicate, typename IndexType,
+          typename ValueType>
+__device__ void abstract_filter(const IndexType *old_row_ptrs,
+                                const IndexType *old_col_idxs,
+                                const ValueType *old_vals, IndexType num_rows,
+                                Predicate pred, const IndexType *new_row_ptrs,
+                                IndexType *new_row_idxs,
+                                IndexType *new_col_idxs, ValueType *new_vals)
 {
     IndexType count{};
     IndexType new_offset{};
-    abstract_threshold_filter<subwarp_size>(
-        old_row_ptrs, old_vals, num_rows, threshold,
+    abstract_filter_impl<subwarp_size>(
+        old_row_ptrs, num_rows, pred,
         [&](IndexType row) {
             new_offset = new_row_ptrs[row];
             count = 0;
         },
-        [&](IndexType row, IndexType idx, ValueType val, bool keep,
-            IndexType warp_count, IndexType warp_prefix_sum) {
+        [&](IndexType row, IndexType idx, bool keep, IndexType warp_count,
+            IndexType warp_prefix_sum) {
             if (keep) {
                 auto new_idx = new_offset + warp_prefix_sum + count;
                 new_row_idxs[new_idx] = row;
                 new_col_idxs[new_idx] = old_col_idxs[idx];
-                new_vals[new_idx] = val;
+                new_vals[new_idx] = old_vals[idx];
             }
             count += warp_count;
         },
         [](IndexType, IndexType) {});
 }
 
+
+template <int subwarp_size, typename IndexType, typename ValueType>
+__global__ __launch_bounds__(default_block_size) void threshold_filter_nnz(
+    const IndexType *row_ptrs, const ValueType *vals, IndexType num_rows,
+    remove_complex<ValueType> threshold, IndexType *nnz)
+{
+    abstract_filter_nnz<subwarp_size>(
+        row_ptrs, num_rows,
+        [&](IndexType idx, IndexType row_begin, IndexType row_end) {
+            return abs(vals[idx]) >= threshold || idx == row_end - 1;
+        },
+        nnz);
+}
+
+
+template <int subwarp_size, typename IndexType, typename ValueType>
+__global__ __launch_bounds__(default_block_size) void threshold_filter(
+    const IndexType *old_row_ptrs, const IndexType *old_col_idxs,
+    const ValueType *old_vals, IndexType num_rows,
+    remove_complex<ValueType> threshold, const IndexType *new_row_ptrs,
+    IndexType *new_row_idxs, IndexType *new_col_idxs, ValueType *new_vals)
+{
+    abstract_filter<subwarp_size>(
+        old_row_ptrs, old_col_idxs, old_vals, num_rows,
+        [&](IndexType idx, IndexType row_begin, IndexType row_end) {
+            return abs(old_vals[idx]) >= threshold || idx == row_end - 1;
+        },
+        new_row_ptrs, new_row_idxs, new_col_idxs, new_vals);
+}
+
+
+template <int subwarp_size, typename IndexType, typename BucketType>
+__global__ __launch_bounds__(default_block_size) void bucket_filter_nnz(
+    const IndexType *row_ptrs, const BucketType *buckets, IndexType num_rows,
+    BucketType bucket, IndexType *nnz)
+{
+    abstract_filter_nnz<subwarp_size>(
+        row_ptrs, num_rows,
+        [&](IndexType idx, IndexType row_begin, IndexType row_end) {
+            return buckets[idx] >= bucket || idx == row_end - 1;
+        },
+        nnz);
+}
+
+
+template <int subwarp_size, typename IndexType, typename ValueType,
+          typename BucketType>
+__global__ __launch_bounds__(default_block_size) void bucket_filter(
+    const IndexType *old_row_ptrs, const IndexType *old_col_idxs,
+    const ValueType *old_vals, const BucketType *buckets, IndexType num_rows,
+    BucketType bucket, const IndexType *new_row_ptrs, IndexType *new_row_idxs,
+    IndexType *new_col_idxs, ValueType *new_vals)
+{
+    abstract_filter<subwarp_size>(
+        old_row_ptrs, old_col_idxs, old_vals, num_rows,
+        [&](IndexType idx, IndexType row_begin, IndexType row_end) {
+            return buckets[idx] >= bucket || idx == row_end - 1;
+        },
+        new_row_ptrs, new_row_idxs, new_col_idxs, new_vals);
+}
+
+
 }  // namespace kernel
diff --git a/common/factorization/par_ilut_select_kernels.hpp.inc b/common/factorization/par_ilut_select_kernels.hpp.inc
@@ -55,11 +55,12 @@ __global__ __launch_bounds__(searchtree_width) void build_searchtree(
     using AbsType = remove_complex<ValueType>;
     auto idx = threadIdx.x;
     AbsType samples[oversampling_factor];
-    auto stride = size / sample_size;
+    // assuming rounding towards zero
+    auto stride = double(size) / sample_size;
 #pragma unroll
     for (auto i = 0; i < oversampling_factor; ++i) {
         auto lidx = idx * oversampling_factor + i;
-        auto val = input[IndexType(lidx * stride)];
+        auto val = input[static_cast<IndexType>(lidx * stride)];
         samples[i] = abs(val);
     }
     __shared__ AbsType sh_samples[sample_size];

diff --git a/common/factorization/par_ilut_spgeam_kernels.hpp.inc b/common/factorization/par_ilut_spgeam_kernels.hpp.inc
@@ -120,7 +120,7 @@ __global__ __launch_bounds__(default_block_size) void tri_spgeam_init(
         return;
     }
 
-    auto lane = IndexType(group.thread_rank());
+    auto lane = static_cast<IndexType>(group.thread_rank());
     auto lanemask_eq = config::lane_mask_type{1} << lane;
     auto lanemask_lt = lanemask_eq - 1;
 

diff --git a/core/factorization/par_ilut.cpp b/core/factorization/par_ilut.cpp
@@ -277,10 +277,11 @@ void ParIlutState<ValueType, IndexType>::iterate()
 
     if (use_approx_select) {
         // remove approximately smallest candidates
-        exec->run(make_threshold_filter_approx(l_new.get(), l_nnz_limit,
-                                               l.get(), l_coo.get()));
         exec->run(make_threshold_filter_approx(
-            u_new_csc.get(), u_nnz_limit, u_csc.get(), u_transp_coo.get()));
+            l_new.get(), l_nnz_limit, selection_tmp, l.get(), l_coo.get()));
+        exec->run(make_threshold_filter_approx(u_new_csc.get(), u_nnz_limit,
+                                               selection_tmp, u_csc.get(),
+                                               u_transp_coo.get()));
     } else {
         // select threshold to remove smallest candidates
         remove_complex<ValueType> l_threshold{};

diff --git a/core/factorization/par_ilut_kernels.hpp b/core/factorization/par_ilut_kernels.hpp
@@ -81,7 +81,7 @@ namespace kernels {
                                                             IndexType)        \
     void threshold_filter_approx(std::shared_ptr<const DefaultExecutor> exec, \
                                  const matrix::Csr<ValueType, IndexType> *m,  \
-                                 IndexType rank,                              \
+                                 IndexType rank, Array<ValueType> &tmp,       \
                                  matrix::Csr<ValueType, IndexType> *m_out,    \
                                  matrix::Coo<ValueType, IndexType> *m_out_coo)
 

diff --git a/cuda/factorization/par_ilut_kernels.cu b/cuda/factorization/par_ilut_kernels.cu
@@ -80,7 +80,8 @@ void ssss_count(const ValueType *values, IndexType size,
 {
     constexpr auto bucket_count = kernel::searchtree_width;
     auto num_threads_total = ceildiv(size, items_per_thread);
-    auto num_blocks = IndexType(ceildiv(num_threads_total, default_block_size));
+    auto num_blocks =
+        static_cast<IndexType>(ceildiv(num_threads_total, default_block_size));
     // pick sample, build searchtree
     kernel::build_searchtree<<<1, bucket_count>>>(as_cuda_type(values), size,
                                                   tree);
@@ -103,7 +104,8 @@ void ssss_filter(const ValueType *values, IndexType size,
                  IndexType bucket, remove_complex<ValueType> *out)
 {
     auto num_threads_total = ceildiv(size, items_per_thread);
-    auto num_blocks = IndexType(ceildiv(num_threads_total, default_block_size));
+    auto num_blocks =
+        static_cast<IndexType>(ceildiv(num_threads_total, default_block_size));
     kernel::filter_bucket<<<num_blocks, default_block_size>>>(
         as_cuda_type(values), size, bucket, oracles, partial_counts, out,
         items_per_thread);
@@ -158,7 +160,7 @@ void threshold_select(std::shared_ptr<const CudaExecutor> exec,
     auto splitter_ranks = splitter_ranks_array.get_const_data();
     auto it = std::upper_bound(splitter_ranks,
                                splitter_ranks + bucket_count + 1, rank);
-    auto bucket = IndexType(std::distance(splitter_ranks + 1, it));
+    auto bucket = static_cast<IndexType>(std::distance(splitter_ranks + 1, it));
     auto bucket_size = splitter_ranks[bucket + 1] - splitter_ranks[bucket];
     rank -= splitter_ranks[bucket];
 
@@ -184,7 +186,7 @@ void threshold_select(std::shared_ptr<const CudaExecutor> exec,
         splitter_ranks = splitter_ranks_array.get_const_data();
         auto it = std::upper_bound(splitter_ranks,
                                    splitter_ranks + bucket_count + 1, rank);
-        bucket = IndexType(std::distance(splitter_ranks + 1, it));
+        bucket = static_cast<IndexType>(std::distance(splitter_ranks + 1, it));
         ssss_filter(tmp_in, bucket_size, oracles, partial_counts, bucket,
                     tmp_out);
 
@@ -229,7 +231,7 @@ void threshold_filter(std::shared_ptr<const CudaExecutor> exec,
     auto old_col_idxs = a->get_const_col_idxs();
     auto old_vals = a->get_const_values();
     // compute nnz for each row
-    auto num_rows = IndexType(a->get_size()[0]);
+    auto num_rows = static_cast<IndexType>(a->get_size()[0]);
     auto block_size = default_block_size / config::warp_size;
     auto num_blocks = ceildiv(num_rows, block_size);
     auto new_row_ptrs = m_out->get_row_ptrs();

diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
@@ -98,6 +98,7 @@ set(GINKGO_HIP_SOURCES
     components/zero_array.hip.cpp
     factorization/par_ilu_kernels.hip.cpp
     factorization/par_ilut_select_kernel.hip.cpp
+    factorization/par_ilut_approx_filter_kernel.hip.cpp
     factorization/par_ilut_filter_kernel.hip.cpp
     factorization/par_ilut_spgeam_kernel.hip.cpp
     factorization/par_ilut_sweep_kernel.hip.cpp