finish candidate add, add cuda kernels

ginkgo-project · Feb 10, 2020 · a51e4b2 · a51e4b2
1 parent 42fc630
commit a51e4b2
Show file tree

Hide file tree

Showing 10 changed files with 1,008 additions and 213 deletions.
diff --git a/common/factorization/par_ilut_spgeam_kernels.hpp.inc b/common/factorization/par_ilut_spgeam_kernels.hpp.inc
diff --git a/common/factorization/par_ilut_sweep_kernels.hpp.inc b/common/factorization/par_ilut_sweep_kernels.hpp.inc
@@ -67,20 +67,29 @@ __global__ __launch_bounds__(default_block_size) void sweep(
     ValueType last_product{};
     IndexType l_out_idx{};
     IndexType u_out_idx{};
-    group_merge<subwarp_size>(l_col_idxs, l_row_begin, l_row_size, u_row_idxs,
-                              u_col_begin, u_col_size, subwarp,
-                              [&](IndexType l_idx, ValueType l_col,
-                                  IndexType u_idx, ValueType u_row, IndexType) {
-                                  if (l_col == u_row) {
-                                      l_out_idx = l_idx;
-                                      u_out_idx = u_idx;
-                                      last_product =
-                                          l_vals[l_idx] * u_vals[u_idx];
-                                      sum += last_product;
-                                  }
-                              });
+    config::lane_mask_type match_mask{};
+    group_merge<subwarp_size>(
+        l_col_idxs + l_row_begin, l_row_size, u_row_idxs + u_col_begin,
+        u_col_size, subwarp,
+        [&](IndexType l_idx, ValueType l_col, IndexType u_idx, ValueType u_row,
+            IndexType, bool valid) {
+            auto new_mask = subwarp.ballot(l_col == u_row && valid);
+            match_mask = new_mask ? new_mask : match_mask;
+            if (l_col == u_row) {
+                l_out_idx = l_idx;
+                u_out_idx = u_idx;
+                last_product = l_vals[l_idx] * u_vals[u_idx];
+                sum += last_product;
+            }
+        });
     // revert last operation
-    sum -= last_product;
+    auto last_thread = config::warp_size - 1 - clz(match_mask);
+    if (subwarp.thread_rank() == last_thread) {
+        sum -= last_product;
+    }
+    // accumulate result from all threads
+    sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; });
+
     if (row > col) {
         auto to_write = sum / u_vals[u_col_ptrs[col + 1] - 1];
         if (::gko::isfinite(to_write)) {

diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
@@ -65,7 +65,11 @@ target_sources(ginkgo_cuda
         components/prefix_sum.cu
         components/zero_array.cu
         factorization/par_ilu_kernels.cu
-        factorization/par_ilut_kernels.cu
+        factorization/par_ilut_approx_filter_kernel.cu
+        factorization/par_ilut_filter_kernel.cu
+        factorization/par_ilut_select_kernel.cu
+        factorization/par_ilut_spgeam_kernel.cu
+        factorization/par_ilut_sweep_kernel.cu
         matrix/coo_kernels.cu
         matrix/csr_kernels.cu
         matrix/dense_kernels.cu

diff --git a/cuda/factorization/par_ilut_approx_filter_kernel.cu b/cuda/factorization/par_ilut_approx_filter_kernel.cu
@@ -0,0 +1,237 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <algorithm>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/components/prefix_sum.hpp"
+#include "core/factorization/par_ilut_kernels.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "cuda/base/config.hpp"
+#include "cuda/base/math.hpp"
+#include "cuda/base/types.hpp"
+#include "cuda/components/atomic.cuh"
+#include "cuda/components/cooperative_groups.cuh"
+#include "cuda/components/intrinsics.cuh"
+#include "cuda/components/prefix_sum.cuh"
+#include "cuda/components/sorting.cuh"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The parallel ILUT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
+constexpr auto default_block_size = 512;
+constexpr auto items_per_thread = 2;
+
+
+// subwarp sizes for filter kernels
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
+#include "common/factorization/par_ilut_filter_kernels.hpp.inc"
+#include "common/factorization/par_ilut_select_kernels.hpp.inc"
+
+
+template <typename ValueType, typename IndexType>
+void ssss_count(const ValueType *values, IndexType size,
+                remove_complex<ValueType> *tree, unsigned char *oracles,
+                IndexType *partial_counts, IndexType *total_counts)
+{
+    constexpr auto bucket_count = kernel::searchtree_width;
+    auto num_threads_total = ceildiv(size, items_per_thread);
+    auto num_blocks =
+        static_cast<IndexType>(ceildiv(num_threads_total, default_block_size));
+    // pick sample, build searchtree
+    kernel::build_searchtree<<<1, bucket_count>>>(as_cuda_type(values), size,
+                                                  tree);
+    // determine bucket sizes
+    kernel::count_buckets<<<num_blocks, default_block_size>>>(
+        as_cuda_type(values), size, tree, partial_counts, oracles,
+        items_per_thread);
+    // compute prefix sum and total sum over block-local values
+    kernel::block_prefix_sum<<<bucket_count, default_block_size>>>(
+        partial_counts, total_counts, num_blocks);
+    // compute prefix sum over bucket counts
+    start_prefix_sum<bucket_count><<<1, bucket_count>>>(
+        bucket_count, total_counts, total_counts + bucket_count);
+}
+
+
+namespace {
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void threshold_filter_approx(syn::value_list<int, subwarp_size>,
+                             std::shared_ptr<const CudaExecutor> exec,
+                             const matrix::Csr<ValueType, IndexType> *m,
+                             IndexType rank, Array<ValueType> *tmp,
+                             matrix::Csr<ValueType, IndexType> *m_out,
+                             matrix::Coo<ValueType, IndexType> *m_out_coo)
+{
+    auto values = m->get_const_values();
+    IndexType size = m->get_num_stored_elements();
+    using AbsType = remove_complex<ValueType>;
+    constexpr auto bucket_count = kernel::searchtree_width;
+    auto max_num_threads = ceildiv(size, items_per_thread);
+    auto max_num_blocks = ceildiv(max_num_threads, default_block_size);
+
+    size_type tmp_size_totals =
+        ceildiv((bucket_count + 1) * sizeof(IndexType), sizeof(ValueType));
+    size_type tmp_size_partials = ceildiv(
+        bucket_count * max_num_blocks * sizeof(IndexType), sizeof(ValueType));
+    size_type tmp_size_oracles =
+        ceildiv(size * sizeof(unsigned char), sizeof(ValueType));
+    size_type tmp_size_tree =
+        ceildiv(kernel::searchtree_size * sizeof(AbsType), sizeof(ValueType));
+    size_type tmp_size =
+        tmp_size_totals + tmp_size_partials + tmp_size_oracles + tmp_size_tree;
+    tmp->resize_and_reset(tmp_size);
+
+    auto total_counts = reinterpret_cast<IndexType *>(tmp->get_data());
+    auto partial_counts =
+        reinterpret_cast<IndexType *>(tmp->get_data() + tmp_size_totals);
+    auto oracles = reinterpret_cast<unsigned char *>(
+        tmp->get_data() + tmp_size_totals + tmp_size_partials);
+    auto tree =
+        reinterpret_cast<AbsType *>(tmp->get_data() + tmp_size_totals +
+                                    tmp_size_partials + tmp_size_oracles);
+
+    ssss_count(values, size, tree, oracles, partial_counts, total_counts);
+
+    // determine bucket with correct rank
+    auto total_counts_array =
+        Array<IndexType>::view(exec, bucket_count + 1, total_counts);
+    Array<IndexType> splitter_ranks_array(exec->get_master(),
+                                          total_counts_array);
+    auto splitter_ranks = splitter_ranks_array.get_const_data();
+    auto it = std::upper_bound(splitter_ranks,
+                               splitter_ranks + bucket_count + 1, rank);
+    auto bucket =
+        static_cast<unsigned char>(std::distance(splitter_ranks + 1, it));
+
+    // filter the elements
+    auto old_row_ptrs = m->get_const_row_ptrs();
+    auto old_col_idxs = m->get_const_col_idxs();
+    auto old_vals = m->get_const_values();
+    // compute nnz for each row
+    auto num_rows = static_cast<IndexType>(m->get_size()[0]);
+    auto block_size = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(num_rows, block_size);
+    auto new_row_ptrs = m_out->get_row_ptrs();
+    kernel::bucket_filter_nnz<subwarp_size><<<num_blocks, default_block_size>>>(
+        old_row_ptrs, oracles, num_rows, bucket, new_row_ptrs);
+
+    // build row pointers
+    prefix_sum(exec, new_row_ptrs, num_rows + 1);
+
+    // build matrix
+    IndexType new_nnz{};
+    exec->get_master()->copy_from(exec.get(), 1, new_row_ptrs + num_rows,
+                                  &new_nnz);
+    // resize arrays and update aliases
+    matrix::CsrBuilder<ValueType, IndexType> builder{m_out};
+    builder.get_col_idx_array().resize_and_reset(new_nnz);
+    builder.get_value_array().resize_and_reset(new_nnz);
+    auto new_col_idxs = m_out->get_col_idxs();
+    auto new_vals = m_out->get_values();
+    matrix::CooBuilder<ValueType, IndexType> coo_builder{m_out_coo};
+    coo_builder.get_row_idx_array().resize_and_reset(new_nnz);
+    coo_builder.get_col_idx_array() =
+        Array<IndexType>::view(exec, new_nnz, new_col_idxs);
+    coo_builder.get_value_array() =
+        Array<ValueType>::view(exec, new_nnz, new_vals);
+    auto new_row_idxs = m_out_coo->get_row_idxs();
+    kernel::bucket_filter<subwarp_size><<<num_blocks, default_block_size>>>(
+        old_row_ptrs, old_col_idxs, as_cuda_type(old_vals), oracles, num_rows,
+        bucket, new_row_ptrs, new_row_idxs, new_col_idxs,
+        as_cuda_type(new_vals));
+}
+
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_threshold_filter_approx,
+                                    threshold_filter_approx);
+
+
+}  // namespace
+
+
+template <typename ValueType, typename IndexType>
+void threshold_filter_approx(std::shared_ptr<const CudaExecutor> exec,
+                             const matrix::Csr<ValueType, IndexType> *m,
+                             IndexType rank, Array<ValueType> &tmp,
+                             matrix::Csr<ValueType, IndexType> *m_out,
+                             matrix::Coo<ValueType, IndexType> *m_out_coo)
+{
+    auto num_rows = m->get_size()[0];
+    auto total_nnz = m->get_num_stored_elements();
+    auto total_nnz_per_row = total_nnz / num_rows;
+    select_threshold_filter_approx(
+        compiled_kernels(),
+        [&](int compiled_subwarp_size) {
+            return total_nnz_per_row <= compiled_subwarp_size ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec, m, rank, &tmp, m_out,
+        m_out_coo);
+}
+
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko