Merge add CUDA, HIP and OMP kernels for ISAI
This PR adds ISAI kernels for the CUDA, HIP and OpenMP executors.


Related PR: #502
Thomas Grützmacher authored Apr 24, 2020
2 parents 8a7ece4 + 845853d commit 90db75d
Showing 12 changed files with 932 additions and 6 deletions.
3 changes: 3 additions & 0 deletions common/components/uninitialized_array.hpp.inc
@@ -34,6 +34,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/**
* Stores an array with uninitialized contents.
*
* This class is needed for data types that have a non-empty constructor when
* using them as shared memory, for example `thrust::complex<float>`.
*
* @tparam ValueType the type of values
* @tparam size the size of the array
*/
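For illustration only (this sketch is not part of the diff): the idea behind such a wrapper is to back the array with raw, suitably aligned bytes, so that placing it in `__shared__` memory never runs the `ValueType` constructor. The class name and members below are hypothetical, not the actual Ginkgo implementation.

template <typename ValueType, int size>
class uninitialized_array_sketch {
public:
    // Reinterpret the raw storage as ValueType; no constructor has run.
    __device__ ValueType *data() noexcept
    {
        return reinterpret_cast<ValueType *>(bytes_);
    }
    __device__ ValueType &operator[](int i) noexcept { return data()[i]; }

private:
    // Raw bytes with the alignment of ValueType, so a __shared__ instance
    // needs no default construction of its elements.
    alignas(ValueType) unsigned char bytes_[sizeof(ValueType) * size];
};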
182 changes: 182 additions & 0 deletions common/preconditioner/isai_kernels.hpp.inc
@@ -29,3 +29,185 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
******************************<GINKGO LICENSE>*******************************/

namespace kernel {


/**
* @internal
*
* This kernel supports at most `subwarp_size` (<= `warp_size`) elements per row.
* If there are more elements, they are simply ignored. Only the first
* `subwarp_size` elements are considered both for the values and for the
* sparsity pattern.
*/
template <int subwarp_size, int subwarps_per_block, typename ValueType,
typename IndexType, typename Callable>
__forceinline__ __device__ void generic_generate(
IndexType num_rows, const IndexType *__restrict__ m_row_ptrs,
const IndexType *__restrict__ m_col_idxs,
const ValueType *__restrict__ m_values, IndexType *__restrict__ i_row_ptrs,
IndexType *__restrict__ i_col_idxs, ValueType *__restrict__ i_values,
Callable trs_solve)
{
static_assert((subwarp_size & (subwarp_size - 1)) == 0,
"subwarp_size must be a power of 2");

const auto gidx = thread::get_thread_id_flat<IndexType>();
const auto row = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
const int tidx = threadIdx.x;

auto this_block = group::this_thread_block();
auto subwarp = group::tiled_partition<subwarp_size>(this_block);
const int local_id = subwarp.thread_rank();

__shared__ UninitializedArray<ValueType, subwarp_size * subwarp_size *
subwarps_per_block>
trisystem;

const int end_trisystem = blockDim.x * subwarp_size;
const int trisystem_start =
(tidx / subwarp_size) * subwarp_size * subwarp_size;

// Allow at most subwarp_size * subwarp_size elements in a subwarp trisystem
for (int i = 0; i < subwarp_size; ++i) {
trisystem[trisystem_start + local_id + i * subwarp_size] =
zero<ValueType>();
}
this_block.sync();

if (row >= num_rows) {
return;
}

const auto i_row_begin = i_row_ptrs[row];
const auto i_row_end = i_row_ptrs[row + 1] - i_row_begin < subwarp_size
? i_row_ptrs[row + 1]
: i_row_begin + subwarp_size;
const auto num_row_elems = i_row_end - i_row_begin;

if (local_id < num_row_elems) {
const auto col = i_col_idxs[i_row_begin + local_id];
const auto m_row_end = m_row_ptrs[col + 1];
auto m_row_ptr = m_row_ptrs[col];
auto i_row_ptr = i_row_begin;
int idx{trisystem_start + local_id}; // write to trisystem in row major

while (m_row_ptr < m_row_end && i_row_ptr < i_row_end &&
idx < end_trisystem) {
const auto sparsity_col = i_col_idxs[i_row_ptr];
const auto m_col = m_col_idxs[m_row_ptr];
if (sparsity_col == m_col) {
trisystem[idx] = m_values[m_row_ptr];
idx += subwarp_size;
++m_row_ptr;
++i_row_ptr;
} else if (m_col < sparsity_col) {
++m_row_ptr;
} else {
++i_row_ptr;
idx += subwarp_size;
}
}
}

this_block.sync();

// Now, read a full col of `trisystem` into local registers, which will be
// row elements after this (implicit) transpose
ValueType local_row[subwarp_size];
const auto trisystem_col_start = trisystem_start + local_id * subwarp_size;
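    // Note (assumption, not stated in the source): the (local_id + i) %
    // subwarp_size staggering below starts each thread at a different offset,
    // presumably to avoid shared-memory bank conflicts; every thread still
    // reads the same subwarp_size values of its part of `trisystem`.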
for (int i = 0; i < subwarp_size; ++i) {
const auto intermediate_idx = local_id + i;
const auto coalesced_idx = intermediate_idx % subwarp_size;
local_row[coalesced_idx] =
trisystem[trisystem_col_start + coalesced_idx];
}

const auto rhs = trs_solve(num_row_elems, local_row, subwarp);

// Write back:
if (local_id < num_row_elems) {
const auto idx = i_row_begin + local_id;
if (gko::isfinite(rhs)) {
i_values[idx] = rhs;
} else {
i_values[idx] =
i_col_idxs[idx] == row ? one<ValueType>() : zero<ValueType>();
}
}
}


template <int subwarp_size, int subwarps_per_block, typename ValueType,
typename IndexType>
__global__
__launch_bounds__(subwarp_size *subwarps_per_block) void generate_l_inverse(
IndexType num_rows, const IndexType *__restrict__ m_row_ptrs,
const IndexType *__restrict__ m_col_idxs,
const ValueType *__restrict__ m_values, IndexType *__restrict__ i_row_ptrs,
IndexType *__restrict__ i_col_idxs, ValueType *__restrict__ i_values)
{
auto trs_solve = [](IndexType num_elems,
const ValueType *__restrict__ local_row,
group::thread_block_tile<subwarp_size> &subwarp) {
const int local_id = subwarp.thread_rank();
ValueType rhs =
local_id == num_elems - 1 ? one<ValueType>() : zero<ValueType>();
// Solve Triangular system
for (int d_col = num_elems - 1; d_col >= 0; --d_col) {
const auto elem = local_row[d_col];
if (d_col == local_id) {
rhs /= elem;
}

const ValueType bot = subwarp.shfl(rhs, d_col);
if (local_id < d_col) {
rhs -= bot * elem;
}
}

return rhs;
};
generic_generate<subwarp_size, subwarps_per_block>(
num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs, i_col_idxs,
i_values, trs_solve);
}


template <int subwarp_size, int subwarps_per_block, typename ValueType,
typename IndexType>
__global__
__launch_bounds__(subwarp_size *subwarps_per_block) void generate_u_inverse(
IndexType num_rows, const IndexType *__restrict__ m_row_ptrs,
const IndexType *__restrict__ m_col_idxs,
const ValueType *__restrict__ m_values, IndexType *__restrict__ i_row_ptrs,
IndexType *__restrict__ i_col_idxs, ValueType *__restrict__ i_values)
{
auto trs_solve = [](IndexType num_elems,
const ValueType *__restrict__ local_row,
group::thread_block_tile<subwarp_size> &subwarp) {
const int local_id = subwarp.thread_rank();
ValueType rhs = local_id == 0 ? one<ValueType>() : zero<ValueType>();
// Solve Triangular system
for (int d_col = 0; d_col < num_elems; ++d_col) {
const auto elem = local_row[d_col];
if (d_col == local_id) {
rhs /= elem;
}

const ValueType top = subwarp.shfl(rhs, d_col);
if (d_col < local_id) {
rhs -= top * elem;
}
}

return rhs;
};
generic_generate<subwarp_size, subwarps_per_block>(
num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs, i_col_idxs,
i_values, trs_solve);
}


} // namespace kernel
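To make the arithmetic inside `trs_solve` easier to follow, here is a sequential sketch (illustrative only, not part of the diff; the helper name and the use of `double` are placeholders). For one row of the lower ISAI with sparsity pattern S, the subwarp effectively gathers the dense submatrix T = M(S, S) into `trisystem` and solves T^T x = e_k, where k is the position of the diagonal entry, i.e. the last pattern entry of a lower-triangular row; the `subwarp.shfl(rhs, d_col)` broadcast plays the role of the inner update loop below.

#include <vector>

// Hypothetical sequential reference for one row of the lower ISAI.
// T is the dense, row-major submatrix M(S, S); the returned x holds the
// values of that row of the approximate inverse, ordered like the pattern S.
std::vector<double> solve_lower_isai_row(const std::vector<std::vector<double>> &T)
{
    const int n = static_cast<int>(T.size());
    std::vector<double> x(n, 0.0);
    x[n - 1] = 1.0;  // right-hand side e_k with k = n - 1 (diagonal position)
    // T^T is upper triangular, so eliminate from the last column backwards;
    // this mirrors the d_col loop in the kernel's trs_solve lambda.
    for (int col = n - 1; col >= 0; --col) {
        x[col] /= T[col][col];
        for (int row = 0; row < col; ++row) {
            x[row] -= T[col][row] * x[col];  // (T^T)(row, col) = T(col, row)
        }
    }
    return x;
}

The upper variant in `generate_u_inverse` mirrors this with a right-hand side of e_0 and a forward sweep over the columns.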
55 changes: 53 additions & 2 deletions cuda/preconditioner/isai_kernels.cu
@@ -34,11 +34,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include <ginkgo/core/base/exception_helpers.hpp>
#include <ginkgo/core/base/executor.hpp>
#include <ginkgo/core/matrix/csr.hpp>


#include "core/matrix/csr_builder.hpp"
#include "cuda/base/config.hpp"
#include "cuda/base/math.hpp"
#include "cuda/base/types.hpp"
#include "cuda/components/cooperative_groups.cuh"
#include "cuda/components/thread_ids.cuh"
#include "cuda/components/uninitialized_array.hpp"


namespace gko {
@@ -59,11 +65,36 @@ namespace {
} // namespace


constexpr int subwarp_size{config::warp_size};
constexpr int subwarps_per_block{2};
constexpr int default_block_size{subwarps_per_block * subwarp_size};


template <typename ValueType, typename IndexType>
void generate_l_inverse(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType> *l_csr,
matrix::Csr<ValueType, IndexType> *inverse_l)
GKO_NOT_IMPLEMENTED;
{
const auto nnz = l_csr->get_num_stored_elements();
const auto num_rows = l_csr->get_size()[0];

exec->copy_from(exec.get(), nnz, l_csr->get_const_col_idxs(),
inverse_l->get_col_idxs());
exec->copy_from(exec.get(), num_rows + 1, l_csr->get_const_row_ptrs(),
inverse_l->get_row_ptrs());


const dim3 block(default_block_size, 1, 1);
const dim3 grid(ceildiv(num_rows, block.x / config::warp_size), 1, 1);
kernel::generate_l_inverse<subwarp_size, subwarps_per_block>
<<<grid, block>>>(
static_cast<IndexType>(num_rows), l_csr->get_const_row_ptrs(),
l_csr->get_const_col_idxs(),
as_cuda_type(l_csr->get_const_values()), inverse_l->get_row_ptrs(),
inverse_l->get_col_idxs(), as_cuda_type(inverse_l->get_values()));
// Call make_srow()
matrix::CsrBuilder<ValueType, IndexType> builder(inverse_l);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_ISAI_GENERATE_L_INVERSE_KERNEL);
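As a quick sanity check on the launch configuration above, here is the arithmetic with concrete numbers (a sketch assuming `config::warp_size == 32` on CUDA; the `example_*` names are illustrative only):

constexpr int example_warp_size = 32;                      // config::warp_size
constexpr int example_block_size = 2 * example_warp_size;  // 64 threads per block
constexpr int example_rows_per_block =
    example_block_size / example_warp_size;                 // one warp per row -> 2 rows
static_assert(example_rows_per_block == 2,
              "each block processes two rows of the system matrix");
// For a matrix with 1000 rows: grid.x = ceildiv(1000, 2) = 500 blocks.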
@@ -73,7 +104,27 @@ template <typename ValueType, typename IndexType>
void generate_u_inverse(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType> *u_csr,
matrix::Csr<ValueType, IndexType> *inverse_u)
GKO_NOT_IMPLEMENTED;
{
const auto nnz = u_csr->get_num_stored_elements();
const auto num_rows = u_csr->get_size()[0];

exec->copy_from(exec.get(), nnz, u_csr->get_const_col_idxs(),
inverse_u->get_col_idxs());
exec->copy_from(exec.get(), num_rows + 1, u_csr->get_const_row_ptrs(),
inverse_u->get_row_ptrs());


const dim3 block(default_block_size, 1, 1);
const dim3 grid(ceildiv(num_rows, block.x / config::warp_size), 1, 1);
kernel::generate_u_inverse<subwarp_size, subwarps_per_block>
<<<grid, block>>>(
static_cast<IndexType>(num_rows), u_csr->get_const_row_ptrs(),
u_csr->get_const_col_idxs(),
as_cuda_type(u_csr->get_const_values()), inverse_u->get_row_ptrs(),
inverse_u->get_col_idxs(), as_cuda_type(inverse_u->get_values()));
// Call make_srow()
matrix::CsrBuilder<ValueType, IndexType> builder(inverse_u);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_ISAI_GENERATE_U_INVERSE_KERNEL);
1 change: 1 addition & 0 deletions cuda/test/preconditioner/CMakeLists.txt
@@ -1 +1,2 @@
ginkgo_create_test(jacobi_kernels)
ginkgo_create_test(isai_kernels)
