From 699b67a4d6e97fd8bcf0928525d598b8b17b2f8e Mon Sep 17 00:00:00 2001
From: James Foucar
Date: Tue, 15 Oct 2024 14:06:22 -0600
Subject: [PATCH] Block Sptrsv fixes (#2376)

* Go back to original impl with vector reduce

Signed-off-by: James Foucar

* Disable tp1 test for clang+cuda+complex

Signed-off-by: James Foucar

* Fixes and warn users about broken case

Signed-off-by: James Foucar

* formatting

Signed-off-by: James Foucar

* Fix comments and print skip message

Signed-off-by: James Foucar

---------

Signed-off-by: James Foucar
Signed-off-by: Nathan Ellingwood
---
 .../impl/KokkosSparse_sptrsv_solve_impl.hpp   | 275 ++++++++++++------
 .../sparse/src/KokkosSparse_sptrsv_handle.hpp |   8 +
 .../sparse/unit_test/Test_Sparse_sptrsv.hpp   |   9 +
 3 files changed, 196 insertions(+), 96 deletions(-)

diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp
index 6dbc3039da40..77d790e288dc 100644
--- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp
+++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp
@@ -79,7 +79,7 @@ struct SptrsvWrap {
   using range_type = Kokkos::pair<size_type, size_type>;
 
   // Tag structs
-  struct UnsortedTag {};  // This doesn't appear to be supported
+  struct UnsortedTag {};
   struct LargerCutoffTag {};
   struct UnsortedLargerCutoffTag {};
@@ -115,7 +115,9 @@ struct SptrsvWrap {
     RHSType rhs;
     entries_t nodes_grouped_by_level;
 
-    using reftype = scalar_t &;
+    using reftype   = scalar_t &;
+    using ArrayType = reftype;
+    using SumArray  = reftype;
 
     struct SBlock {
       template <typename T>
@@ -141,6 +143,16 @@ struct SptrsvWrap {
      KOKKOS_INLINE_FUNCTION
      size_type get_block_size() const { return 0; }
 
+      // multiply_subtract. C -= A * B
+      KOKKOS_INLINE_FUNCTION
+      static void multiply_subtract(const scalar_t &a, const scalar_t &b, scalar_t &c) { c -= a * b; }
+
+      KOKKOS_INLINE_FUNCTION
+      static void copy(const member_type &, scalar_t &, const scalar_t &) {}
+
+      KOKKOS_INLINE_FUNCTION
+      static void copy(scalar_t &, const scalar_t &) {}
+
      // lget
      KOKKOS_INLINE_FUNCTION
      scalar_t &lget(const size_type row) const { return lhs(row); }
@@ -195,6 +207,60 @@ struct SptrsvWrap {
 
     using reftype = Vector;
 
+    struct ArrayType {
+      scalar_t m_data[MAX_VEC_SIZE];
+
+      KOKKOS_INLINE_FUNCTION
+      ArrayType() { init(); }
+
+      KOKKOS_INLINE_FUNCTION
+      ArrayType(const ArrayType &rhs_) {
+        for (size_type i = 0; i < MAX_VEC_SIZE; ++i) m_data[i] = rhs_.m_data[i];
+      }
+
+      KOKKOS_INLINE_FUNCTION
+      ArrayType(const Vector &) { init(); }
+
+      KOKKOS_INLINE_FUNCTION
+      void init() {
+        for (size_type i = 0; i < MAX_VEC_SIZE; ++i) m_data[i] = 0;
+      }
+
+      KOKKOS_INLINE_FUNCTION
+      ArrayType &operator+=(const ArrayType &rhs_) {
+        for (size_type i = 0; i < MAX_VEC_SIZE; ++i) m_data[i] += rhs_.m_data[i];
+        return *this;
+      }
+    };
+
+    struct SumArray {
+      using reducer          = SumArray;
+      using value_type       = ArrayType;
+      using result_view_type = Kokkos::View<value_type *, execution_space, Kokkos::MemoryUnmanaged>;
+
+     private:
+      value_type &m_value;
+
+     public:
+      KOKKOS_INLINE_FUNCTION
+      SumArray(value_type &value) : m_value(value) {}
+
+      KOKKOS_INLINE_FUNCTION
+      void join(value_type &dest, const value_type &src) const { dest += src; }
+
+      KOKKOS_INLINE_FUNCTION
+      void init(value_type &val) const { val.init(); }
+
+      KOKKOS_INLINE_FUNCTION
+      value_type &reference() const { return m_value; }
+
+      KOKKOS_INLINE_FUNCTION
+      result_view_type view() const { return result_view_type(&m_value, 1); }
+
+      KOKKOS_INLINE_FUNCTION
+      bool references_scalar() const { return true; }
+    };
+
     RowMapType row_map;
     EntriesType entries;
     ValuesType values;
@@ -215,6 +281,7 @@ struct SptrsvWrap {
           block_size(block_size_),
           block_items(block_size * block_size) {
       KK_REQUIRE_MSG(block_size > 0, "Tried to use block_size=0 with the blocked Common?");
+      KK_REQUIRE_MSG(block_size <= MAX_VEC_SIZE, "Max supported block size is " << MAX_VEC_SIZE);
     }
 
     KOKKOS_INLINE_FUNCTION
@@ -257,17 +324,17 @@ struct SptrsvWrap {
       team.team_barrier();
      KokkosBatched::TeamLU<member_type, KokkosBatched::Algo::LU::Blocked>::invoke(team, LU);
 
-      // A = LU
-      // A^-1 = U^-1 * L^-1
-      // b = (b * U^-1) * L^-1, so do U trsv first
+      // Ax = LUx = Lz = b, where we use the change of variables z = U*x.
+      // z = L^-1 * b: first, solve for z, storing the result back into b.
+      // x = U^-1 * z: second, solve for x, again storing the result back into b.
+      team.team_barrier();
+      KokkosBatched::TeamTrsv<member_type, KokkosBatched::Uplo::Lower, KokkosBatched::Trans::NoTranspose,
+                              KokkosBatched::Diag::Unit, KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, LU, b);
       team.team_barrier();
-      KokkosBatched::TeamTrsv<member_type, KokkosBatched::Uplo::Upper, KokkosBatched::Trans::NoTranspose,
-                              KokkosBatched::Diag::NonUnit, KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, LU, b);
-
-      team.team_barrier();
-      KokkosBatched::TeamTrsv<member_type, KokkosBatched::Uplo::Lower, KokkosBatched::Trans::NoTranspose,
-                              KokkosBatched::Diag::Unit, KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, LU, b);
+      KokkosBatched::TeamTrsv<member_type, KokkosBatched::Uplo::Upper, KokkosBatched::Trans::NoTranspose,
+                              KokkosBatched::Diag::NonUnit, KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, LU, b);
     }
 
     // serial divide. b /= A (b = b * A^-1)
@@ -278,21 +345,44 @@ struct SptrsvWrap {
       // Need a temp block to do LU of A
       const auto block_size_ = b.size();
-      KK_KERNEL_REQUIRE_MSG(block_size_ <= MAX_VEC_SIZE,
-                            "Max supported block size for range-policy is 16. Use team-policy alg if you need more.");
       Block LU(&buff[0], block_size_, block_size_);
       assign(LU, A);
      KokkosBatched::SerialLU<KokkosBatched::Algo::LU::Blocked>::invoke(LU);
 
-      // A = LU
-      // A^-1 = U^-1 * L^-1
-      // b = (b * U^-1) * L^-1, so do U trsv first
+      // Ax = LUx = Lz = b, where we use the change of variables z = U*x.
+      // z = L^-1 * b: first, solve for z, storing the result back into b.
+      // x = U^-1 * z: second, solve for x, again storing the result back into b.
+      KokkosBatched::SerialTrsv<KokkosBatched::Uplo::Lower, KokkosBatched::Trans::NoTranspose,
+                                KokkosBatched::Diag::Unit, KokkosBatched::Algo::Trsv::Blocked>::invoke(1.0, LU, b);
+      KokkosBatched::SerialTrsv<KokkosBatched::Uplo::Upper, KokkosBatched::Trans::NoTranspose,
+                                KokkosBatched::Diag::NonUnit, KokkosBatched::Algo::Trsv::Blocked>::invoke(1.0, LU, b);
+    }
 
-      KokkosBatched::SerialTrsv<KokkosBatched::Uplo::Upper, KokkosBatched::Trans::NoTranspose,
-                                KokkosBatched::Diag::NonUnit, KokkosBatched::Algo::Trsv::Blocked>::invoke(1.0, LU, b);
-
-      KokkosBatched::SerialTrsv<KokkosBatched::Uplo::Lower, KokkosBatched::Trans::NoTranspose,
-                                KokkosBatched::Diag::Unit, KokkosBatched::Algo::Trsv::Blocked>::invoke(1.0, LU, b);
-    }
+    // multiply_subtract. C -= A * B
+    KOKKOS_INLINE_FUNCTION
+    static void multiply_subtract(const CBlock &A, const CVector &b, ArrayType &ca) {
+      Vector c(&ca.m_data[0], b.size());
+      multiply_subtract(A, b, c);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    static void multiply_subtract(const CBlock &A, const CVector &b, Vector &c) {
+      // Use gemv. alpha is hardcoded to -1 and beta to 1
+      KokkosBlas::SerialGemv<KokkosBlas::Trans::NoTranspose, KokkosBlas::Algo::Gemv::Blocked>::invoke(-1.0, A, b, 1.0,
+                                                                                                      c);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    static void copy(const member_type &team, const Vector &lhs_, ArrayType &rhsa) {
+      CVector rhs_(&rhsa.m_data[0], lhs_.size());
+      assign(team, lhs_, rhs_);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    static void copy(const Vector &lhs_, ArrayType &rhsa) {
+      CVector rhs_(&rhsa.m_data[0], lhs_.size());
+      assign(lhs_, rhs_);
+    }
 
     // lget
@@ -331,58 +421,69 @@ struct SptrsvWrap {
    */
   template <bool BlockEnabled>
   struct Intermediate : public Common<BlockEnabled> {
-    using Base = Common<BlockEnabled>;
+    using Base    = Common<BlockEnabled>;
+    using accum_t = std::conditional_t<BlockEnabled, typename Base::ArrayType, scalar_t>;
 
     Intermediate(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_,
                  const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, const size_type block_size_ = 0)
         : Base(row_map_, entries_, values_, lhs_, rhs_, nodes_grouped_by_level_, block_size_) {}
 
-    struct ReduceFunctorBasic {
+    struct ReduceSumFunctor {
       const Base *m_obj;
+      const lno_t rowid;
+      lno_t diag;
 
       KOKKOS_INLINE_FUNCTION
-      ReduceFunctorBasic(const Base *obj, const lno_t = 0) : m_obj(obj) {}
-
-      KOKKOS_INLINE_FUNCTION
-      static void multiply_subtract(const scalar_t &val, const scalar_t &lhs_col_val, scalar_t &accum) {
-        accum -= val * lhs_col_val;
-      }
-
-      KOKKOS_INLINE_FUNCTION
-      void operator()(size_type i, scalar_t &accum) const {
+      void operator()(size_type i, accum_t &accum) const {
         const auto colid = m_obj->entries(i);
-        multiply_subtract(m_obj->vget(i), m_obj->lget(colid), accum);
+        auto val         = m_obj->vget(i);
+        auto lhs_colid   = m_obj->lget(colid);
+        // accum -= val * lhs_colid;
+        if constexpr (BlockEnabled) {
+          accum_t temp;
+          Base::multiply_subtract(val, lhs_colid, temp);
+          accum += temp;
+        } else {
+          Base::multiply_subtract(val, lhs_colid, accum);
+        }
+        KK_KERNEL_ASSERT_MSG(colid != rowid, "Should not have hit diag");
       }
     };
 
-    struct ReduceFunctorBlock : public ReduceFunctorBasic {
-      using P = ReduceFunctorBasic;
-
-      const size_type block_size;
-      const size_type b;
-
-      KOKKOS_INLINE_FUNCTION
-      ReduceFunctorBlock(const Base *obj, const size_type block_size_, const size_type b_, const lno_t = 0)
-          : P(obj), block_size(block_size_), b(b_) {}
+    struct ReduceSumDiagFunctor {
+      const Base *m_obj;
+      const lno_t rowid;
+      mutable lno_t diag;
 
       KOKKOS_INLINE_FUNCTION
-      void operator()(size_type i, scalar_t &accum) const {
-        const auto idx   = i / block_size;
-        const auto colid = P::m_obj->entries(idx);
-        P::multiply_subtract(P::m_obj->vget(idx)(b, i % block_size), P::m_obj->lget(colid)(b), accum);
+      void operator()(size_type i, accum_t &accum) const {
+        const auto colid = m_obj->entries(i);
+        if (colid != rowid) {
+          auto val       = m_obj->vget(i);
+          auto lhs_colid = m_obj->lget(colid);
+          // accum -= val * lhs_colid;
+          if constexpr (BlockEnabled) {
+            accum_t temp;
+            Base::multiply_subtract(val, lhs_colid, temp);
+            accum += temp;
+          } else {
+            Base::multiply_subtract(val, lhs_colid, accum);
+          }
+        } else {
+          diag = i;
+        }
       }
     };
 
-    /**
-     * If we want to support Unsorted, we'll need a Functor that returns the ptr
-     * of the diag item (colid == rowid). Possibly via multi-reduce? The UnsortedTag
-     * is defined above but no policies actually use it.
-     */
-    template <bool IsSerial, bool IsSorted, bool IsLower, bool BlockEnabled, bool UseThreadVec = false>
+    template <bool IsSerial, bool IsSorted, bool IsLower, bool UseThreadVec = false>
     KOKKOS_INLINE_FUNCTION void solve_impl(const member_type *team, const int my_rank, const long node_count) const {
+      using reduce_item_t = typename Base::ArrayType;
+      using reducer_t     = typename Base::SumArray;
+      using functor_t     = std::conditional_t<IsSorted, ReduceSumFunctor, ReduceSumDiagFunctor>;
+
+      static_assert(!((!IsSerial && BlockEnabled) && UseThreadVec),
+                    "ThreadVectorRanges are not yet supported for block-enabled");
       static_assert(!(IsSerial && UseThreadVec), "Requested thread vector range in serial?");
-      static_assert(IsSorted, "Unsorted is not yet supported.");
 
       const auto rowid   = Base::nodes_grouped_by_level(my_rank + node_count);
       const auto soffset = Base::row_map(rowid);
@@ -394,76 +495,58 @@ struct SptrsvWrap {
       const auto itr_e = eoffset - (IsSorted ? (IsLower ? 1 : 0) : 0);
 
       // We don't need the reducer to find the diag item if sorted
+      functor_t rf{this, rowid, -1};
       typename Base::reftype lhs_val = Base::lget(rowid);
-
-      const auto block_size_ = BlockEnabled ? Base::get_block_size() : 1;
-      (void)block_size_;  // Some settings do not use this var
+      reduce_item_t reduce           = lhs_val;
 
       if constexpr (IsSerial) {
         KK_KERNEL_ASSERT_MSG(my_rank == 0, "Non zero rank in serial");
         KK_KERNEL_ASSERT_MSG(team == nullptr, "Team provided in serial?");
-        if constexpr (BlockEnabled) {
-          for (size_type b = 0; b < block_size_; ++b) {
-            ReduceFunctorBlock rf(this, block_size_, b, rowid);
-            for (size_type i = itr_b * block_size_; i < itr_e * block_size_; ++i) {
-              rf(i, lhs_val(b));
-            }
-          }
-        } else {
-          ReduceFunctorBasic rf(this, rowid);
-          for (size_type i = itr_b; i < itr_e; ++i) {
-            rf(i, lhs_val);
-          }
+        for (auto ptr = itr_b; ptr < itr_e; ++ptr) {
+          rf(ptr, reduce);
         }
+        Base::copy(lhs_val, reduce);
       } else {
         KK_KERNEL_ASSERT_MSG(team != nullptr, "Cannot do team operations without team");
         if constexpr (!UseThreadVec) {
-          if constexpr (BlockEnabled) {
-            Kokkos::parallel_for(Kokkos::TeamThreadRange(*team, block_size_), [&](size_type b) {
-              ReduceFunctorBlock rf(this, block_size_, b, rowid);
-              Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(*team, itr_b * block_size_, itr_e * block_size_), rf,
-                                      lhs_val(b));
-            });
-          } else {
-            ReduceFunctorBasic rf(this, rowid);
-            Kokkos::parallel_reduce(Kokkos::TeamThreadRange(*team, itr_b, itr_e), rf, lhs_val);
-          }
+          Kokkos::parallel_reduce(Kokkos::TeamThreadRange(*team, itr_b, itr_e), rf, reducer_t(reduce));
+          team->team_barrier();
+          Base::copy(*team, lhs_val, reduce);
           team->team_barrier();
         } else {
-          if constexpr (BlockEnabled) {
-            Kokkos::parallel_for(Kokkos::ThreadVectorRange(*team, block_size_), [&](size_type b) {
-              ReduceFunctorBlock rf(this, block_size_, b, rowid);
-              for (size_type i = itr_b * block_size_; i < itr_e * block_size_; ++i) {
-                rf(i, lhs_val(b));
-              }
-            });
-          } else {
-            ReduceFunctorBasic rf(this, rowid);
-            Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(*team, itr_b, itr_e), rf, lhs_val);
-          }
+          Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(*team, itr_b, itr_e), rf, reducer_t(reduce));
+          Base::copy(lhs_val, reduce);
         }
       }
 
-      // If sorted, we already know the diag. Otherwise, get it from the reducer
-      const lno_t diag = IsLower ? eoffset - 1 : soffset;
+      // If sorted, we already know the diag. Otherwise, get it from the functor
+      rf.diag = IsSorted ? (IsLower ? eoffset - 1 : soffset) : rf.diag;
 
       // At end, handle the diag element. We need to be careful to avoid race
       // conditions here.
       if constexpr (IsSerial) {
         // Serial case is easy: there's only 1 thread, so just do the
         // add_and_divide
-        KK_KERNEL_ASSERT_MSG(diag != -1, "Serial should always know diag");
-        Base::add_and_divide(lhs_val, rhs_val, Base::vget(diag));
+        KK_KERNEL_ASSERT_MSG(rf.diag != -1, "Serial should always know diag");
+        Base::add_and_divide(lhs_val, rhs_val, Base::vget(rf.diag));
       } else {
-        // Parallel sorted case is complex. All threads know what the diag is.
-        // If we have a team sharing the work, we need to ensure only one
-        // thread performs the add_and_divide (except in BlockEnabled, then
-        // we can use team operations).
-        KK_KERNEL_ASSERT_MSG(diag != -1, "Sorted should always know diag");
-        if constexpr (!UseThreadVec) {
-          Base::add_and_divide(*team, lhs_val, rhs_val, Base::vget(diag));
+        if constexpr (IsSorted) {
+          // Parallel sorted case is complex. All threads know what the diag is.
+          // If we have a team sharing the work, we need to ensure only one
+          // thread performs the add_and_divide (except when BlockEnabled, where
+          // we can use team operations).
+          KK_KERNEL_ASSERT_MSG(rf.diag != -1, "Sorted should always know diag");
+          if constexpr (!UseThreadVec) {
+            Base::add_and_divide(*team, lhs_val, rhs_val, Base::vget(rf.diag));
+          } else {
+            Base::add_and_divide(lhs_val, rhs_val, Base::vget(rf.diag));
+          }
         } else {
-          Base::add_and_divide(lhs_val, rhs_val, Base::vget(diag));
+          // Parallel unsorted case. Only one thread will have found the diag
+          // item; have that thread do the add_and_divide.
+          if (rf.diag != -1) {
+            Base::add_and_divide(lhs_val, rhs_val, Base::vget(rf.diag));
+          }
         }
       }
     }
diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_sptrsv_handle.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_sptrsv_handle.hpp
index b6ca3dacfd3f..37feace62e9d 100644
--- a/packages/kokkos-kernels/sparse/src/KokkosSparse_sptrsv_handle.hpp
+++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_sptrsv_handle.hpp
@@ -471,6 +471,14 @@ class SPTRSVHandle {
 #endif
     }
 
+#if defined(__clang__) && defined(KOKKOS_ENABLE_CUDA)
+    if (algm == SPTRSVAlgorithm::SEQLVLSCHD_TP1 && Kokkos::ArithTraits<scalar_t>::isComplex &&
+        std::is_same_v<execution_space, Kokkos::Cuda> && block_size_ != 0) {
+      throw(std::runtime_error(
+          "sptrsv handle: SPTRSV may not work with blocks+clang+cuda+complex due to a compiler bug"));
+    }
+#endif
+
 #ifdef KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV
     if (lower_tri) {
       // lower-triangular is stored in CSC
diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_sptrsv.hpp b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_sptrsv.hpp
index 385367bca2df..a3f3f4ab7aee 100644
--- a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_sptrsv.hpp
+++ b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_sptrsv.hpp
@@ -179,6 +179,15 @@ struct SptrsvTest {
     const size_type nrows = row_map.size() - 1;
 
     for (auto alg : algs) {
+      // FIXME CUDA+Clang+Complex seems to expose a compiler bug
+#if defined(__clang__) && defined(KOKKOS_ENABLE_CUDA)
+      if (alg == SPTRSVAlgorithm::SEQLVLSCHD_TP1 && Kokkos::ArithTraits<scalar_t>::isComplex &&
+          std::is_same_v<typename device::execution_space, Kokkos::Cuda> && block_size != 0) {
+        std::cerr << "Skipping TP1 alg test for blocked mtx. There's a compiler bug "
+                  << "for clang+CUDA+complex" << std::endl;
+        continue;
+      }
+#endif
       KernelHandle kh;
       kh.create_sptrsv_handle(alg, nrows, is_lower, block_size);
       if (alg == SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) {
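
Note for readers: the corrected comments in divide() describe the standard two-stage solve that
replaces the old (wrong) "U trsv first" order: factor A = LU once (unit-diagonal L), then
b <- L^-1 * b followed by b <- U^-1 * b. The sketch below shows that same sequence with plain
loops on a 2x2 system. It is illustrative only: lu_solve and its pivot-free Doolittle
factorization are not part of the patch, which uses KokkosBatched::SerialLU/SerialTrsv instead.

#include <cstdio>
#include <vector>

// Solve A x = b for a small dense n x n block the way divide() does:
// factor A = L U in place (unit-diagonal L), then z = L^-1 b, then x = U^-1 z.
void lu_solve(std::vector<double> A, std::vector<double> &b, int n) {
  // In-place LU factorization without pivoting (Doolittle); L's unit diagonal
  // is implicit, multipliers are stored in A's strictly-lower triangle.
  for (int k = 0; k < n; ++k) {
    for (int i = k + 1; i < n; ++i) {
      A[i * n + k] /= A[k * n + k];
      for (int j = k + 1; j < n; ++j) A[i * n + j] -= A[i * n + k] * A[k * n + j];
    }
  }
  // Forward solve L z = b (unit diagonal), storing z back into b.
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < i; ++j) b[i] -= A[i * n + j] * b[j];
  // Backward solve U x = z, storing x back into b.
  for (int i = n - 1; i >= 0; --i) {
    for (int j = i + 1; j < n; ++j) b[i] -= A[i * n + j] * b[j];
    b[i] /= A[i * n + i];
  }
}

int main() {
  // A = [[2, 1], [1, 3]], b = [3, 5]  ->  x = [0.8, 1.4]
  std::vector<double> A = {2, 1, 1, 3};
  std::vector<double> b = {3, 5};
  lu_solve(A, b, 2);
  std::printf("x = [%g, %g]\n", b[0], b[1]);
  return 0;
}

Doing the L solve first matters because the factorization solves A x = b as L(Ux) = b; reversing
the order, as the removed code effectively did, answers a different (transposed-order) question.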
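Note for readers: SumArray above follows the standard Kokkos custom-reducer concept (the nested
reducer/value_type/result_view_type typedefs plus join, init, reference, view, and
references_scalar), which is what lets an entire block row be accumulated in a single
parallel_reduce instead of one reduction per vector lane. Below is a minimal self-contained
sketch of the same pattern; the names Vec, VecSum, and N are illustrative (not from the patch),
and it assumes a recent Kokkos release where the non-volatile join overload is sufficient.

#include <Kokkos_Core.hpp>

constexpr int N = 4;

// Fixed-size array accumulator, mirroring the ArrayType idea in the patch.
struct Vec {
  double data[N];
  KOKKOS_INLINE_FUNCTION Vec() {
    for (int i = 0; i < N; ++i) data[i] = 0.0;
  }
  KOKKOS_INLINE_FUNCTION Vec &operator+=(const Vec &rhs) {
    for (int i = 0; i < N; ++i) data[i] += rhs.data[i];
    return *this;
  }
};

// Custom reducer satisfying the Kokkos reducer concept, like SumArray above.
struct VecSum {
  using reducer          = VecSum;
  using value_type       = Vec;
  using result_view_type = Kokkos::View<Vec *, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;

  Vec &m_value;

  KOKKOS_INLINE_FUNCTION VecSum(Vec &value) : m_value(value) {}
  KOKKOS_INLINE_FUNCTION void join(Vec &dest, const Vec &src) const { dest += src; }
  KOKKOS_INLINE_FUNCTION void init(Vec &val) const { val = Vec(); }
  KOKKOS_INLINE_FUNCTION Vec &reference() const { return m_value; }
  KOKKOS_INLINE_FUNCTION result_view_type view() const { return result_view_type(&m_value, 1); }
  KOKKOS_INLINE_FUNCTION bool references_scalar() const { return true; }
};

int main(int argc, char *argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Vec result;
    // Accumulate i into every slot of the array in one reduction.
    Kokkos::parallel_reduce(
        "vec_sum", 100,
        KOKKOS_LAMBDA(const int i, Vec &acc) {
          for (int j = 0; j < N; ++j) acc.data[j] += i;
        },
        VecSum(result));
    // Each slot now holds 0 + 1 + ... + 99 = 4950.
  }
  Kokkos::finalize();
  return 0;
}

This is also why the spelling fix to references_scalar() matters: that exact member name is part
of the reducer concept Kokkos looks for when the reducer is passed as the result argument of
parallel_reduce.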