diff --git a/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp b/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp index a106d0ae8f..d7d84ea253 100644 --- a/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp @@ -62,10 +62,12 @@ namespace KokkosBatched { template template + typename KrylovHandleType, typename TMPViewType, + typename TMPNormViewType> KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke( const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const KrylovHandleType& handle) { + const VectorViewType& _X, const KrylovHandleType& handle, + const TMPViewType& _TMPView, const TMPNormViewType& _TMPNormView) { typedef int OrdinalType; typedef typename Kokkos::Details::ArithTraits< typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; @@ -73,41 +75,30 @@ KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke( const size_t maximum_iteration = handle.get_max_iteration(); const MagnitudeType tolerance = handle.get_tolerance(); - using ScratchPadNormViewType = Kokkos::View< - MagnitudeType*, - typename VectorViewType::execution_space::scratch_memory_space>; - using ScratchPadVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type**, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; using TeamVectorCopy1D = TeamVectorCopy; const OrdinalType numMatrices = _X.extent(0); const OrdinalType numRows = _X.extent(1); - ScratchPadVectorViewType P( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - numRows); - ScratchPadVectorViewType Q( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - numRows); - ScratchPadVectorViewType R( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - numRows); - ScratchPadVectorViewType X( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - numRows); - - ScratchPadNormViewType sqr_norm_0( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices); - ScratchPadNormViewType sqr_norm_j( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices); - ScratchPadNormViewType alpha( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices); - ScratchPadNormViewType mask( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices); - ScratchPadNormViewType tmp( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + int offset_P = 0; + int offset_Q = offset_P + numRows; + int offset_R = offset_Q + numRows; + int offset_X = offset_R + numRows; + + auto P = Kokkos::subview(_TMPView, Kokkos::ALL, + Kokkos::make_pair(offset_P, offset_P + numRows)); + auto Q = Kokkos::subview(_TMPView, Kokkos::ALL, + Kokkos::make_pair(offset_Q, offset_Q + numRows)); + auto R = Kokkos::subview(_TMPView, Kokkos::ALL, + Kokkos::make_pair(offset_R, offset_R + numRows)); + auto X = Kokkos::subview(_TMPView, Kokkos::ALL, + Kokkos::make_pair(offset_X, offset_X + numRows)); + + auto sqr_norm_0 = Kokkos::subview(_TMPNormView, Kokkos::ALL, 0); + auto sqr_norm_j = Kokkos::subview(_TMPNormView, Kokkos::ALL, 1); + auto alpha = Kokkos::subview(_TMPNormView, Kokkos::ALL, 2); + auto mask = Kokkos::subview(_TMPNormView, Kokkos::ALL, 3); + auto tmp = Kokkos::subview(_TMPNormView, Kokkos::ALL, 4); TeamVectorCopy::invoke(member, _X, X); // Deep copy of b into r_0: @@ -200,6 +191,61 @@ KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke( TeamVectorCopy::invoke(member, X, _X); return status; } + +template +template +KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke( + const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const KrylovHandleType& handle) { + const int strategy = handle.get_memory_strategy(); + if (strategy == 0) { + using ScratchPadVectorViewType = Kokkos::View< + typename VectorViewType::non_const_value_type**, + typename VectorViewType::array_layout, + typename VectorViewType::execution_space::scratch_memory_space>; + using ScratchPadNormViewType = Kokkos::View< + typename Kokkos::Details::ArithTraits< + typename VectorViewType::non_const_value_type>::mag_type**, + typename VectorViewType::execution_space::scratch_memory_space>; + + const int numMatrices = _X.extent(0); + const int numRows = _X.extent(1); + + ScratchPadVectorViewType _TMPView( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + 4 * numRows); + + ScratchPadNormViewType _TMPNormView( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, 5); + + return invoke( + member, A, _B, _X, handle, _TMPView, _TMPNormView); + } + if (strategy == 1) { + const int first_matrix = handle.first_index(member.league_rank()); + const int last_matrix = handle.last_index(member.league_rank()); + + using ScratchPadNormViewType = Kokkos::View< + typename Kokkos::Details::ArithTraits< + typename VectorViewType::non_const_value_type>::mag_type**, + typename VectorViewType::execution_space::scratch_memory_space>; + + const int numMatrices = _X.extent(0); + + auto _TMPView = Kokkos::subview( + handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL); + + ScratchPadNormViewType _TMPNormView( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, 5); + + return invoke( + member, A, _B, _X, handle, _TMPView, _TMPNormView); + } + return 0; +} + } // namespace KokkosBatched #endif diff --git a/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp b/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp index cd7a478548..adb5b19121 100644 --- a/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp @@ -60,10 +60,12 @@ namespace KokkosBatched { /// template -template +template KOKKOS_INLINE_FUNCTION int TeamCG::invoke( const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const KrylovHandle& handle) { + const VectorViewType& _X, const KrylovHandle& handle, + const TMPViewType& _TMPView, const TMPNormViewType& _TMPNormView) { typedef int OrdinalType; typedef typename Kokkos::Details::ArithTraits< typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; @@ -71,41 +73,30 @@ KOKKOS_INLINE_FUNCTION int TeamCG::invoke( size_t maximum_iteration = handle.get_max_iteration(); const MagnitudeType tolerance = handle.get_tolerance(); - using ScratchPadNormViewType = Kokkos::View< - MagnitudeType*, - typename VectorViewType::execution_space::scratch_memory_space>; - using ScratchPadVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type**, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; using TeamCopy1D = TeamCopy; const OrdinalType numMatrices = _X.extent(0); const OrdinalType numRows = _X.extent(1); - ScratchPadVectorViewType P( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - numRows); - ScratchPadVectorViewType Q( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - numRows); - ScratchPadVectorViewType R( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - numRows); - ScratchPadVectorViewType X( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - numRows); - - ScratchPadNormViewType sqr_norm_0( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices); - ScratchPadNormViewType sqr_norm_j( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices); - ScratchPadNormViewType alpha( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices); - ScratchPadNormViewType mask( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices); - ScratchPadNormViewType tmp( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + int offset_P = 0; + int offset_Q = offset_P + numRows; + int offset_R = offset_Q + numRows; + int offset_X = offset_R + numRows; + + auto P = Kokkos::subview(_TMPView, Kokkos::ALL, + Kokkos::make_pair(offset_P, offset_P + numRows)); + auto Q = Kokkos::subview(_TMPView, Kokkos::ALL, + Kokkos::make_pair(offset_Q, offset_Q + numRows)); + auto R = Kokkos::subview(_TMPView, Kokkos::ALL, + Kokkos::make_pair(offset_R, offset_R + numRows)); + auto X = Kokkos::subview(_TMPView, Kokkos::ALL, + Kokkos::make_pair(offset_X, offset_X + numRows)); + + auto sqr_norm_0 = Kokkos::subview(_TMPNormView, Kokkos::ALL, 0); + auto sqr_norm_j = Kokkos::subview(_TMPNormView, Kokkos::ALL, 1); + auto alpha = Kokkos::subview(_TMPNormView, Kokkos::ALL, 2); + auto mask = Kokkos::subview(_TMPNormView, Kokkos::ALL, 3); + auto tmp = Kokkos::subview(_TMPNormView, Kokkos::ALL, 4); TeamCopy::invoke(member, _X, X); // Deep copy of b into r_0: @@ -199,6 +190,60 @@ KOKKOS_INLINE_FUNCTION int TeamCG::invoke( return status; } +template +template +KOKKOS_INLINE_FUNCTION int TeamCG::invoke( + const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const KrylovHandleType& handle) { + const int strategy = handle.get_memory_strategy(); + if (strategy == 0) { + using ScratchPadVectorViewType = Kokkos::View< + typename VectorViewType::non_const_value_type**, + typename VectorViewType::array_layout, + typename VectorViewType::execution_space::scratch_memory_space>; + using ScratchPadNormViewType = Kokkos::View< + typename Kokkos::Details::ArithTraits< + typename VectorViewType::non_const_value_type>::mag_type**, + typename VectorViewType::execution_space::scratch_memory_space>; + + const int numMatrices = _X.extent(0); + const int numRows = _X.extent(1); + + ScratchPadVectorViewType _TMPView( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + 4 * numRows); + + ScratchPadNormViewType _TMPNormView( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, 5); + + return invoke( + member, A, _B, _X, handle, _TMPView, _TMPNormView); + } + if (strategy == 1) { + const int first_matrix = handle.first_index(member.league_rank()); + const int last_matrix = handle.last_index(member.league_rank()); + + using ScratchPadNormViewType = Kokkos::View< + typename Kokkos::Details::ArithTraits< + typename VectorViewType::non_const_value_type>::mag_type**, + typename VectorViewType::execution_space::scratch_memory_space>; + + const int numMatrices = _X.extent(0); + + auto _TMPView = Kokkos::subview( + handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL); + + ScratchPadNormViewType _TMPNormView( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, 5); + + return invoke( + member, A, _B, _X, handle, _TMPView, _TMPNormView); + } + return 0; +} + } // namespace KokkosBatched #endif diff --git a/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp b/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp index 7fdf244fa7..c858d36c6f 100644 --- a/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp @@ -66,20 +66,18 @@ namespace KokkosBatched { template template + typename PrecOperatorType, typename KrylovHandleType, + typename ArnoldiViewType, typename TMPViewType> KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( const MemberType& member, const OperatorType& A, const VectorViewType& _B, const VectorViewType& _X, const PrecOperatorType& P, - const KrylovHandleType& handle) { + const KrylovHandleType& handle, const ArnoldiViewType& _ArnoldiView, + const TMPViewType& _TMPView) { typedef int OrdinalType; typedef typename Kokkos::Details::ArithTraits< typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; typedef Kokkos::Details::ArithTraits ATM; - using ScratchPadVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type**, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; using TeamVectorCopy1D = TeamVectorCopy; const OrdinalType numMatrices = _X.extent(0); @@ -99,51 +97,36 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( int offset_H = offset_V + n_V; int offset_Givens = offset_H + n_H; - const int first_matrix = handle.first_index(member.league_rank()); - const int last_matrix = handle.last_index(member.league_rank()); - - auto V_view = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, Kokkos::make_pair(offset_V, offset_V + n_V)); - auto H_view = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, Kokkos::make_pair(offset_H, offset_H + n_H)); + auto V_view = Kokkos::subview(_ArnoldiView, Kokkos::ALL, Kokkos::ALL, + Kokkos::make_pair(offset_V, offset_V + n_V)); + auto H_view = Kokkos::subview(_ArnoldiView, Kokkos::ALL, Kokkos::ALL, + Kokkos::make_pair(offset_H, offset_H + n_H)); auto Givens_view = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); + _ArnoldiView, Kokkos::ALL, Kokkos::ALL, + Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); int n_G = maximum_iteration + 1; int n_W = numRows; - int n_X = numRows; int n_mask = 1; - int n_tmp = 1; int offset_G = 0; int offset_W = offset_G + n_G; - int offset_X = offset_W + n_W; - int offset_mask = offset_X + n_X; + int offset_mask = offset_W + n_W; int offset_tmp = offset_mask + n_mask; - ScratchPadVectorViewType tmp_2D( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - n_G + n_W + n_X + n_mask + n_tmp); - - auto G = Kokkos::subview(tmp_2D, Kokkos::ALL, + auto G = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_G, offset_G + n_G)); - auto W = Kokkos::subview(tmp_2D, Kokkos::ALL, + auto W = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_W, offset_W + n_W)); - auto X = Kokkos::subview(tmp_2D, Kokkos::ALL, - Kokkos::make_pair(offset_X, offset_X + n_X)); - auto mask = Kokkos::subview(tmp_2D, Kokkos::ALL, offset_mask); - auto tmp = Kokkos::subview(tmp_2D, Kokkos::ALL, offset_tmp); + auto mask = Kokkos::subview(_TMPView, Kokkos::ALL, offset_mask); + auto tmp = Kokkos::subview(_TMPView, Kokkos::ALL, offset_tmp); - TeamVectorCopy::invoke(member, _X, X); // Deep copy of b into r_0: TeamVectorCopy::invoke(member, _B, W); // r_0 := b - A x_0 member.team_barrier(); - A.template apply(member, X, W, -1, 1); + A.template apply(member, _X, W, -1, 1); member.team_barrier(); P.template apply(member, W, W); @@ -333,26 +316,23 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( TeamVectorGemv::invoke( member, 1, Kokkos::subview(V_view, Kokkos::ALL, first_indices, Kokkos::ALL), - Kokkos::subview(G, Kokkos::ALL, first_indices), 1, X); - member.team_barrier(); // Finish writing to X + Kokkos::subview(G, Kokkos::ALL, first_indices), 1, _X); + member.team_barrier(); // Finish writing to _X } if (handle.get_ortho_strategy() == 1) { for (size_t j = 0; j < maximum_iteration; ++j) { TeamVectorAxpy::invoke( member, Kokkos::subview(G, Kokkos::ALL, j), - Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), X); - member.team_barrier(); // Finish writing to X + Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), _X); + member.team_barrier(); // Finish writing to _X } } - TeamVectorCopy::invoke(member, X, _X); - - member.team_barrier(); - if (handle.get_compute_last_residual()) { TeamVectorCopy::invoke(member, _B, W); member.team_barrier(); - A.template apply(member, X, W, -1, 1); + A.template apply(member, _X, W, -1, + 1); member.team_barrier(); P.template apply(member, W, W); member.team_barrier(); @@ -369,6 +349,101 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( return status; } +template +template +KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( + const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const PrecOperatorType& P, + const KrylovHandleType& handle) { + const int strategy = handle.get_memory_strategy(); + if (strategy == 0) { + const int first_matrix = handle.first_index(member.league_rank()); + const int last_matrix = handle.last_index(member.league_rank()); + + auto _ArnoldiView = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::ALL); + + const int numMatrices = _X.extent(0); + const int numRows = _X.extent(1); + + size_t maximum_iteration = handle.get_max_iteration() < numRows + ? handle.get_max_iteration() + : numRows; + + int n_G = maximum_iteration + 1; + int n_W = numRows; + int n_mask = 1; + int n_tmp = 1; + + using ScratchPadVectorViewType = Kokkos::View< + typename VectorViewType::non_const_value_type**, + typename VectorViewType::array_layout, + typename VectorViewType::execution_space::scratch_memory_space>; + + ScratchPadVectorViewType _TMPView( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + n_G + n_W + n_mask + n_tmp); + + return invoke(member, A, _B, _X, P, handle, _ArnoldiView, + _TMPView); + } + if (strategy == 1) { + const int first_matrix = handle.first_index(member.league_rank()); + const int last_matrix = handle.last_index(member.league_rank()); + + auto _ArnoldiView = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::ALL); + + auto _TMPView = Kokkos::subview( + handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL); + + return invoke(member, A, _B, _X, P, handle, _ArnoldiView, + _TMPView); + } + if (strategy == 2) { + using ScratchPadArnoldiViewType = Kokkos::View< + typename VectorViewType::non_const_value_type***, + typename VectorViewType::array_layout, + typename VectorViewType::execution_space::scratch_memory_space>; + + using ScratchPadVectorViewType = Kokkos::View< + typename VectorViewType::non_const_value_type**, + typename VectorViewType::array_layout, + typename VectorViewType::execution_space::scratch_memory_space>; + + const int numMatrices = _X.extent(0); + const int numRows = _X.extent(1); + + size_t maximum_iteration = handle.get_max_iteration() < numRows + ? handle.get_max_iteration() + : numRows; + + int n_G = maximum_iteration + 1; + int n_W = numRows; + int n_mask = 1; + int n_tmp = 1; + + ScratchPadArnoldiViewType _ArnoldiView( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + maximum_iteration, numRows + maximum_iteration + 3); + + ScratchPadVectorViewType _TMPView( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + n_G + n_W + n_mask + n_tmp); + + return invoke(member, A, _B, _X, P, handle, _ArnoldiView, + _TMPView); + } + return 0; +} + template template diff --git a/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp b/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp index 41ac90e61d..933ef97adf 100644 --- a/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp @@ -65,20 +65,18 @@ namespace KokkosBatched { template template + typename PrecOperatorType, typename KrylovHandleType, + typename ArnoldiViewType, typename TMPViewType> KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( const MemberType& member, const OperatorType& A, const VectorViewType& _B, const VectorViewType& _X, const PrecOperatorType& P, - const KrylovHandleType& handle) { + const KrylovHandleType& handle, const ArnoldiViewType& _ArnoldiView, + const TMPViewType& _TMPView) { typedef int OrdinalType; typedef typename Kokkos::Details::ArithTraits< typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; typedef Kokkos::Details::ArithTraits ATM; - using ScratchPadVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type**, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; using TeamCopy1D = TeamCopy; const OrdinalType numMatrices = _X.extent(0); @@ -98,51 +96,36 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( int offset_H = offset_V + n_V; int offset_Givens = offset_H + n_H; - const int first_matrix = handle.first_index(member.league_rank()); - const int last_matrix = handle.last_index(member.league_rank()); - - auto V_view = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, Kokkos::make_pair(offset_V, offset_V + n_V)); - auto H_view = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, Kokkos::make_pair(offset_H, offset_H + n_H)); + auto V_view = Kokkos::subview(_ArnoldiView, Kokkos::ALL, Kokkos::ALL, + Kokkos::make_pair(offset_V, offset_V + n_V)); + auto H_view = Kokkos::subview(_ArnoldiView, Kokkos::ALL, Kokkos::ALL, + Kokkos::make_pair(offset_H, offset_H + n_H)); auto Givens_view = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); + _ArnoldiView, Kokkos::ALL, Kokkos::ALL, + Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); int n_G = maximum_iteration + 1; int n_W = numRows; - int n_X = numRows; int n_mask = 1; - int n_tmp = 1; int offset_G = 0; int offset_W = offset_G + n_G; - int offset_X = offset_W + n_W; - int offset_mask = offset_X + n_X; + int offset_mask = offset_W + n_W; int offset_tmp = offset_mask + n_mask; - ScratchPadVectorViewType tmp_2D( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - n_G + n_W + n_X + n_mask + n_tmp); - - auto G = Kokkos::subview(tmp_2D, Kokkos::ALL, + auto G = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_G, offset_G + n_G)); - auto W = Kokkos::subview(tmp_2D, Kokkos::ALL, + auto W = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_W, offset_W + n_W)); - auto X = Kokkos::subview(tmp_2D, Kokkos::ALL, - Kokkos::make_pair(offset_X, offset_X + n_X)); - auto mask = Kokkos::subview(tmp_2D, Kokkos::ALL, offset_mask); - auto tmp = Kokkos::subview(tmp_2D, Kokkos::ALL, offset_tmp); + auto mask = Kokkos::subview(_TMPView, Kokkos::ALL, offset_mask); + auto tmp = Kokkos::subview(_TMPView, Kokkos::ALL, offset_tmp); - TeamCopy::invoke(member, _X, X); // Deep copy of b into r_0: TeamCopy::invoke(member, _B, W); // r_0 := b - A x_0 member.team_barrier(); - A.template apply(member, X, W, -1, 1); + A.template apply(member, _X, W, -1, 1); member.team_barrier(); P.template apply(member, W, W); @@ -330,26 +313,22 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( TeamGemv::invoke( member, 1, Kokkos::subview(V_view, Kokkos::ALL, first_indices, Kokkos::ALL), - Kokkos::subview(G, Kokkos::ALL, first_indices), 1, X); - member.team_barrier(); // Finish writing to X + Kokkos::subview(G, Kokkos::ALL, first_indices), 1, _X); + member.team_barrier(); // Finish writing to _X } if (handle.get_ortho_strategy() == 1) { for (size_t j = 0; j < maximum_iteration; ++j) { TeamAxpy::invoke( member, Kokkos::subview(G, Kokkos::ALL, j), - Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), X); - member.team_barrier(); // Finish writing to X + Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), _X); + member.team_barrier(); // Finish writing to _X } } - TeamCopy::invoke(member, X, _X); - - member.team_barrier(); - if (handle.get_compute_last_residual()) { TeamCopy::invoke(member, _B, W); member.team_barrier(); - A.template apply(member, X, W, -1, 1); + A.template apply(member, _X, W, -1, 1); member.team_barrier(); P.template apply(member, W, W); member.team_barrier(); @@ -366,6 +345,101 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( return status; } +template +template +KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( + const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const PrecOperatorType& P, + const KrylovHandleType& handle) { + const int strategy = handle.get_memory_strategy(); + if (strategy == 0) { + const int first_matrix = handle.first_index(member.league_rank()); + const int last_matrix = handle.last_index(member.league_rank()); + + auto _ArnoldiView = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::ALL); + + const int numMatrices = _X.extent(0); + const int numRows = _X.extent(1); + + size_t maximum_iteration = handle.get_max_iteration() < numRows + ? handle.get_max_iteration() + : numRows; + + int n_G = maximum_iteration + 1; + int n_W = numRows; + int n_mask = 1; + int n_tmp = 1; + + using ScratchPadVectorViewType = Kokkos::View< + typename VectorViewType::non_const_value_type**, + typename VectorViewType::array_layout, + typename VectorViewType::execution_space::scratch_memory_space>; + + ScratchPadVectorViewType _TMPView( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + n_G + n_W + n_mask + n_tmp); + + return invoke(member, A, _B, _X, P, handle, _ArnoldiView, + _TMPView); + } + if (strategy == 1) { + const int first_matrix = handle.first_index(member.league_rank()); + const int last_matrix = handle.last_index(member.league_rank()); + + auto _ArnoldiView = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::ALL); + + auto _TMPView = Kokkos::subview( + handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL); + + return invoke(member, A, _B, _X, P, handle, _ArnoldiView, + _TMPView); + } + if (strategy == 2) { + using ScratchPadArnoldiViewType = Kokkos::View< + typename VectorViewType::non_const_value_type***, + typename VectorViewType::array_layout, + typename VectorViewType::execution_space::scratch_memory_space>; + + using ScratchPadVectorViewType = Kokkos::View< + typename VectorViewType::non_const_value_type**, + typename VectorViewType::array_layout, + typename VectorViewType::execution_space::scratch_memory_space>; + + const int numMatrices = _X.extent(0); + const int numRows = _X.extent(1); + + size_t maximum_iteration = handle.get_max_iteration() < numRows + ? handle.get_max_iteration() + : numRows; + + int n_G = maximum_iteration + 1; + int n_W = numRows; + int n_mask = 1; + int n_tmp = 1; + + ScratchPadArnoldiViewType _ArnoldiView( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + maximum_iteration, numRows + maximum_iteration + 3); + + ScratchPadVectorViewType _TMPView( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + n_G + n_W + n_mask + n_tmp); + + return invoke(member, A, _B, _X, P, handle, _ArnoldiView, + _TMPView); + } + return 0; +} + template template diff --git a/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp index 7e812621e7..45f7aa5819 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp @@ -53,7 +53,7 @@ namespace KokkosBatched { /// ==================== struct TeamVectorSpmvInternal { template + typename OrdinalType, typename layout, int dobeta, unsigned N_team> KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, const OrdinalType numMatrices, const OrdinalType numRows, const ScalarType* KOKKOS_RESTRICT alpha, @@ -68,7 +68,7 @@ struct TeamVectorSpmvInternal { const OrdinalType ys1); template + typename OrdinalType, typename layout, int dobeta, unsigned N_team> KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, const OrdinalType numMatrices, const OrdinalType numRows, const ScalarType alpha, @@ -83,7 +83,7 @@ struct TeamVectorSpmvInternal { }; template + typename OrdinalType, typename layout, int dobeta, unsigned N_team> KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( const MemberType& member, const OrdinalType numMatrices, const OrdinalType numRows, const ScalarType* KOKKOS_RESTRICT alpha, @@ -96,43 +96,117 @@ KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( const ScalarType* KOKKOS_RESTRICT beta, const OrdinalType betas0, /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, const OrdinalType ys1) { - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), - [&](const OrdinalType& iTemp) { - OrdinalType iRow, iMatrix; - getIndices(iTemp, numRows, numMatrices, iRow, - iMatrix); +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) + if (member.team_size() == 1) { + if (N_team > 1 && valuess0 == 1) { + /* + Left layout as valuess0 = 1 and non-zero vector length given at + compilation time. Here we use the SIMD data type which is using Intel + Intrinsics under the hood on Intel architectures. + */ + typedef Vector, N_team> VectorType; + VectorType alpha_v, beta_v, values_v, y_v, x_v; + + alpha_v.loadAligned(alpha); + beta_v.loadAligned(beta); + + for (OrdinalType iRow = 0; iRow < numRows; ++iRow) { const OrdinalType rowLength = row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; - ValueType sum = 0; + + VectorType sum_v(0); + #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { - sum += values[iMatrix * valuess0 + - (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * - X[iMatrix * xs0 + - colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * - colIndicess0] * - xs1]; + values_v.loadAligned( + &values[(row_ptr[iRow * row_ptrs0] + iEntry) * valuess1]); + x_v.loadAligned(&X[colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * + colIndicess0] * + xs1]); + sum_v += values_v * x_v; } + sum_v *= alpha_v; + if (dobeta != 0) { + y_v.loadAligned(&Y[iRow * ys1]); + sum_v += y_v * beta_v; + } + sum_v.storeAligned(&Y[iRow * ys1]); + } + } else { + for (unsigned iMatrix = 0; iMatrix < unsigned(numMatrices); ++iMatrix) { + for (OrdinalType iRow = 0; iRow < numRows; ++iRow) { + const OrdinalType rowLength = + row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; + + ValueType sum = 0; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(member, rowLength), + [&](const OrdinalType& iEntry, ValueType& lsum) { + lsum += + values[iMatrix * valuess0 + + (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * + X[iMatrix * xs0 + + colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * + colIndicess0] * + xs1]; + }, + sum); - sum *= alpha[iMatrix * alphas0]; + sum *= alpha[iMatrix * alphas0]; - if (dobeta == 0) { - Y[iMatrix * ys0 + iRow * ys1] = sum; - } else { - Y[iMatrix * ys0 + iRow * ys1] = - beta[iMatrix * betas0] * Y[iMatrix * ys0 + iRow * ys1] + sum; + if (dobeta == 0) { + Y[iMatrix * ys0 + iRow * ys1] = sum; + } else { + Y[iMatrix * ys0 + iRow * ys1] = + beta[iMatrix * betas0] * Y[iMatrix * ys0 + iRow * ys1] + sum; + } } - }); + } + } + } else { +#endif + Kokkos::parallel_for( + Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), + [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices(iTemp, numRows, numMatrices, iRow, + iMatrix); + const OrdinalType rowLength = + row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; + ValueType sum = 0; +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { + sum += values[iMatrix * valuess0 + + (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * + X[iMatrix * xs0 + + colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * + colIndicess0] * + xs1]; + } + + sum *= alpha[iMatrix * alphas0]; + + if (dobeta == 0) { + Y[iMatrix * ys0 + iRow * ys1] = sum; + } else { + Y[iMatrix * ys0 + iRow * ys1] = + beta[iMatrix * betas0] * Y[iMatrix * ys0 + iRow * ys1] + sum; + } + }); +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) + } +#endif return 0; } template + typename OrdinalType, typename layout, int dobeta, unsigned N_team> KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( const MemberType& member, const OrdinalType numMatrices, const OrdinalType numRows, const ScalarType alpha, @@ -143,43 +217,114 @@ KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( const OrdinalType xs0, const OrdinalType xs1, const ScalarType beta, /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, const OrdinalType ys1) { - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), - [&](const OrdinalType& iTemp) { - OrdinalType iRow, iMatrix; - getIndices(iTemp, numRows, numMatrices, iRow, - iMatrix); +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) + if (member.team_size() == 1) { + if (N_team > 1 && valuess0 == 1) { + /* + Left layout as valuess0 = 1 and non-zero vector length given at + compilation time Here we use the SIMD data type which is using Intel + Intrinsics under the hood on Intel architectures. + */ + typedef Vector, N_team> VectorType; + VectorType alpha_v(alpha), beta_v(beta), values_v, y_v, x_v; + + for (OrdinalType iRow = 0; iRow < numRows; ++iRow) { const OrdinalType rowLength = row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; - ValueType sum = 0; + + VectorType sum_v(0); + #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { - sum += values[iMatrix * valuess0 + - (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * - X[iMatrix * xs0 + - colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * - colIndicess0] * - xs1]; + values_v.loadAligned( + &values[(row_ptr[iRow * row_ptrs0] + iEntry) * valuess1]); + x_v.loadAligned(&X[colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * + colIndicess0] * + xs1]); + sum_v += values_v * x_v; + } + sum_v *= alpha_v; + if (dobeta != 0) { + y_v.loadAligned(&Y[iRow * ys1]); + sum_v += y_v * beta_v; } + sum_v.storeAligned(&Y[iRow * ys1]); + } + } else { + for (unsigned iMatrix = 0; iMatrix < unsigned(numMatrices); ++iMatrix) { + for (OrdinalType iRow = 0; iRow < numRows; ++iRow) { + const OrdinalType rowLength = + row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; - sum *= alpha; + ValueType sum = 0; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(member, rowLength), + [&](const OrdinalType& iEntry, ValueType& lsum) { + lsum += + values[iMatrix * valuess0 + + (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * + X[iMatrix * xs0 + + colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * + colIndicess0] * + xs1]; + }, + sum); - if (dobeta == 0) { - Y[iMatrix * ys0 + iRow * ys1] = sum; - } else { - Y[iMatrix * ys0 + iRow * ys1] = - beta * Y[iMatrix * ys0 + iRow * ys1] + sum; + sum *= alpha; + + if (dobeta == 0) { + Y[iMatrix * ys0 + iRow * ys1] = sum; + } else { + Y[iMatrix * ys0 + iRow * ys1] = + beta * Y[iMatrix * ys0 + iRow * ys1] + sum; + } } - }); + } + } + } else { +#endif + Kokkos::parallel_for( + Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), + [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices(iTemp, numRows, numMatrices, iRow, + iMatrix); + const OrdinalType rowLength = + row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; + ValueType sum = 0; +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { + sum += values[iMatrix * valuess0 + + (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * + X[iMatrix * xs0 + + colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * + colIndicess0] * + xs1]; + } + + sum *= alpha; + + if (dobeta == 0) { + Y[iMatrix * ys0 + iRow * ys1] = sum; + } else { + Y[iMatrix * ys0 + iRow * ys1] = + beta * Y[iMatrix * ys0 + iRow * ys1] + sum; + } + }); +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) + } +#endif return 0; } -template -struct TeamVectorSpmv { +template +struct TeamVectorSpmv { template @@ -272,7 +417,7 @@ struct TeamVectorSpmv { MemberType, typename alphaViewType::non_const_value_type, typename ValuesViewType::non_const_value_type, typename IntView::non_const_value_type, - typename ValuesViewType::array_layout, dobeta>( + typename ValuesViewType::array_layout, dobeta, N_team>( member, X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), values.data(), values.stride_0(), values.stride_1(), row_ptr.data(), row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), X.data(), @@ -351,7 +496,7 @@ struct TeamVectorSpmv { typename ValuesViewType::non_const_value_type>::mag_type, typename ValuesViewType::non_const_value_type, typename IntView::non_const_value_type, - typename ValuesViewType::array_layout, dobeta>( + typename ValuesViewType::array_layout, dobeta, N_team>( member, X.extent(0), X.extent(1), alpha, values.data(), values.stride_0(), values.stride_1(), row_ptr.data(), row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), X.data(), diff --git a/batched/sparse/src/KokkosBatched_CrsMatrix.hpp b/batched/sparse/src/KokkosBatched_CrsMatrix.hpp index d7fd94744f..a3400db839 100644 --- a/batched/sparse/src/KokkosBatched_CrsMatrix.hpp +++ b/batched/sparse/src/KokkosBatched_CrsMatrix.hpp @@ -111,14 +111,25 @@ class CrsMatrix { MagnitudeType alpha = Kokkos::Details::ArithTraits::one(), MagnitudeType beta = Kokkos::Details::ArithTraits::zero()) const { - if (beta == Kokkos::Details::ArithTraits::zero()) - KokkosBatched::TeamVectorSpmv::template invoke< - ValuesViewType, IntViewType, XViewType, YViewType, 0>( - member, alpha, values, row_ptr, colIndices, X, beta, Y); - else - KokkosBatched::TeamVectorSpmv::template invoke< - ValuesViewType, IntViewType, XViewType, YViewType, 1>( - member, alpha, values, row_ptr, colIndices, X, beta, Y); + if (beta == Kokkos::Details::ArithTraits::zero()) { + if (member.team_size() == 1 && n_operators == 8) + KokkosBatched::TeamVectorSpmv::template invoke< + ValuesViewType, IntViewType, XViewType, YViewType, 0>( + member, alpha, values, row_ptr, colIndices, X, beta, Y); + else + KokkosBatched::TeamVectorSpmv::template invoke< + ValuesViewType, IntViewType, XViewType, YViewType, 0>( + member, alpha, values, row_ptr, colIndices, X, beta, Y); + } else { + if (member.team_size() == 1 && n_operators == 8) + KokkosBatched::TeamVectorSpmv::template invoke< + ValuesViewType, IntViewType, XViewType, YViewType, 1>( + member, alpha, values, row_ptr, colIndices, X, beta, Y); + else + KokkosBatched::TeamVectorSpmv::template invoke< + ValuesViewType, IntViewType, XViewType, YViewType, 1>( + member, alpha, values, row_ptr, colIndices, X, beta, Y); + } } template diff --git a/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp b/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp index 3467a6f910..4fe3edaf3e 100644 --- a/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp +++ b/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp @@ -103,6 +103,7 @@ class KrylovHandle { int n_teams; int ortho_strategy; int scratch_pad_level; + int memory_strategy; bool compute_last_residual; bool monitor_residual; bool host_synchronised; @@ -123,7 +124,7 @@ class KrylovHandle { iteration_numbers = IntViewType("", batched_size); Kokkos::deep_copy(iteration_numbers, -1); - n_teams = ceil(1. * batched_size / N_team); + n_teams = ceil(static_cast(batched_size) / N_team); first_index = IntViewType("", n_teams); last_index = IntViewType("", n_teams); @@ -146,6 +147,7 @@ class KrylovHandle { scratch_pad_level = 0; compute_last_residual = true; host_synchronised = false; + memory_strategy = 0; } /// \brief get_number_of_systems_per_team @@ -411,6 +413,14 @@ class KrylovHandle { return false; } + KOKKOS_INLINE_FUNCTION + void set_memory_strategy(int _memory_strategy) { + memory_strategy = _memory_strategy; + } + + KOKKOS_INLINE_FUNCTION + int get_memory_strategy() const { return memory_strategy; } + private: /// \brief set_norm /// Store the norm of one of the system at one of the iteration diff --git a/batched/sparse/src/KokkosBatched_Krylov_Solvers.hpp b/batched/sparse/src/KokkosBatched_Krylov_Solvers.hpp index 413c72678f..240bd56c74 100644 --- a/batched/sparse/src/KokkosBatched_Krylov_Solvers.hpp +++ b/batched/sparse/src/KokkosBatched_Krylov_Solvers.hpp @@ -66,6 +66,14 @@ struct SerialGMRES { template struct TeamGMRES { + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const PrecOperatorType& P, + const KrylovHandleType& handle, const ArnoldiViewType& _ArnoldiView, + const TMPViewType& _TMPView); template KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, @@ -85,6 +93,14 @@ struct TeamGMRES { template struct TeamVectorGMRES { + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const PrecOperatorType& P, + const KrylovHandleType& handle, const ArnoldiViewType& _ArnoldiView, + const TMPViewType& _TMPView); template KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, @@ -104,6 +120,13 @@ struct TeamVectorGMRES { template struct TeamCG { + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const KrylovHandleType& handle, + const TMPViewType& _TMPView, const TMPNormViewType& _TMPNormView); template KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, @@ -115,6 +138,13 @@ struct TeamCG { template struct TeamVectorCG { + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const KrylovHandleType& handle, + const TMPViewType& _TMPView, const TMPNormViewType& _TMPNormView); template KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, diff --git a/batched/sparse/src/KokkosBatched_Spmv.hpp b/batched/sparse/src/KokkosBatched_Spmv.hpp index 14ce074e41..06aa92062c 100644 --- a/batched/sparse/src/KokkosBatched_Spmv.hpp +++ b/batched/sparse/src/KokkosBatched_Spmv.hpp @@ -215,7 +215,8 @@ struct TeamSpmv { /// (or one with TeamVectorRange) are used inside. /// -template +template struct TeamVectorSpmv { template :: - template invoke( - member, alpha, d, _r, _c, x, beta, y); + if (last_matrix != N) + KokkosBatched::TeamVectorSpmv< + MemberType, typename ParamTagType::trans, + 2>::template invoke( + member, alpha, d, _r, _c, x, beta, y); + else + KokkosBatched::TeamVectorSpmv:: + template invoke( + member, alpha, d, _r, _c, x, beta, y); } inline void run() { @@ -85,7 +92,8 @@ struct Functor_TestBatchedTeamVectorSpmv { std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); Kokkos::TeamPolicy policy( - _D.extent(0) / _N_team, Kokkos::AUTO(), Kokkos::AUTO()); + ceil(static_cast(_D.extent(0)) / _N_team), Kokkos::AUTO(), + Kokkos::AUTO()); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } @@ -187,12 +195,12 @@ int test_batched_teamvector_spmv() { for (int i = 3; i < 10; ++i) { Test::TeamVectorSpmv::impl_test_batched_spmv< DeviceType, ParamTagType, ViewType, IntView, ViewType, ViewType, - alphaViewType, alphaViewType, 0>(1024, i, 2); + alphaViewType, alphaViewType, 0>(1025, i, 2); } for (int i = 3; i < 10; ++i) { Test::TeamVectorSpmv::impl_test_batched_spmv< DeviceType, ParamTagType, ViewType, IntView, ViewType, ViewType, - alphaViewType, alphaViewType, 1>(1024, i, 2); + alphaViewType, alphaViewType, 1>(1025, i, 2); } } #endif @@ -207,12 +215,12 @@ int test_batched_teamvector_spmv() { for (int i = 3; i < 10; ++i) { Test::TeamVectorSpmv::impl_test_batched_spmv< DeviceType, ParamTagType, ViewType, IntView, ViewType, ViewType, - alphaViewType, alphaViewType, 0>(1024, i, 2); + alphaViewType, alphaViewType, 0>(1025, i, 2); } for (int i = 3; i < 10; ++i) { Test::TeamVectorSpmv::impl_test_batched_spmv< DeviceType, ParamTagType, ViewType, IntView, ViewType, ViewType, - alphaViewType, alphaViewType, 1>(1024, i, 2); + alphaViewType, alphaViewType, 1>(1025, i, 2); } } #endif