From 0e4d10dc717798fa4a16dd32d2e7c8142c84342f Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 17 Jan 2022 15:02:14 -0500 Subject: [PATCH 1/5] KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA is always defined --- perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp | 2 -- perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp | 2 -- 2 files changed, 4 deletions(-) diff --git a/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp b/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp index a8b3de209b..7e4dd8fa2d 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp @@ -3,13 +3,11 @@ #include "Kokkos_Timer.hpp" #include "Kokkos_Random.hpp" -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) #if !defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION) #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) #define KOKKOSBATCHED_TEST_BLOCKTRIDIAGDIRECT #endif #endif -#endif #if defined(KOKKOSBATCHED_TEST_BLOCKTRIDIAGDIRECT) diff --git a/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp b/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp index fb9cd6297d..abc96148b1 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp @@ -3,11 +3,9 @@ #include "Kokkos_Timer.hpp" #include "Kokkos_Random.hpp" -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) #if !defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION) #define KOKKOSBATCHED_TEST_BLOCKTRIDIAGJACOBI #endif -#endif #if defined(KOKKOSBATCHED_TEST_BLOCKTRIDIAGJACOBI) From 368d5f2c370d716e4177c060e2fbe46e0941634b Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 18 Jan 2022 11:27:36 -0500 Subject: [PATCH 2/5] Enable perf test for non-CUDA builds --- perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp | 4 +--- perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git 
a/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp b/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp index 7e4dd8fa2d..e888609f14 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp @@ -3,11 +3,9 @@ #include "Kokkos_Timer.hpp" #include "Kokkos_Random.hpp" -#if !defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION) -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) +#if !(defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_CUDA_LAMBDA)) #define KOKKOSBATCHED_TEST_BLOCKTRIDIAGDIRECT #endif -#endif #if defined(KOKKOSBATCHED_TEST_BLOCKTRIDIAGDIRECT) diff --git a/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp b/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp index abc96148b1..cf857c6779 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp @@ -3,7 +3,7 @@ #include "Kokkos_Timer.hpp" #include "Kokkos_Random.hpp" -#if !defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION) +#if !(defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_CUDA_LAMBDA)) #define KOKKOSBATCHED_TEST_BLOCKTRIDIAGJACOBI #endif From 2bd1b217c5ce3188415baffa7c5055ef6bed53c9 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 18 Jan 2022 11:32:17 -0500 Subject: [PATCH 3/5] Template perf traits on execution space to avoid using Kokkos::Impl::ActiveExecutionMemorySpace --- .../KokkosBatched_Test_BlockTridiagDirect.cpp | 69 +++++++++++++---- .../KokkosBatched_Test_BlockTridiagJacobi.cpp | 74 +++++++++++++++---- 2 files changed, 116 insertions(+), 27 deletions(-) diff --git a/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp b/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp index e888609f14..d6abdb4d62 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp @@ -71,38 +71,82 @@ typedef Vector, 
internal_vector_length> internal_vector_type; typedef value_type internal_vector_type; #endif -template +template struct FactorizeModeAndAlgo; -template <> -struct FactorizeModeAndAlgo { +struct FactorizeModeAndAlgoHostImpl { typedef Mode::Serial mode_type; typedef Algo::Level3::Blocked algo_type; }; -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_SERIAL) +template <> +struct FactorizeModeAndAlgo : FactorizeModeAndAlgoHostImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_THREADS) +template <> +struct FactorizeModeAndAlgo : FactorizeModeAndAlgoHostImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_OPENMP) template <> -struct FactorizeModeAndAlgo { +struct FactorizeModeAndAlgo : FactorizeModeAndAlgoHostImpl {}; +#endif + +struct FactorizeModeAndAlgoDeviceImpl { typedef Mode::Team mode_type; typedef Algo::Level3::Unblocked algo_type; }; + +#if defined(KOKKOS_ENABLE_CUDA) +template <> +struct FactorizeModeAndAlgo : FactorizeModeAndAlgoDeviceImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_HIP) +template <> +struct FactorizeModeAndAlgo + : FactorizeModeAndAlgoDeviceImpl {}; #endif -template +template struct SolveModeAndAlgo; -template <> -struct SolveModeAndAlgo { +struct SolveModeAndAlgoHostImpl { typedef Mode::Serial mode_type; typedef Algo::Level2::Blocked algo_type; }; -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_SERIAL) +template <> +struct SolveModeAndAlgo : SolveModeAndAlgoHostImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_THREADS) +template <> +struct SolveModeAndAlgo : SolveModeAndAlgoHostImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_OPENMP) template <> -struct SolveModeAndAlgo { +struct SolveModeAndAlgo : SolveModeAndAlgoHostImpl {}; +#endif + +struct SolveModeAndAlgoDeviceImpl { typedef Mode::Team mode_type; typedef Algo::Level2::Unblocked algo_type; }; + +#if defined(KOKKOS_ENABLE_CUDA) +template <> +struct SolveModeAndAlgo : SolveModeAndAlgoDeviceImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_HIP) +template <> +struct SolveModeAndAlgo + : 
SolveModeAndAlgoDeviceImpl {}; #endif int main(int argc, char *argv[]) { @@ -272,8 +316,7 @@ int main(int argc, char *argv[]) { Kokkos::parallel_for( "factorize", policy.set_scratch_size(0, Kokkos::PerTeam(S)), KOKKOS_LAMBDA(const member_type &member) { - typedef FactorizeModeAndAlgo< - Kokkos::Impl::ActiveExecutionMemorySpace> + typedef FactorizeModeAndAlgo default_mode_and_algo_type; typedef default_mode_and_algo_type::mode_type mode_type; typedef default_mode_and_algo_type::algo_type algo_type; @@ -355,7 +398,7 @@ int main(int argc, char *argv[]) { Kokkos::parallel_for( "solve", policy.set_scratch_size(0, Kokkos::PerTeam(S)), KOKKOS_LAMBDA(const member_type &member) { - typedef SolveModeAndAlgo + typedef SolveModeAndAlgo default_mode_and_algo_type; typedef default_mode_and_algo_type::mode_type mode_type; typedef default_mode_and_algo_type::algo_type algo_type; diff --git a/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp b/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp index cf857c6779..8513cad752 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp @@ -73,38 +73,86 @@ typedef Vector, internal_vector_length> internal_vector_type; typedef value_type internal_vector_type; #endif -template +template struct InverseDiagonalsModeAndAlgo; -template <> -struct InverseDiagonalsModeAndAlgo { +struct InverseDiagonalsModeAndAlgoHostImpl { typedef Mode::Serial mode_type; typedef Algo::Level3::Blocked algo_type; }; -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_SERIAL) +template <> +struct InverseDiagonalsModeAndAlgo + : InverseDiagonalsModeAndAlgoHostImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_THREADS) +template <> +struct InverseDiagonalsModeAndAlgo + : InverseDiagonalsModeAndAlgoHostImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_OPENMP) template <> -struct InverseDiagonalsModeAndAlgo { +struct InverseDiagonalsModeAndAlgo + : 
InverseDiagonalsModeAndAlgoHostImpl {}; +#endif + +struct InverseDiagonalsModeAndAlgoDeviceImpl { typedef Mode::Team mode_type; typedef Algo::Level3::Unblocked algo_type; }; + +#if defined(KOKKOS_ENABLE_CUDA) +template <> +struct InverseDiagonalsModeAndAlgo + : InverseDiagonalsModeAndAlgoDeviceImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_HIP) +template <> +struct InverseDiagonalsModeAndAlgo + : InverseDiagonalsModeAndAlgoDeviceImpl {}; #endif -template +template struct SolveModeAndAlgo; -template <> -struct SolveModeAndAlgo { +struct SolveModeAndAlgoHostImpl { typedef Mode::Serial mode_type; typedef Algo::Level2::Blocked algo_type; }; -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_SERIAL) +template <> +struct SolveModeAndAlgo : SolveModeAndAlgoHostImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_THREADS) +template <> +struct SolveModeAndAlgo : SolveModeAndAlgoHostImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_OPENMP) template <> -struct SolveModeAndAlgo { +struct SolveModeAndAlgo : SolveModeAndAlgoHostImpl {}; +#endif + +struct SolveModeAndAlgoDeviceImpl { typedef Mode::Team mode_type; typedef Algo::Level2::Unblocked algo_type; }; + +#if defined(KOKKOS_ENABLE_CUDA) +template <> +struct SolveModeAndAlgo : SolveModeAndAlgoDeviceImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_HIP) +template <> +struct SolveModeAndAlgo + : SolveModeAndAlgoDeviceImpl {}; #endif int main(int argc, char *argv[]) { @@ -280,8 +328,7 @@ int main(int argc, char *argv[]) { policy.set_scratch_size( 0, Kokkos::PerTeam(S < per_team_scratch ? per_team_scratch : S)), KOKKOS_LAMBDA(const member_type &member) { - typedef InverseDiagonalsModeAndAlgo< - Kokkos::Impl::ActiveExecutionMemorySpace> + typedef InverseDiagonalsModeAndAlgo default_mode_and_algo_type; typedef default_mode_and_algo_type::mode_type mode_type; typedef default_mode_and_algo_type::algo_type algo_type; @@ -363,8 +410,7 @@ int main(int argc, char *argv[]) { 0, Kokkos::PerTeam(S < per_team_scratch ? 
per_team_scratch : S)), KOKKOS_LAMBDA(const member_type &member) { - typedef SolveModeAndAlgo< - Kokkos::Impl::ActiveExecutionMemorySpace> + typedef SolveModeAndAlgo default_mode_and_algo_type; typedef default_mode_and_algo_type::mode_type mode_type; typedef default_mode_and_algo_type::algo_type algo_type; From 9d48485e646ebfc048fd243f17c2769aae53e7aa Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 16 Feb 2022 12:56:17 -0700 Subject: [PATCH 4/5] perf_test/batched: Remove lambda from BlockJacobi --- ...okkosBatched_Test_BlockJacobi_Tutorial.cpp | 237 ++++++++++++------ 1 file changed, 157 insertions(+), 80 deletions(-) diff --git a/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp b/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp index f3237d9b4f..94f58fba83 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp @@ -3,16 +3,6 @@ #include "Kokkos_Timer.hpp" #include "Kokkos_Random.hpp" -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) -#if !defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION) -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) -#define KOKKOSBATCHED_TEST_BLOCKJACOBI -#endif -#endif -#endif - -#if defined(KOKKOSBATCHED_TEST_BLOCKJACOBI) - /// KokkosKernels headers #include "KokkosBatched_Util.hpp" @@ -79,6 +69,152 @@ val_type computeResidual(const ManyMatrixType &A, const ManyVectorType &x, return residual; } +namespace ConstructBlockJacobi { +template +struct Task1Factorize { + private: + VT __A; + + public: + Task1Factorize(VT A) : __A(A) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) const { + const int i = member.league_rank(); + auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); + TeamLU::invoke(member, AA); + } +}; + +template +struct Task1SetIdentity { + private: + VT __A; + + public: + Task1SetIdentity(VT A) : __A(A) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) 
const { + const int i = member.league_rank(); + auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); + TeamSetIdentity::invoke(member, AA); + } +}; + +template +struct Task1SolveLowerTriangular { + private: + VTA __A; + VTT __T; + + public: + Task1SolveLowerTriangular(VTA A, VTT T) : __A(A), __T(T) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) const { + const int i = member.league_rank(); + const val_type one(1); + auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); + auto TT = Kokkos::subview(__T, i, Kokkos::ALL(), Kokkos::ALL()); + TeamTrsm::invoke(member, one, TT, AA); + } +}; + +template +struct Task1SolveUpperTriangular { + private: + VTA __A; + VTT __T; + + public: + Task1SolveUpperTriangular(VTA A, VTT T) : __A(A), __T(T) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) const { + const int i = member.league_rank(); + const val_type one(1); + auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); + auto TT = Kokkos::subview(__T, i, Kokkos::ALL(), Kokkos::ALL()); + TeamTrsm::invoke(member, one, TT, + AA); + } +}; +} // namespace ConstructBlockJacobi + +template +struct Task1ApplyBlockJacobi { + private: + VTA __A; + VTX __x; + VTB __b; + + public: + Task1ApplyBlockJacobi(VTA A, VTX x, VTB b) : __A(A), __x(x), __b(b) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) const { + const int i = member.league_rank(); + const val_type one(1), zero(0); + auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); + auto xx = Kokkos::subview(__x, i, Kokkos::ALL()); + auto bb = Kokkos::subview(__b, i, Kokkos::ALL()); + TeamGemv::invoke( + member, one, AA, bb, zero, xx); + } +}; + +template +struct Task2FactorizeInvert { + private: + VTA __A; + VTT __T; + + public: + Task2FactorizeInvert(VTA A, VTT T) : __A(A), __T(T) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) const { + const val_type one(1); + const int i = 
member.league_rank(); + auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); + auto TT = Kokkos::subview(__T, i, Kokkos::ALL(), Kokkos::ALL()); + + TeamLU::invoke(member, AA); + TeamCopy::invoke(member, AA, TT); + TeamSetIdentity::invoke(member, AA); + TeamTrsm::invoke(member, one, TT, AA); + TeamTrsm::invoke(member, one, TT, + AA); + } +}; + +template +struct Task2ApplyBlockJacobi { + private: + VTA __A; + VTX __x; + VTB __b; + + public: + Task2ApplyBlockJacobi(VTA A, VTX x, VTB b) : __A(A), __x(x), __b(b) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) const { + const int i = member.league_rank(); + const val_type one(1), zero(0); + auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); + auto xx = Kokkos::subview(__x, i, Kokkos::ALL()); + auto bb = Kokkos::subview(__b, i, Kokkos::ALL()); + TeamGemv::invoke( + member, one, AA, bb, zero, xx); + } +}; + int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); { @@ -159,44 +295,21 @@ int main(int argc, char *argv[]) { timer.reset(); Kokkos::parallel_for( "task1.factorize", policy, - KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - TeamLU::invoke(member, AA); - }); + ConstructBlockJacobi::Task1Factorize(A)); Kokkos::deep_copy(T, A); Kokkos::parallel_for( "task1.set-identity", policy, - KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - TeamSetIdentity::invoke(member, AA); - }); + ConstructBlockJacobi::Task1SetIdentity(A)); Kokkos::fence(); Kokkos::parallel_for( "task1.solve-lower-triangular", policy, - KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - const val_type one(1); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); - TeamTrsm::invoke(member, one, - 
TT, AA); - }); + ConstructBlockJacobi::Task1SolveLowerTriangular(A, T)); Kokkos::fence(); Kokkos::parallel_for( "task1.solve-upper-triangular", policy, - KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - const val_type one(1); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); - TeamTrsm::invoke(member, - one, TT, - AA); - }); + ConstructBlockJacobi::Task1SolveUpperTriangular(A, T)); Kokkos::fence(); const double t = timer.seconds(); printf( @@ -211,16 +324,8 @@ int main(int argc, char *argv[]) { policy_type policy(A.extent(0), Kokkos::AUTO()); Kokkos::parallel_for( "task1.apply-block-jacobi", policy, - KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - const val_type one(1), zero(0); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto xx = Kokkos::subview(x, i, Kokkos::ALL()); - auto bb = Kokkos::subview(b, i, Kokkos::ALL()); - TeamGemv::invoke(member, one, AA, bb, - zero, xx); - }); + Task1ApplyBlockJacobi(A, x, + b)); const double t = timer.seconds(); printf( "task 1: application of jacobi time = %f , # of applications per " @@ -256,23 +361,7 @@ int main(int argc, char *argv[]) { timer.reset(); Kokkos::parallel_for( "task2.factorize-invert", policy, - KOKKOS_LAMBDA(const member_type &member) { - const val_type one(1); - const int i = member.league_rank(); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); - - TeamLU::invoke(member, AA); - TeamCopy::invoke(member, AA, TT); - TeamSetIdentity::invoke(member, AA); - TeamTrsm::invoke(member, one, - TT, AA); - TeamTrsm::invoke(member, - one, TT, - AA); - }); + Task2FactorizeInvert(A, T)); Kokkos::fence(); const double t = timer.seconds(); printf( @@ -287,16 +376,8 @@ int main(int argc, char *argv[]) { policy_type policy(A.extent(0), Kokkos::AUTO()); Kokkos::parallel_for( 
"task2.apply-block-jacobi", policy, - KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - const val_type one(1), zero(0); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto xx = Kokkos::subview(x, i, Kokkos::ALL()); - auto bb = Kokkos::subview(b, i, Kokkos::ALL()); - TeamGemv::invoke(member, one, AA, bb, - zero, xx); - }); + Task2ApplyBlockJacobi(A, x, + b)); const double t = timer.seconds(); printf( "task 2: application of jacobi time = %f , # of applications per " @@ -318,7 +399,3 @@ int main(int argc, char *argv[]) { return 0; } - -#else -int main() { return 0; } -#endif From 9ab0ecf790c1c6242263e8e5cb670e337bd4e576 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 16 Feb 2022 13:12:27 -0700 Subject: [PATCH 5/5] perf_test/batched: Remove lambda from BlockTridiagDirect --- .../KokkosBatched_Test_BlockTridiagDirect.cpp | 212 +++++++++--------- 1 file changed, 107 insertions(+), 105 deletions(-) diff --git a/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp b/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp index d6abdb4d62..ffa6efec5e 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp @@ -3,12 +3,6 @@ #include "Kokkos_Timer.hpp" #include "Kokkos_Random.hpp" -#if !(defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_CUDA_LAMBDA)) -#define KOKKOSBATCHED_TEST_BLOCKTRIDIAGDIRECT -#endif - -#if defined(KOKKOSBATCHED_TEST_BLOCKTRIDIAGDIRECT) - /// KokkosKernels headers #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Vector.hpp" @@ -43,11 +37,13 @@ #define KOKKOSBATCHED_USE_128BIT_MEMORY_INST -typedef Kokkos::DefaultExecutionSpace exec_space; -typedef typename exec_space::memory_space memory_space; -typedef Kokkos::DefaultHostExecutionSpace host_space; +using exec_space_type = Kokkos::DefaultExecutionSpace; +using memory_space_type = exec_space_type::memory_space; +using host_space_type = 
Kokkos::DefaultHostExecutionSpace; -typedef double value_type; +using value_type = double; +using policy_type = Kokkos::TeamPolicy; +using member_type = typename policy_type::member_type; /// 128*128*128/16*5 * (2*8) / 16 /// @@ -56,10 +52,10 @@ typedef double value_type; using namespace KokkosBatched; static constexpr int vector_length = - DefaultVectorLength::value; + DefaultVectorLength::value; #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) static constexpr int internal_vector_length = - DefaultInternalVectorLength::value; + DefaultInternalVectorLength::value; #else static constexpr int internal_vector_length = 1; #endif @@ -149,6 +145,83 @@ struct SolveModeAndAlgo : SolveModeAndAlgoDeviceImpl {}; #endif +template +struct SetTridiagToIdentity { + private: + VT __AA; + + public: + SetTridiagToIdentity(VT AA) : __AA(AA) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) const { + const int i = member.league_rank(); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, __AA.extent(1)), [&](const int &j) { + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, __AA.extent(5)), + [&](const int &v) { + for (int k = 0, kend = __AA.extent(3); k < kend; ++k) + __AA(i, j, 1, k, k, v) = 1; + }); + }); + } +}; + +template +struct Factorize { + private: + VT __AA; + LT __L; + + public: + Factorize(VT AA, LT L) : __AA(AA), __L(L) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) const { + typedef FactorizeModeAndAlgo + default_mode_and_algo_type; + typedef default_mode_and_algo_type::mode_type mode_type; + typedef default_mode_and_algo_type::algo_type algo_type; + + const int i = member.league_rank(); + + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, __AA.extent(5)), [&](const int &v) { + auto AAA = Kokkos::subview(__AA, i, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), v); + + /// subview patterns + auto A = Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); + auto B = 
Kokkos::subview(AAA, 0, 2, Kokkos::ALL(), Kokkos::ALL()); + auto C = Kokkos::subview(AAA, 0, 0, Kokkos::ALL(), Kokkos::ALL()); + auto D = Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); + + if (__L == 1) { + A.assign_data(&AAA(0, 1, 0, 0)); + LU::invoke(member, A); + } else { + for (int k = 0; k < (__L - 1); ++k) { + A.assign_data(&AAA(k, 1, 0, 0)); + B.assign_data(&AAA(k, 2, 0, 0)); + C.assign_data(&AAA(k, 0, 0, 0)); + D.assign_data(&AAA(k + 1, 1, 0, 0)); + + LU::invoke(member, A); + Trsm::invoke(member, 1.0, A, B); + Trsm::invoke(member, 1.0, A, + C); + Gemm::invoke(member, -1.0, C, B, 1.0, D); + } + LU::invoke(member, D); + } + }); + } +}; + int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); { @@ -189,53 +262,56 @@ int main(int argc, char *argv[]) { /// /// double 16 - Kokkos::View Av( + Kokkos::View Av( "A", N / vector_length, L, 3, Blk, Blk); /// double - Kokkos::View As( + Kokkos::View As( (value_type *)Av.data(), Av.extent(0), Av.extent(1), Av.extent(2), Av.extent(3), Av.extent(4), vector_length); /// double 2 - Kokkos::View + Kokkos::View Ai((internal_vector_type *)Av.data(), Av.extent(0), Av.extent(1), Av.extent(2), Av.extent(3), Av.extent(4), vector_length / internal_vector_length); /// double 16 - Kokkos::View xv( + Kokkos::View xv( "x", N / vector_length, Nvec, L, Blk); /// double - Kokkos::View xs( + Kokkos::View xs( (value_type *)xv.data(), xv.extent(0), xv.extent(1), xv.extent(2), xv.extent(3), vector_length); /// double 2 - Kokkos::View + Kokkos::View xi((internal_vector_type *)xv.data(), xv.extent(0), xv.extent(1), xv.extent(2), xv.extent(3), vector_length / internal_vector_length); /// double 16 - Kokkos::View bv( + Kokkos::View bv( "b", N / vector_length, Nvec, L, Blk); /// double - Kokkos::View bs( + Kokkos::View bs( (value_type *)bv.data(), bv.extent(0), bv.extent(1), bv.extent(2), bv.extent(3), vector_length); /// double 2 - Kokkos::View + Kokkos::View bi((internal_vector_type *)bv.data(), bv.extent(0), bv.extent(1), 
bv.extent(2), bv.extent(3), vector_length / internal_vector_length); /// double copy of A - Kokkos::View Acopy( + Kokkos::View Acopy( "Acopy", As.extent(0), As.extent(1), As.extent(2), As.extent(3), As.extent(4), As.extent(5)); - Kokkos::View rs( + Kokkos::View rs( "rs", bs.extent(0), bs.extent(1), bs.extent(2), bs.extent(3), bs.extent(4)); @@ -257,24 +333,9 @@ int main(int argc, char *argv[]) { cudaProfilerStart(); #endif timer.reset(); - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; policy_type policy(AA.extent(0), Kokkos::AUTO(), AA.extent(5)); - Kokkos::parallel_for( - "setTridiagToIdentity", policy, - KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, AA.extent(1)), - [&](const int &j) { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, AA.extent(5)), - [&](const int &v) { - for (int k = 0, kend = AA.extent(3); k < kend; ++k) - AA(i, j, 1, k, k, v) = 1; - }); - }); - }); + Kokkos::parallel_for("setTridiagToIdentity", policy, + SetTridiagToIdentity(AA)); Kokkos::fence(); const double t = timer.seconds(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) @@ -286,7 +347,7 @@ int main(int argc, char *argv[]) { /// randomize input { const value_type one(1); - Kokkos::Random_XorShift64_Pool random(13245); + Kokkos::Random_XorShift64_Pool random(13245); Kokkos::fill_random(As, random, one); Kokkos::fill_random(bs, random, one); @@ -301,9 +362,7 @@ int main(int argc, char *argv[]) { cudaProfilerStart(); #endif timer.reset(); - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; - int team_size = 0; + int team_size = 0; if (Blk < 8) { team_size = 32 / AA.extent(5); } else if (Blk < 12) { @@ -313,58 +372,9 @@ int main(int argc, char *argv[]) { } policy_type policy(AA.extent(0), team_size, AA.extent(5)); - Kokkos::parallel_for( - "factorize", 
policy.set_scratch_size(0, Kokkos::PerTeam(S)), - KOKKOS_LAMBDA(const member_type &member) { - typedef FactorizeModeAndAlgo - default_mode_and_algo_type; - typedef default_mode_and_algo_type::mode_type mode_type; - typedef default_mode_and_algo_type::algo_type algo_type; - - const int i = member.league_rank(); - - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, AA.extent(5)), - [&](const int &v) { - auto AAA = - Kokkos::subview(AA, i, Kokkos::ALL(), Kokkos::ALL(), - Kokkos::ALL(), Kokkos::ALL(), v); - - /// subview patterns - auto A = - Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); - auto B = - Kokkos::subview(AAA, 0, 2, Kokkos::ALL(), Kokkos::ALL()); - auto C = - Kokkos::subview(AAA, 0, 0, Kokkos::ALL(), Kokkos::ALL()); - auto D = - Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); - - if (L == 1) { - A.assign_data(&AAA(0, 1, 0, 0)); - LU::invoke(member, A); - } else { - for (int k = 0; k < (L - 1); ++k) { - A.assign_data(&AAA(k, 1, 0, 0)); - B.assign_data(&AAA(k, 2, 0, 0)); - C.assign_data(&AAA(k, 0, 0, 0)); - D.assign_data(&AAA(k + 1, 1, 0, 0)); - - LU::invoke(member, A); - Trsm::invoke(member, 1.0, A, B); - Trsm::invoke(member, 1.0, A, C); - Gemm::invoke(member, -1.0, C, B, - 1.0, D); - } - LU::invoke(member, D); - } - }); - }); + Kokkos::parallel_for("factorize", + policy.set_scratch_size(0, Kokkos::PerTeam(S)), + Factorize(AA, L)); Kokkos::fence(); const double t = timer.seconds(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) @@ -382,9 +392,7 @@ int main(int argc, char *argv[]) { cudaProfilerStart(); #endif timer.reset(); - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; - int team_size = 0; + int team_size = 0; if (Blk < 8) { team_size = 32 / AA.extent(5); } else if (Blk < 12) { @@ -527,8 +535,6 @@ int main(int argc, char *argv[]) { /// if (1) { typedef KokkosBatched::Algo::Level2::Unblocked algo_type; - using policy_type = Kokkos::TeamPolicy; - using 
member_type = typename policy_type::member_type; policy_type policy(Acopy.extent(0), Kokkos::AUTO(), Acopy.extent(5)); Kokkos::parallel_for( "compute residual", policy, KOKKOS_LAMBDA(const member_type &member) { @@ -678,7 +684,3 @@ int main(int argc, char *argv[]) { return 0; } - -#else -int main() { return 0; } -#endif