From cd8a389fa408e4e7e33c8ab7ca48cbdf231f1f25 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Fri, 16 Apr 2021 16:14:23 -0600 Subject: [PATCH] Add fast two-level mode N GEMV Very close to cublas in performance, with LayoutLeft or LayoutRight. (~100 gflops for 1e5 * 1e5 double matrix on V100) --- perf_test/blas/CMakeLists.txt | 1 + perf_test/blas/blas2/CMakeLists.txt | 7 + .../blas/blas2/KokkosBlas2_gemv_perf_test.cpp | 204 ++++++++++++++++ src/blas/impl/KokkosBlas2_gemv_impl.hpp | 218 +++++++++++++++++- src/blas/impl/KokkosBlas2_gemv_spec.hpp | 14 +- unit_test/blas/Test_Blas2_gemv.hpp | 26 ++- 6 files changed, 443 insertions(+), 27 deletions(-) create mode 100644 perf_test/blas/blas2/CMakeLists.txt create mode 100644 perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp diff --git a/perf_test/blas/CMakeLists.txt b/perf_test/blas/CMakeLists.txt index 2d93de0458..4c347f1d83 100644 --- a/perf_test/blas/CMakeLists.txt +++ b/perf_test/blas/CMakeLists.txt @@ -1,2 +1,3 @@ ADD_SUBDIRECTORY(blas) +ADD_SUBDIRECTORY(blas2) ADD_SUBDIRECTORY(blas3) diff --git a/perf_test/blas/blas2/CMakeLists.txt b/perf_test/blas/blas2/CMakeLists.txt new file mode 100644 index 0000000000..f69c576cd3 --- /dev/null +++ b/perf_test/blas/blas2/CMakeLists.txt @@ -0,0 +1,7 @@ +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +KOKKOSKERNELS_ADD_EXECUTABLE( + KokkosBlas2_gemv_perf_test + SOURCES KokkosBlas2_gemv_perf_test.cpp +) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp new file mode 100644 index 0000000000..67949af165 --- /dev/null +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp @@ -0,0 +1,204 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include "KokkosBlas2_gemv.hpp" +#include + +struct Params +{ + int use_cuda = 0; + int use_openmp = 0; + int use_threads = 0; + int m = 5000; + int n = 5000; + int repeat = 1; + bool layoutLeft = true; +}; + +void print_options(){ + std::cerr << "Options\n" << std::endl; + + std::cerr << "\tBACKEND: '--threads[numThreads]' | '--openmp [numThreads]' | '--cuda [cudaDeviceIndex]'" << std::endl; + std::cerr << "\tIf none selected, serial is used." << std::endl; + std::cerr << "\t[Optional] --repeat :: how many times to repeat overall spadd (symbolic + repeated numeric)" << std::endl; + std::cerr << "\t[Optional] --layout :: matrix layout ('left' or 'right', default 'left')" << std::endl; + std::cerr << "\t[Optional] --m :: number of rows to generate" << std::endl; + std::cerr << "\t[Optional] --n :: number of cols to generate" << std::endl; +} + +int parse_inputs (Params& params, int argc, char **argv){ + for ( int i = 1 ; i < argc ; ++i ) { + if ( 0 == strcasecmp( argv[i] , "--help") || 0 == strcasecmp( argv[i] , "-h" )) { + print_options(); + exit(0); //note: this is before Kokkos::initialize + } + else if ( 0 == strcasecmp( argv[i] , "--threads" ) ) { + params.use_threads = atoi( argv[++i] ); + } + else if ( 0 == strcasecmp( argv[i] , "--openmp" ) ) { + params.use_openmp = atoi( argv[++i] ); + } + else if ( 0 == strcasecmp( argv[i] , "--cuda" ) ) { + params.use_cuda = atoi( argv[++i] ) + 1; + } + else if ( 0 == strcasecmp( argv[i] , "--layout" ) ) { + i++; + if(0 == strcasecmp( argv[i] , "left")) + params.layoutLeft = true; + else if(0 == strcasecmp( argv[i] , "right")) + params.layoutLeft = false; + else + { + std::cerr << "Invalid layout: must be 'left' or 'right'.\n"; + exit(1); + } + } + else if( 0 == strcasecmp( argv[i], "--m" )) + { + params.m = atoi(argv[++i]); + } + else if( 0 == strcasecmp( argv[i], "--n" )) + { + params.n = atoi(argv[++i]); + } + else if ( 0 == strcasecmp( argv[i] , "--repeat" ) ) { + //if provided, C will be written to given file. + //has to have ".bin", or ".crs" extension. + params.repeat = atoi( argv[++i] ); + } + else { + std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl ; + print_options(); + return 1; + } + } + return 0; +} + +template +void run(int m, int n, int repeat) +{ + using Scalar = double; + using MemSpace = typename ExecSpace::memory_space; + using Device = Kokkos::Device; + std::cout << "Running GEMV experiment (" << ExecSpace::name() << ")\n"; + Kokkos::View A(Kokkos::ViewAllocateWithoutInitializing("A"), m, n); + Kokkos::View x(Kokkos::ViewAllocateWithoutInitializing("x"), n); + Kokkos::View y(Kokkos::ViewAllocateWithoutInitializing("y"), m); + Kokkos::Random_XorShift64_Pool pool(123); + Kokkos::fill_random(A, pool, 10.0); + Kokkos::fill_random(x, pool, 10.0); + //Do a warm-up run + KokkosBlas::gemv("N", 1.0, A, x, 0.0, y); + //Now, start timing + Kokkos::fence(); + Kokkos::Timer timer; + for(int i = 0; i < repeat; i++) + { + KokkosBlas::gemv("N", 1.0, A, x, 0.0, y); + ExecSpace().fence(); + } + double total = timer.seconds(); + double avg = total / repeat; + size_t flopsPerRun = (size_t) m * n; + printf("Avg GEMV time: %f s.\n", avg); + printf("Avg GEMV FLOP/s: %.3e\n", flopsPerRun / avg); +} + +int main (int argc, char ** argv){ + Params params; + + if (parse_inputs (params, argc, argv) ){ + return 1; + } + const int num_threads = params.use_openmp; // Assumption is that use_openmp variable is provided as number of threads + const int device_id = params.use_cuda - 1; + + Kokkos::initialize( Kokkos::InitArguments( num_threads, -1, device_id ) ); + + bool useOMP = params.use_openmp != 0; + bool useCUDA = params.use_cuda != 0; + + bool useSerial = !useOMP && !useCUDA; + + if(useOMP) + { +#if defined( KOKKOS_ENABLE_OPENMP ) + if(params.layoutLeft) + run(params.m, params.n, params.repeat); + else + run(params.m, params.n, params.repeat); +#else + std::cout << "ERROR: OpenMP requested, but not available.\n"; + return 1; +#endif + } + if(useCUDA) + { +#if defined( KOKKOS_ENABLE_CUDA ) + if(params.layoutLeft) + run(params.m, params.n, params.repeat); + else + run(params.m, params.n, params.repeat); +#else + std::cout << "ERROR: CUDA requested, but not available.\n"; + return 1; +#endif + } + if(useSerial) + { +#if defined( KOKKOS_ENABLE_SERIAL ) + if(params.layoutLeft) + run(params.m, params.n, params.repeat); + else + run(params.m, params.n, params.repeat); +#else + std::cout << "ERROR: Serial device requested, but not available.\n"; + return 1; +#endif + } + Kokkos::finalize(); + return 0; +} + diff --git a/src/blas/impl/KokkosBlas2_gemv_impl.hpp b/src/blas/impl/KokkosBlas2_gemv_impl.hpp index db5bc9fbca..3c399278ca 100644 --- a/src/blas/impl/KokkosBlas2_gemv_impl.hpp +++ b/src/blas/impl/KokkosBlas2_gemv_impl.hpp @@ -46,6 +46,7 @@ #include "KokkosKernels_config.h" #include "Kokkos_Core.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" #include "Kokkos_ArithTraits.hpp" namespace KokkosBlas { @@ -480,6 +481,132 @@ singleLevelGemv (const char trans[], } } +struct TwoLevelGEMV_LayoutLeftTag {}; +struct TwoLevelGEMV_LayoutRightTag {}; + +// --------------------------------------------------------------------------------------------- +// Functor for a two-level parallel_reduce version of GEMV (non-transpose), +// designed for performance on GPU. Kernel depends on the layout of A. +template +struct TwoLevelGEMV { + using y_value_type = typename YViewType::non_const_value_type; + using AlphaCoeffType = typename AViewType::non_const_value_type; + using BetaCoeffType = typename YViewType::non_const_value_type; + + + using execution_space = typename AViewType::execution_space; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + + TwoLevelGEMV (const AlphaCoeffType& alpha, + const AViewType& A, + const XViewType& x, + const BetaCoeffType& beta, + const YViewType& y) : + alpha_ (alpha), A_ (A), x_ (x), beta_ (beta), y_ (y) + { + static_assert (Kokkos::Impl::is_view::value, + "AViewType must be a Kokkos::View."); + static_assert (Kokkos::Impl::is_view::value, + "XViewType must be a Kokkos::View."); + static_assert (Kokkos::Impl::is_view::value, + "YViewType must be a Kokkos::View."); + static_assert (static_cast (AViewType::rank) == 2, + "AViewType must have rank 2."); + static_assert (static_cast (XViewType::rank) == 1, + "XViewType must have rank 1."); + static_assert (static_cast (YViewType::rank) == 1, + "YViewType must have rank 1."); + static_assert (std::is_integral::value, + "IndexType must be an integer."); + } + +public: + //LayoutLeft version: 32xK blocks. + // -Each team handles block rows. + // -Groups of 32 threads handle N/teamsize columns sequentially, placing results into shared. + // -Then individual thread results are combined with parallel_reduce. + KOKKOS_INLINE_FUNCTION void + operator () (TwoLevelGEMV_LayoutLeftTag, const member_type& team) const + { + using Kokkos::Details::ArithTraits; + using Scalar = typename YViewType::non_const_value_type; + using KAT = ArithTraits; + //Allocate a Scalar in shared for each thread + Scalar* blockResult = (Scalar*) team.team_shmem().get_shmem(32 * sizeof(Scalar)); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 32), + [&](int i) + { + blockResult[i] = KAT::zero(); + }); + team.team_barrier(); + //Which block this thread will work on + int block = team.team_rank() / 32; + //Which row in the block this thread will work on + IndexType row = team.league_rank() * 32 + team.team_rank() % 32; + IndexType blockColStart = columnsPerThread * block; + Scalar localSum = KAT::zero(); + //compute local sum + for(IndexType col = blockColStart; col < blockColStart + columnsPerThread; col++) + { + if(col == (IndexType) A_.extent(1)) + break; + if(row < (IndexType) A_.extent(0)) + { + //A access is coalesced, x access is a broadcast + localSum += A_(row, col) * x_(col); + } + } + //atomically combine local result into shared + Kokkos::atomic_add(&blockResult[team.team_rank() % 32], localSum); + team.team_barrier(); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 32), + [&](int i) + { + IndexType yrow = team.league_rank() * 32 + i; + if(yrow < (IndexType) A_.extent(0)) + { + y_[yrow] = beta_ * y_[yrow] + alpha_ * blockResult[i]; + } + }); + } + + //LayoutRight version: one team per row + KOKKOS_INLINE_FUNCTION void + operator () (TwoLevelGEMV_LayoutRightTag, const member_type& team) const + { + using Kokkos::Details::ArithTraits; + using KAT = ArithTraits; + + const IndexType N = A_.extent(1); + const int i = team.league_rank(); // batch id + + // parallel-reduce to compute val += A(:,j)' * x + y_value_type val = KAT:: zero(); + Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, N ), [&] ( const int j, y_value_type &update ) { + update += A_(i, j) * x_(j); + }, val); + + // compute yj = beta*yj + alpha*val + Kokkos::single(Kokkos::PerTeam(team), + [=]() + { + y_[i] = beta_ * y_[i] + alpha_ * val; + }); + } + + IndexType columnsPerThread; +private: + AlphaCoeffType alpha_; + typename AViewType::const_type A_; + typename XViewType::const_type x_; + BetaCoeffType beta_; + YViewType y_; +}; + // --------------------------------------------------------------------------------------------- // Functor for a two-level parallel_reduce version of (conjugate) @@ -593,34 +720,66 @@ twoLevelGemv (const char trans[], using Kokkos::Details::ArithTraits; using KAT = ArithTraits; + using YKAT = ArithTraits; - const char tr = trans[0]; + const char tr = toupper(trans[0]); // The transpose and conjugate transpose cases where A has zero rows // need special handling. These are equivalent to y := beta*y. We // could implement this using KokkosBlas::scal, but we don't want to // depend on that or its implementation details. Instead, we reuse // an instantiation of the non-transpose case for alpha=0. - if (A.extent(0) == 0 && (tr != 'N' && tr != 'n')) { - if (beta == KAT::zero ()) { + if (y.extent(0) == 0) + { + //no entries to update + return; + } + else if (x.extent(0) == 0) + { + if (beta == YKAT::zero ()) { Kokkos::deep_copy (y, KAT::zero ()); } - else if (beta != Kokkos::Details::ArithTraits::one ()) { + else if (beta != YKAT::one ()) { // "Fake out" a scal() by using the non-transpose alpha=0, // general beta case. This assumes that the functor doesn't // check dimensions. using functor_type = SingleLevelNontransposeGEMV; functor_type functor (alpha, A, x, beta, y); - Kokkos::parallel_for ("KokkosBlas::gemv[SingleLevel]",range_policy_type (0, A.extent(1)), functor); + Kokkos::parallel_for ("KokkosBlas::gemv[SingleLevel]",range_policy_type (0, y.extent(0)), functor); } return; } - if (tr == 'N' || tr == 'n') { - // NOTE: not implemented, so just call single-level version - singleLevelGemv - (trans, alpha, A, x, beta, y); + if (tr == 'N') { + constexpr bool isLayoutLeft = std::is_same::value; + using layout_tag = typename std::conditional::type; + using tagged_policy = Kokkos::TeamPolicy; + using functor_type = TwoLevelGEMV; + functor_type functor (alpha, A, x, beta, y); + tagged_policy team; + if(isLayoutLeft) + { + size_t sharedPerTeam = 32 * sizeof(y_value_type); + IndexType numTeams = (A.extent(0) + 31) / 32; + tagged_policy temp(1, 1); + int teamSize = temp.team_size_max(functor, Kokkos::ParallelForTag()); + //make sure teamSize is a multiple of 32 + teamSize -= teamSize % 32; + //don't make teamSize larger than what's useful + if((size_t) teamSize > 32 * A.extent(1)) + teamSize = 32 * A.extent(1); + int numBlocks = teamSize / 32; + functor.columnsPerThread = (A.extent(1) + numBlocks - 1) / numBlocks; + team = tagged_policy(numTeams, teamSize).set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam)); + } + else + { + //LayoutRight: one team per row + team = tagged_policy(A.extent(0), Kokkos::AUTO); + } + Kokkos::parallel_for ("KokkosBlas::gemv[twoLevel]", team, functor); } else { if (alpha == KAT::zero () && beta == KAT::zero ()) { @@ -630,7 +789,7 @@ twoLevelGemv (const char trans[], else if (alpha == KAT::zero () && beta == KAT::one ()) { // Do nothing (y := 1 * y) } - else if (tr == 'T' || tr == 't') { + else if (tr == 'T') { // transpose, and not conj transpose team_policy_type team (A.extent(1), Kokkos::AUTO); using functor_type = TwoLevelTransposeGEMV()>::type* = nullptr> +void +generalGemvImpl (const char trans[], + typename AViewType::const_value_type& alpha, + const AViewType& A, + const XViewType& x, + typename YViewType::const_value_type& beta, + const YViewType& y) +{ + singleLevelGemv (trans, alpha, A, x, beta, y); +} + +template()>::type* = nullptr> +void +generalGemvImpl (const char trans[], + typename AViewType::const_value_type& alpha, + const AViewType& A, + const XViewType& x, + typename YViewType::const_value_type& beta, + const YViewType& y) +{ + twoLevelGemv (trans, alpha, A, x, beta, y); +} + } // namespace Impl } // namespace KokkosBlas diff --git a/src/blas/impl/KokkosBlas2_gemv_spec.hpp b/src/blas/impl/KokkosBlas2_gemv_spec.hpp index 76d98c65bc..da7983b07a 100644 --- a/src/blas/impl/KokkosBlas2_gemv_spec.hpp +++ b/src/blas/impl/KokkosBlas2_gemv_spec.hpp @@ -136,22 +136,12 @@ struct GEMV { // Prefer int as the index type, but use a larger type if needed. if (numRows < static_cast (INT_MAX) && numCols < static_cast (INT_MAX)) { - #if 1 - twoLevelGemv + generalGemvImpl (trans, alpha, A, x, beta, y); - #else - singleLevelGemv - (trans, alpha, A, x, beta, y); - #endif } else { - #if 1 - twoLevelGemv - (trans, alpha, A, x, beta, y); - #else - singleLevelGemv + generalGemvImpl (trans, alpha, A, x, beta, y); - #endif } Kokkos::Profiling::popRegion(); } diff --git a/unit_test/blas/Test_Blas2_gemv.hpp b/unit_test/blas/Test_Blas2_gemv.hpp index 9ae63b5f8f..b7fd1870a9 100644 --- a/unit_test/blas/Test_Blas2_gemv.hpp +++ b/unit_test/blas/Test_Blas2_gemv.hpp @@ -26,7 +26,7 @@ namespace Test { ScalarA alpha = 3; ScalarX beta = 5; - double eps = (std::is_same::mag_type, float>::value ? 1e-3 : 1e-10); + double eps = (std::is_same::mag_type, float>::value ? 1e-3 : 3e-10); int ldx; int ldy; @@ -115,26 +115,35 @@ namespace Test { KokkosBlas::gemv(mode, alpha, A, x, beta, y); Kokkos::deep_copy(h_b_y, b_y); + int numErrors = 0; for(int i = 0; i < ldy; i++) { - EXPECT_NEAR_KK(expected(i), h_y(i), eps * expected(i)); + if(KAT::abs(expected(i) - h_y(i)) > KAT::abs(eps * expected(i))) + numErrors++; } + EXPECT_EQ(numErrors, 0) << "Nonconst input, " << M << 'x' << N << ", alpha = " << alpha << ", beta = " << beta << ", mode " << mode << ": gemv incorrect"; Kokkos::deep_copy(b_y, b_org_y); KokkosBlas::gemv(mode, alpha,A ,c_x, beta, y); Kokkos::deep_copy(h_b_y, b_y); + numErrors = 0; for(int i = 0; i < ldy; i++) { - EXPECT_NEAR_KK(expected(i), h_y(i), eps); + if(KAT::abs(expected(i) - h_y(i)) > KAT::abs(eps * expected(i))) + numErrors++; } + EXPECT_EQ(numErrors, 0) << "Const vector input, " << M << 'x' << N << ", alpha = " << alpha << ", beta = " << beta << ", mode " << mode << ": gemv incorrect"; Kokkos::deep_copy(b_y, b_org_y); KokkosBlas::gemv(mode, alpha, c_A, c_x, beta, y); Kokkos::deep_copy(h_b_y, b_y); + numErrors = 0; for(int i = 0; i < ldy; i++) { - EXPECT_NEAR_KK(expected(i), h_y(i), eps); + if(KAT::abs(expected(i) - h_y(i)) > KAT::abs(eps * expected(i))) + numErrors++; } + EXPECT_EQ(numErrors, 0) << "Const matrix/vector input, " << M << 'x' << N << ", alpha = " << alpha << ", beta = " << beta << ", mode " << mode << ": gemv incorrect"; } } @@ -156,8 +165,11 @@ int test_gemv(const char* mode) { Test::impl_test_gemv(mode,200,10); #endif Test::impl_test_gemv(mode,0,1024); + Test::impl_test_gemv(mode,1024,0); + Test::impl_test_gemv(mode,13,13); Test::impl_test_gemv(mode,13,1024); Test::impl_test_gemv(mode,1024,1024); + Test::impl_test_gemv(mode,4321,4321); //Test::impl_test_gemv(mode,132231,1024); #endif @@ -166,8 +178,11 @@ int test_gemv(const char* mode) { typedef Kokkos::View view_type_b_lr; typedef Kokkos::View view_type_c_lr; Test::impl_test_gemv(mode,0,1024); + Test::impl_test_gemv(mode,1024,0); + Test::impl_test_gemv(mode,13,13); Test::impl_test_gemv(mode,13,1024); Test::impl_test_gemv(mode,1024,1024); + Test::impl_test_gemv(mode,4321,4321); //Test::impl_test_gemv(mode,132231,1024); #endif @@ -176,8 +191,11 @@ int test_gemv(const char* mode) { typedef Kokkos::View view_type_b_ls; typedef Kokkos::View view_type_c_ls; Test::impl_test_gemv(mode,0,1024); + Test::impl_test_gemv(mode,1024,0); + Test::impl_test_gemv(mode,13,13); Test::impl_test_gemv(mode,13,1024); Test::impl_test_gemv(mode,1024,1024); + Test::impl_test_gemv(mode,4321,4321); //Test::impl_test_gemv(mode,132231,1024); #endif