kokkos · e10harvey · Jul 28, 2021 · Jul 13, 2021 · Jul 13, 2021 · Jul 13, 2021
diff --git a/src/batched/KokkosBatched_Gemm_Decl.hpp b/src/batched/KokkosBatched_Gemm_Decl.hpp
diff --git a/src/batched/KokkosBatched_Gemm_Handle.hpp b/src/batched/KokkosBatched_Gemm_Handle.hpp
@@ -0,0 +1,181 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.4
+//       Copyright (2021) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+
+//
+// Created by Harvey, Evan on 7/13/21.
+//
+
+#ifndef KOKKOSKERNELS_KOKKOSBATCHED_GEMM_HANDLE_HPP
+#define KOKKOSKERNELS_KOKKOSBATCHED_GEMM_HANDLE_HPP
+
+#include "KokkosBatched_Kernel_Handle.hpp"
+
+namespace KokkosBatched {
+
+/// \brief Tpl algorithm types. See BatchedGemmHandle for details.
+namespace GemmTplAlgos {  // ids start at N_BASE_ALGOS (declared elsewhere)
+enum GEMM_TPL_ALGOS : int { CUBLAS = N_BASE_ALGOS, MAGMA, N };  // N == MAGMA + 1
+}  // namespace GemmTplAlgos
+
+/// \brief KokkosBatched algorithm types. See BatchedGemmHandle for details.
+namespace GemmKokkosBatchedAlgos {
+enum GEMM_KOKKOS_BATCHED_ALGOS : int {
+  KK_TEAM = GemmTplAlgos::N,  // ids continue right after the TPL ids
+  KK_TEAMVECTOR,
+  KK_SERIALSIMD,
+  KK_TEAMSIMD,
+  KK_SERIAL_RANK0,
+  KK_SERIAL_SHMEM,
+  KK_DBLBUF,
+  N  // one past KK_DBLBUF; presumably an end marker like GemmTplAlgos::N
+};
+}  // namespace GemmKokkosBatchedAlgos
+
+// clang-format off
+/// \brief Handle for selecting runtime behavior of the BatchedGemm interface.
+///
+/// \var kernelAlgoType  Specifies which algorithm to use for invocation (default, SQUARE).
+///
+///                    Specifies whether to select optimal invocations based on inputs and
+///                    heuristics:
+///                      SQUARE select invocations based on square matrix heuristics where M=N
+///                      TALL   select invocations based on tall   matrix heuristics where M>N
+///                      WIDE   select invocations based on wide   matrix heuristics where M<N
+///                    Note: If the heuristics indicate SIMD views are required for optimal
+///                    performance, notify the user that SIMD views are required for
+///                    optimal performance.
+///
+///                    Specifies which cmake-enabled tpl algorithm to invoke:
+///                      ARMPL    Invoke the ArmPL TPL interface
+///                      MKL      Invoke the MKL TPL interface
+///                      CUBLAS   Invoke the CuBLAS TPL interface
+///                      MAGMA    Invoke the Magma TPL interface
+///                    Note: Requires that input views for A, B, and C reside on either host
+///                    or device depending on the TPL selected.
+///                    Note: If the user selects a TPL, an error will be thrown if:
+///                       1. The TPL is not enabled via cmake
+///                       2. The input views do not reside on the host/device as needed
+///
+///                    Specifies which kokkos-kernels (KK) algorithm to invoke:
+///                      KK_SERIAL       Invoke SerialGemm     via RangePolicy(BatchSz)
+///                      KK_TEAM         Invoke TeamGemm       via TeamPolicy(BatchSz)
+///                      KK_TEAMVECTOR   Invoke TeamVectorGemm via TeamPolicy(BatchSz)
+///                      KK_SERIALSIMD   Invoke SerialGemm     via TeamPolicy(BatchSz)
+///                      KK_TEAMSIMD     Invoke TeamGemm       via TeamPolicy(BatchSz)
+///                      KK_SERIAL_RANK0 Invoke SerialGemm     via RangePolicy(BatchSz*N*M)
+///                                      Each thread computes one element of C.
+///                      KK_SERIAL_SHMEM Invoke SerialGemm     via TeamPolicy(BatchSz)
+///                                      Copies A and B to shared memory before GEMM.
+///                                      Each vector lane solves one element of C via SerialGemm.
+///                      KK_DBLBUF       Solve GEMM            via TeamPolicy(BatchSz*TILES)
+///                                      Uses a tuned functor with tiling and double buffering
+///                                      via shared memory and register buffers.
+///                                      KK_DBLBUF generally performs better on GPUs when M, N >= 24.
+/// \var teamSz        Specifies the team size that will affect any KK algorithm which uses
+///                    TeamPolicy (default, Kokkos::AUTO).
+///                    Note: Only applied if kernelAlgoType == KK_*
+/// \var vecLen        Specifies the vector length that will affect any KK algorithm which
+///                    uses TeamPolicy and Kokkos::ThreadVectorRange or Kokkos::TeamVectorRange
+///                    (default, Kokkos::AUTO).
+///                    Note: Only applied if kernelAlgoType == KK_*
+// clang-format on
+class BatchedGemmHandle : public BatchedKernelHandle {
+ public:  // ctor below installs a default TPL handle/queue if none is set yet
+  BatchedGemmHandle(int kernelAlgoType = BaseHeuristicAlgos::SQUARE,
+                    int teamSize = 0, int vecLength = 0)
+      : BatchedKernelHandle(kernelAlgoType, teamSize, vecLength) {
+#if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS)
+    if (!_tplParamsSet && kernelAlgoType == GemmTplAlgos::CUBLAS) {
+      static cublasHandle_t cublas_handle;  // one object for all handles
+      _tplParamsSingleton.cublas_handle = &cublas_handle;
+      _tplParamsSet                     = true;
+    }
+#endif  // CUBLAS
+    // Same scheme for MAGMA: publish the address of a function-local static.
+#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA)
+    if (!_tplParamsSet && kernelAlgoType == GemmTplAlgos::MAGMA) {
+      static magma_queue_t magma_queue;  // one object for all handles
+      _tplParamsSingleton.magma_queue = &magma_queue;
+      _tplParamsSet                   = true;
+    }
+#endif  // MAGMA
+  }
+  // Records the caller's tplParamsSet flag; installs no TPL handle or queue.
+  BatchedGemmHandle(bool tplParamsSet,
+                    int kernelAlgoType = BaseHeuristicAlgos::SQUARE,
+                    int teamSize = 0, int vecLength = 0)
+      : BatchedKernelHandle(kernelAlgoType, teamSize, vecLength) {
+    _tplParamsSet = tplParamsSet;
+  }
+  // Stores the address of the caller's cuBLAS handle; keep it alive while used.
+#if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS)
+  BatchedGemmHandle(cublasHandle_t &cublas_handle,
+                    int kernelAlgoType = BaseHeuristicAlgos::SQUARE,
+                    int teamSize = 0, int vecLength = 0)
+      : BatchedGemmHandle(true, kernelAlgoType, teamSize, vecLength) {
+    _tplParamsSingleton.cublas_handle = &cublas_handle;
+  }
+#endif  // CUBLAS
+  // Stores the address of the caller's MAGMA queue; keep it alive while used.
+#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA)
+  BatchedGemmHandle(magma_queue_t &magma_queue,
+                    int kernelAlgoType = BaseHeuristicAlgos::SQUARE,
+                    int teamSize = 0, int vecLength = 0)
+      : BatchedGemmHandle(true, kernelAlgoType, teamSize, vecLength) {
+    _tplParamsSingleton.magma_queue = &magma_queue;
+  }
+#endif  // MAGMA
+  // Selected at compile time by enabled TPL (cuBLAS first), not by algo type.
+  decltype(auto) get_tpl_params() {
+#if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS)  // #if cannot read runtime members
+    return _tplParamsSingleton.cublas_handle;
+#elif defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA)  // reached only without cuBLAS
+    return _tplParamsSingleton.magma_queue;
+#else
+    return this->BatchedKernelHandle::get_tpl_params();
+#endif
+  }
+};
+
+}  // namespace KokkosBatched
+
+#endif  // KOKKOSKERNELS_KOKKOSBATCHED_GEMM_HANDLE_HPP
diff --git a/src/batched/KokkosBatched_Gemm_Serial_Impl.hpp b/src/batched/KokkosBatched_Gemm_Serial_Impl.hpp
@@ -1,8 +1,47 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.4
+//       Copyright (2021) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
 #ifndef __KOKKOSBATCHED_GEMM_SERIAL_IMPL_HPP__
 #define __KOKKOSBATCHED_GEMM_SERIAL_IMPL_HPP__
 
-/// \author Kyungjoo Kim (kyukim@sandia.gov)
-
 #include "KokkosBatched_Util.hpp"
 #include "KokkosBatched_Gemm_Serial_Internal.hpp"
 
@@ -342,24 +381,20 @@ SerialGemm<Trans::Transpose, Trans::Transpose, Algo::Gemm::Blocked>::invoke(
 /********************* END functor-level routines *********************/
 
 /********************* BEGIN non-functor-level routines *********************/
-namespace Experimental {
-template <class ScalarType, class AViewType, class BViewType, class CViewType,
-          class ArgTransA, class ArgTransB, class ArgMode, class ArgBatchLayout,
-          class ArgResultsPerThread>
-struct BatchedSerialGemmFunctor {
+template <class ArgTransA, class ArgTransB, class ArgMode, class ArgBatchSzDim,
+          class ArgResultsPerThread, class ScalarType, class AViewType,
+          class BViewType, class CViewType>
+class BatchedSerialGemm {
+ private:
   AViewType A;
   BViewType B;
   CViewType C;
   ScalarType alpha, beta;
   size_t divisor, c_cols, batch_size;
-  ArgBatchLayout batch_layout_tag;
+  ArgBatchSzDim batch_layout_tag;
   ArgTransA transA_tag;
   ArgTransB transB_tag;
 
-  BatchedSerialGemmFunctor(ScalarType _alpha, AViewType _A, BViewType _B,
-                           ScalarType _beta, CViewType _C)
-      : A(_A), B(_B), C(_C), alpha(_alpha), beta(_beta) {}
-
   // subview_wrapper overloads for handling 3-rank BatchLayout::Left views
   template <class ViewType, class IdxType1, class IdxType2, class IdxType3>
   KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1,
@@ -400,6 +435,52 @@ struct BatchedSerialGemmFunctor {
     return subview_wrapper(v, i1, i3, i2, layout_tag);
   }
 
+  void run() {  // Launches this functor over the index range [0, batch_size).
+    using execution_space = typename CViewType::device_type::execution_space;
+    using policy_type =
+        Kokkos::RangePolicy<ArgResultsPerThread, execution_space>;  // work tag
+    Kokkos::parallel_for("BatchedSerialGemm", policy_type(0, batch_size),
+                         *this);  // *this is passed as the functor
+  }
+
+ public:
+  int invoke() {  // Returns 0 after launching, -1 for an unsupported tag.
+    if (std::is_same<ArgResultsPerThread, ResultsPerThread::Rank0>::value) {
+      // Rank0: divisor (elements of C per batch entry) and c_cols allow each
+      // thread to calculate its C output index from the flat launch index
+      if (std::is_same<ArgBatchSzDim, BatchLayout::Left>::value) {  // batch: dim 0
+        batch_size = C.extent(0);
+        divisor    = C.extent(1) * C.extent(2);
+        c_cols     = C.extent(2);
+      } else {  // batch: dim 2
+        batch_size = C.extent(2);
+        divisor    = C.extent(0) * C.extent(1);
+        c_cols     = C.extent(1);
+      }
+
+      // Launch range becomes (batch entries) * (elements of C per entry)
+      batch_size *= divisor;
+
+      run();
+    } else if (std::is_same<ArgResultsPerThread,
+                            ResultsPerThread::Rank2>::value) {
+      if (std::is_same<ArgBatchSzDim, BatchLayout::Left>::value)  // batch: dim 0
+        batch_size = C.extent(0);
+      else  // batch: dim 2; launch range is the batch extent only
+        batch_size = C.extent(2);
+
+      run();
+    } else {
+      std::cerr << "Error: ArgResultsPerThread not supported" << std::endl;
+      return -1;
+    }
+    return 0;
+  }
+
+  BatchedSerialGemm(ScalarType _alpha, AViewType _A, BViewType _B,
+                    ScalarType _beta, CViewType _C)  // stores scalars and views
+      : A(_A), B(_B), C(_C), alpha(_alpha), beta(_beta) {}  // sizes set in invoke()
+
   KOKKOS_INLINE_FUNCTION
   void operator()(const ResultsPerThread::Rank0 &, const int &i) const {
     // Here, the batch_idx is strided by c_rows * c_cols
@@ -442,67 +523,8 @@ struct BatchedSerialGemmFunctor {
     KokkosBatched::SerialGemm<ArgTransA, ArgTransB, ArgMode>::invoke(
         alpha, svA, svB, beta, svC);
   }
-
-  void run() {
-    using execution_space = typename CViewType::device_type::execution_space;
-    using policy_type =
-        Kokkos::RangePolicy<ArgResultsPerThread, execution_space>;
-    Kokkos::parallel_for("BatchedSerialGemm", policy_type(0, batch_size),
-                         *this);
-  }
-};
-
-template <class ArgTransA, class ArgTransB, class ArgMode, class ArgBatchLayout,
-          class ArgResultsPerThread>
-struct BatchedSerialGemm {
-  template <class ScalarType, class AViewType, class BViewType, class CViewType>
-  static int invoke(const ScalarType alpha, const AViewType &A,
-                    const BViewType &B, const ScalarType beta,
-                    const CViewType &C) {
-    if (std::is_same<ArgResultsPerThread, ResultsPerThread::Rank0>::value) {
-      BatchedSerialGemmFunctor<ScalarType, AViewType, BViewType, CViewType,
-                               ArgTransA, ArgTransB, ArgMode, ArgBatchLayout,
-                               ArgResultsPerThread>
-          functor(alpha, A, B, beta, C);
-
-      // Set members for ResultsPerThread::Rank0 operator; these members allow
-      // each thread to calculate its C output index
-      if (std::is_same<ArgBatchLayout, BatchLayout::Left>::value) {
-        functor.batch_size = C.extent(0);
-        functor.divisor    = C.extent(1) * C.extent(2);
-        functor.c_cols     = C.extent(2);
-      } else {
-        functor.batch_size = C.extent(2);
-        functor.divisor    = C.extent(0) * C.extent(1);
-        functor.c_cols     = C.extent(1);
-      }
-
-      // Increase the number of threads by the divisor
-      functor.batch_size *= functor.divisor;
-
-      functor.run();
-    } else if (std::is_same<ArgResultsPerThread,
-                            ResultsPerThread::Rank2>::value) {
-      using argTransA = ArgTransA;
-      BatchedSerialGemmFunctor<ScalarType, AViewType, BViewType, CViewType,
-                               argTransA, ArgTransB, ArgMode, ArgBatchLayout,
-                               ArgResultsPerThread>
-          functor(alpha, A, B, beta, C);
-      if (std::is_same<ArgBatchLayout, BatchLayout::Left>::value)
-        functor.batch_size = C.extent(0);
-      else
-        functor.batch_size = C.extent(2);
-
-      functor.run();
-    } else {
-      std::cerr << "Error: ArgResultsPerThread not supported" << std::endl;
-      return -1;
-    }
-    return 0;
-  }
 };
 /********************* END non-functor-level routines *********************/
-}  // namespace Experimental
 }  // namespace KokkosBatched
 
 #endif