diff --git a/.gitmodules b/.gitmodules index 979195348..e9fbbbd46 100644 --- a/.gitmodules +++ b/.gitmodules @@ -14,7 +14,7 @@ url = https://github.com/kokkos/kokkos.git [submodule "vendor/kokkos-kernels"] path = vendor/kokkos-kernels - url = https://github.com/yasahi-hpc/kokkos-kernels.git + url = https://github.com/kokkos/kokkos-kernels.git [submodule "vendor/doxygen-awesome-css"] path = vendor/doxygen-awesome-css url = https://github.com/jothepro/doxygen-awesome-css.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 5bba7b20d..4b21db53b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -208,9 +208,7 @@ if("${DDC_BUILD_KERNELS_SPLINES}") ) if("${DDC_KokkosKernels_DEPENDENCY_POLICY}" STREQUAL "AUTO") if(NOT TARGET Kokkos::kokkoskernels) - # fork https://github.com/yasahi-hpc/kokkos-kernels - # on branch develop-spline-kernels-v2 - find_package(KokkosKernels QUIET) + find_package(KokkosKernels 4.5...<5 QUIET) if(NOT KokkosKernels_FOUND) ddc_configure_kokkos_kernels() endif() @@ -218,9 +216,7 @@ if("${DDC_BUILD_KERNELS_SPLINES}") elseif("${DDC_KokkosKernels_DEPENDENCY_POLICY}" STREQUAL "EMBEDDED") ddc_configure_kokkos_kernels() elseif("${DDC_KokkosKernels_DEPENDENCY_POLICY}" STREQUAL "INSTALLED") - # fork https://github.com/yasahi-hpc/kokkos-kernels - # on branch develop-spline-kernels-v2 - find_package(KokkosKernels REQUIRED) + find_package(KokkosKernels 4.5...<5 REQUIRED) endif() add_library(ddc_splines INTERFACE) diff --git a/README.md b/README.md index fcda4dcf2..b431705fd 100644 --- a/README.md +++ b/README.md @@ -48,8 +48,7 @@ To use DDC components, one needs the following dependencies: * PDI 1.6...<2 * (optional, spline interpolation) DDC::splines * Ginkgo 1.8.0 - * Kokkos Kernels fork on branch develop-spline-kernels-v2 - + * Kokkos Kernels 4.5...<5 ## Getting the code and basic configuration diff --git a/cmake/DDCConfig.cmake.in b/cmake/DDCConfig.cmake.in index 0e9c74b28..15805c318 100644 --- a/cmake/DDCConfig.cmake.in +++ b/cmake/DDCConfig.cmake.in 
@@ -28,9 +28,7 @@ if(@DDC_BUILD_KERNELS_SPLINES@) list(PREPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}) ddc_find_dependency(LAPACKE) list(POP_FRONT CMAKE_MODULE_PATH) - # fork https://github.com/yasahi-hpc/kokkos-kernels - # on branch develop-spline-kernels-v2 - ddc_find_dependency(KokkosKernels) + ddc_find_dependency(KokkosKernels 4.5...<5) endif() if(@DDC_BUILD_PDI_WRAPPER@) diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index 21f8a9648..d0eebe5cf 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -21,7 +21,7 @@ set(DOXYGEN_EXAMPLE_PATH "${DDC_SOURCE_DIR}/examples") set(DOXYGEN_EXPAND_ONLY_PREDEF YES) set(DOXYGEN_EXTRACT_ALL YES CACHE STRING "") set(DOXYGEN_EXCLUDE_SYMBOLS "detail") -set(DOXYGEN_EXCLUDE_PATTERNS "*/experimental/*;*/detail/*") +set(DOXYGEN_EXCLUDE_PATTERNS "*/detail/*;*/experimental/*;*/kokkos-kernels-ext/*") set(DOXYGEN_EXTRACT_LOCAL_CLASSES YES) set(DOXYGEN_FULL_PATH_NAMES NO) set(DOXYGEN_HTML_OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/html") diff --git a/include/ddc/kernels/splines/kokkos-kernels-ext/KokkosBatched_Gbtrs.hpp b/include/ddc/kernels/splines/kokkos-kernels-ext/KokkosBatched_Gbtrs.hpp new file mode 100644 index 000000000..d1c1c6d31 --- /dev/null +++ b/include/ddc/kernels/splines/kokkos-kernels-ext/KokkosBatched_Gbtrs.hpp @@ -0,0 +1,59 @@ +// Copyright (C) The DDC development team, see COPYRIGHT.md file +// +// SPDX-License-Identifier: MIT + +// clang-format off +// NOLINTBEGIN(*) + +#ifndef KOKKOSBATCHED_GBTRS_HPP_ +#define KOKKOSBATCHED_GBTRS_HPP_ + +#include + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +/// \brief Serial Batched Gbtrs: +/// +/// Solve A_l x_l = b_l for all l = 0, ..., N +/// with a general band matrix A using the LU factorization computed +/// by gbtrf. 
+/// +/// \tparam AViewType: Input type for the matrix, needs to be a 2D view +/// \tparam BViewType: Input type for the right-hand side and the solution, +/// needs to be a 1D view +/// \tparam PivViewType: Input type for the pivot indices, needs to be a 1D view +/// +/// \param A [in]: A is an ldab by n banded matrix. +/// Details of the LU factorization of the band matrix A, as computed by +/// gbtrf. U is stored as an upper triangular band matrix with KL+KU +/// superdiagonals in rows 0 to KL+KU, and the multipliers used during +/// the factorization are stored in rows KL+KU+1 to 2*KL+KU (0-based rows). +/// \param b [inout]: right-hand side and the solution +/// \param piv [in]: The pivot indices; for 0 <= i < N, row i of the matrix +/// was interchanged with row piv(i) (0-based indices). +/// \param kl [in]: kl specifies the number of subdiagonals within the band +/// of A. kl >= 0 +/// \param ku [in]: ku specifies the number of superdiagonals within the band +/// of A. ku >= 0 +/// +/// No nested parallel_for is used inside of the function. 
+/// + +template +struct SerialGbtrs { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, + const BViewType &b, + const PivViewType &piv, const int kl, + const int ku); +}; +} // namespace KokkosBatched + +#include "KokkosBatched_Gbtrs_Serial_Impl.hpp" + +#endif // KOKKOSBATCHED_GBTRS_HPP_ + +// NOLINTEND(*) +// clang-format on diff --git a/include/ddc/kernels/splines/kokkos-kernels-ext/KokkosBatched_Gbtrs_Serial_Impl.hpp b/include/ddc/kernels/splines/kokkos-kernels-ext/KokkosBatched_Gbtrs_Serial_Impl.hpp new file mode 100644 index 000000000..2625434f1 --- /dev/null +++ b/include/ddc/kernels/splines/kokkos-kernels-ext/KokkosBatched_Gbtrs_Serial_Impl.hpp @@ -0,0 +1,170 @@ +// Copyright (C) The DDC development team, see COPYRIGHT.md file +// +// SPDX-License-Identifier: MIT + +// clang-format off +// NOLINTBEGIN(*) + +#ifndef KOKKOSBATCHED_GBTRS_SERIAL_IMPL_HPP_ +#define KOKKOSBATCHED_GBTRS_SERIAL_IMPL_HPP_ + +#include +#include +#include +#include + +namespace KokkosBatched { + +template +KOKKOS_INLINE_FUNCTION static int checkGbtrsInput( + [[maybe_unused]] const AViewType &A, [[maybe_unused]] const BViewType &b, + [[maybe_unused]] const int kl, [[maybe_unused]] const int ku) { + static_assert(Kokkos::is_view_v, + "KokkosBatched::gbtrs: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view_v, + "KokkosBatched::gbtrs: BViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, + "KokkosBatched::gbtrs: AViewType must have rank 2."); + static_assert(BViewType::rank == 1, + "KokkosBatched::gbtrs: BViewType must have rank 1."); +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + if (kl < 0) { + Kokkos::printf( + "KokkosBatched::gbtrs: input parameter kl must not be less than 0: kl " + "= " + "%d\n", + kl); + return 1; + } + + if (ku < 0) { + Kokkos::printf( + "KokkosBatched::gbtrs: input parameter ku must not be less than 0: ku " + "= " + "%d\n", + ku); + return 1; + } + + const int lda = A.extent(0), n = A.extent(1); + if (lda < (2 
* kl + ku + 1)) { + Kokkos::printf( + "KokkosBatched::gbtrs: leading dimension of A must not be smaller than 2 * " + "kl + ku + 1: " + "lda = %d, kl = %d, ku = %d\n", + lda, kl, ku); + return 1; + } + + const int ldb = b.extent(0); + if (ldb < Kokkos::max(1, n)) { + Kokkos::printf( + "KokkosBatched::gbtrs: leading dimension of b must not be smaller than " + "max(1, n): " + "ldb = %d, n = %d\n", + ldb, n); + return 1; + } + +#endif + return 0; +} + +//// Non-transpose //// +template <> +struct SerialGbtrs { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, + const BViewType &b, + const PivViewType &piv, const int kl, + const int ku) { + // Quick return if possible (A has no columns, nothing to solve) + const int n = A.extent(1); + if (n == 0) return 0; + + auto info = checkGbtrsInput(A, b, kl, ku); + if (info) return info; + + bool lonti = kl > 0; + const int kd = ku + kl + 1; + if (lonti) { + for (int j = 0; j < n - 1; ++j) { + const int lm = Kokkos::min(kl, n - j - 1); + auto l = piv(j); + // If pivot index is not j, swap rows l and j in b + if (l != j) { + Kokkos::kokkos_swap(b(l), b(j)); + } + + // Perform a rank-1 update of the remaining part of the current column + // (ger) + for (int i = 0; i < lm; ++i) { + b(j + 1 + i) = b(j + 1 + i) - A(kd + i, j) * b(j); + } + } + } + + // Solve U*X = b for each right hand side, overwriting B with X. + [[maybe_unused]] auto info_tbsv = + KokkosBatched::SerialTbsv::invoke(A, b, kl + ku); + + return 0; + } +}; + +//// Transpose //// +template <> +struct SerialGbtrs { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, + const BViewType &b, + const PivViewType &piv, const int kl, + const int ku) { + // Quick return if possible (A has no columns, nothing to solve) + const int n = A.extent(1); + if (n == 0) return 0; + + auto info = checkGbtrsInput(A, b, kl, ku); + if (info) return info; + + bool lonti = kl > 0; + const int kd = ku + kl + 1; + + // Solve U*X = b for each right hand side, overwriting B with X. 
+ [[maybe_unused]] auto info_tbsv = + KokkosBatched::SerialTbsv::invoke(A, b, kl + ku); + + if (lonti) { + for (int j = n - 2; j >= 0; --j) { + const int lm = Kokkos::min(kl, n - j - 1); + + // Gemv transposed + auto a = Kokkos::subview(b, Kokkos::pair(j + 1, j + 1 + lm)); + auto x = Kokkos::subview(A, Kokkos::pair(kd, kd + lm), j); + auto y = Kokkos::subview(b, Kokkos::pair(j, j + lm)); + + [[maybe_unused]] auto info_gemv = + KokkosBlas::Impl::SerialGemvInternal::invoke( + 1, a.extent(0), -1.0, a.data(), a.stride_0(), a.stride_0(), + x.data(), x.stride_0(), 1.0, y.data(), y.stride_0()); + + // If pivot index is not j, swap rows l and j in b + auto l = piv(j); + if (l != j) { + Kokkos::kokkos_swap(b(l), b(j)); + } + } + } + + return 0; + } +}; +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_GBTRS_SERIAL_IMPL_HPP_ + +// NOLINTEND(*) +// clang-format on diff --git a/include/ddc/kernels/splines/kokkos-kernels-ext/KokkosBatched_Getrs.hpp b/include/ddc/kernels/splines/kokkos-kernels-ext/KokkosBatched_Getrs.hpp new file mode 100644 index 000000000..633b90b8a --- /dev/null +++ b/include/ddc/kernels/splines/kokkos-kernels-ext/KokkosBatched_Getrs.hpp @@ -0,0 +1,48 @@ +// Copyright (C) The DDC development team, see COPYRIGHT.md file +// +// SPDX-License-Identifier: MIT + +// clang-format off +// NOLINTBEGIN(*) + +#ifndef KOKKOSBATCHED_GETRS_HPP_ +#define KOKKOSBATCHED_GETRS_HPP_ + +#include + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +/// \brief Serial Batched Getrs: +/// Solve a system of linear equations +/// A * x = b or A**T * x = b +/// with a general N-by-N matrix A using LU factorization computed +/// by Getrf. 
+/// \tparam AViewType: Input type for the matrix, needs to be a 2D view +/// \tparam PivViewType: Input type for the pivot indices, needs to be a 1D view +/// \tparam BViewType: Input type for the right-hand side and the solution, +/// needs to be a 1D view +/// +/// \param A [in]: the LU factors of the N-by-N matrix A as computed by Getrf, a rank 2 view +/// \param piv [in]: the pivot indices of the factorization, applied to b, a rank 1 view +/// \param b [inout]: right-hand side and the solution, a rank 1 view +/// +/// No nested parallel_for is used inside of the function. +/// + +template +struct SerialGetrs { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, + const PivViewType &piv, + const BViewType &b); +}; +} // namespace KokkosBatched + +#include "KokkosBatched_Getrs_Serial_Impl.hpp" + +#endif // KOKKOSBATCHED_GETRS_HPP_ + +// NOLINTEND(*) +// clang-format on diff --git a/include/ddc/kernels/splines/kokkos-kernels-ext/KokkosBatched_Getrs_Serial_Impl.hpp b/include/ddc/kernels/splines/kokkos-kernels-ext/KokkosBatched_Getrs_Serial_Impl.hpp new file mode 100644 index 000000000..4840c1280 --- /dev/null +++ b/include/ddc/kernels/splines/kokkos-kernels-ext/KokkosBatched_Getrs_Serial_Impl.hpp @@ -0,0 +1,99 @@ +// Copyright (C) The DDC development team, see COPYRIGHT.md file +// +// SPDX-License-Identifier: MIT + +// clang-format off +// NOLINTBEGIN(*) + +#ifndef KOKKOSBATCHED_GETRS_SERIAL_IMPL_HPP_ +#define KOKKOSBATCHED_GETRS_SERIAL_IMPL_HPP_ + +#include +#include +#include + +namespace KokkosBatched { + +template +KOKKOS_INLINE_FUNCTION static int checkGetrsInput([[maybe_unused]] const AViewType &A, + [[maybe_unused]] const BViewType &b) { + static_assert(Kokkos::is_view::value, "KokkosBatched::getrs: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::getrs: BViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, "KokkosBatched::getrs: AViewType must have rank 2."); + static_assert(BViewType::rank == 1, "KokkosBatched::getrs: 
BViewType must have rank 1."); +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + const int lda = A.extent(0), n = A.extent(1); + if (lda < Kokkos::max(1, n)) { + Kokkos::printf( + "KokkosBatched::getrs: the leading dimension of the array A must " + "satisfy lda >= max(1, n): A: " + "%d " + "x %d \n", + lda, n); + return 1; + } + + const int ldb = b.extent(0); + if (ldb < Kokkos::max(1, n)) { + Kokkos::printf( + "KokkosBatched::getrs: the leading dimension of the array b must " + "satisfy ldb >= max(1, n): b: %d, A: " + "%d " + "x %d \n", + ldb, lda, n); + return 1; + } +#endif + return 0; +} + +//// Non-transpose //// +template <> +struct SerialGetrs { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const PivViewType &piv, const BViewType &b) { + // quick return if possible (A has no columns, nothing to solve) + if (A.extent(1) == 0) return 0; + + auto info = checkGetrsInput(A, b); + if (info) return info; + + [[maybe_unused]] auto info_laswp = KokkosBatched::SerialLaswp::invoke(piv, b); + + [[maybe_unused]] auto info_trsm = KokkosBatched::SerialTrsm::invoke(1.0, A, b); + info_trsm = KokkosBatched::SerialTrsm::invoke(1.0, A, b); + + return 0; + } +}; + +//// Transpose //// +template <> +struct SerialGetrs { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const PivViewType &piv, const BViewType &b) { + // quick return if possible (A has no columns, nothing to solve) + if (A.extent(1) == 0) return 0; + + auto info = checkGetrsInput(A, b); + if (info) return info; + + [[maybe_unused]] auto info_trsm = KokkosBatched::SerialTrsm::invoke(1.0, A, b); + info_trsm = + KokkosBatched::SerialTrsm::invoke( + 1.0, A, b); + + [[maybe_unused]] auto info_laswp = KokkosBatched::SerialLaswp::invoke(piv, b); + + return 0; + } +}; +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_GETRS_SERIAL_IMPL_HPP_ + +// NOLINTEND(*) +// clang-format on diff --git a/include/ddc/kernels/splines/splines_linear_problem_band.hpp b/include/ddc/kernels/splines/splines_linear_problem_band.hpp index 2ddcc87a7..3d674b729 100644 --- 
a/include/ddc/kernels/splines/splines_linear_problem_band.hpp +++ b/include/ddc/kernels/splines/splines_linear_problem_band.hpp @@ -20,9 +20,10 @@ #include #endif -#include #include +#include "kokkos-kernels-ext/KokkosBatched_Gbtrs.hpp" + #include "splines_linear_problem.hpp" namespace ddc::detail { @@ -189,7 +190,7 @@ class SplinesLinearProblemBand : public SplinesLinearProblem auto sub_b = Kokkos::subview(b, Kokkos::ALL, i); KokkosBatched::SerialGbtrs< KokkosBatched::Trans::Transpose, - KokkosBatched::Algo::Gbtrs::Unblocked>:: + KokkosBatched::Algo::Level3::Unblocked>:: invoke(q_device, sub_b, ipiv_device, kl_proxy, ku_proxy); }); } else { @@ -200,7 +201,7 @@ class SplinesLinearProblemBand : public SplinesLinearProblem auto sub_b = Kokkos::subview(b, Kokkos::ALL, i); KokkosBatched::SerialGbtrs< KokkosBatched::Trans::NoTranspose, - KokkosBatched::Algo::Gbtrs::Unblocked>:: + KokkosBatched::Algo::Level3::Unblocked>:: invoke(q_device, sub_b, ipiv_device, kl_proxy, ku_proxy); }); } diff --git a/include/ddc/kernels/splines/splines_linear_problem_dense.hpp b/include/ddc/kernels/splines/splines_linear_problem_dense.hpp index 0a5616049..8ee0c22b9 100644 --- a/include/ddc/kernels/splines/splines_linear_problem_dense.hpp +++ b/include/ddc/kernels/splines/splines_linear_problem_dense.hpp @@ -18,9 +18,10 @@ #include #endif -#include #include +#include "kokkos-kernels-ext/KokkosBatched_Getrs.hpp" + #include "splines_linear_problem.hpp" namespace ddc::detail { @@ -132,7 +133,7 @@ class SplinesLinearProblemDense : public SplinesLinearProblem auto sub_b = Kokkos::subview(b, Kokkos::ALL, i); KokkosBatched::SerialGetrs< KokkosBatched::Trans::Transpose, - KokkosBatched::Algo::Getrs::Unblocked>:: + KokkosBatched::Algo::Level3::Unblocked>:: invoke(a_device, ipiv_device, sub_b); }); } else { @@ -143,7 +144,7 @@ class SplinesLinearProblemDense : public SplinesLinearProblem auto sub_b = Kokkos::subview(b, Kokkos::ALL, i); KokkosBatched::SerialGetrs< KokkosBatched::Trans::NoTranspose, - 
KokkosBatched::Algo::Getrs::Unblocked>:: + KokkosBatched::Algo::Level3::Unblocked>:: invoke(a_device, ipiv_device, sub_b); }); } diff --git a/vendor/kokkos-kernels b/vendor/kokkos-kernels index 6149b341a..6e2ba940f 160000 --- a/vendor/kokkos-kernels +++ b/vendor/kokkos-kernels @@ -1 +1 @@ -Subproject commit 6149b341a61410d01ab7fa977a267b778bc7fd36 +Subproject commit 6e2ba940f5c8efc3ad1d7aca8cf4313073a186de