diff --git a/blas/CMakeLists.txt b/blas/CMakeLists.txt index d6ce98dae9..816d68e443 100644 --- a/blas/CMakeLists.txt +++ b/blas/CMakeLists.txt @@ -304,6 +304,13 @@ KOKKOSKERNELS_GENERATE_ETI(Blas2_syr syr TYPE_LISTS FLOATS LAYOUTS DEVICES ) +KOKKOSKERNELS_GENERATE_ETI(Blas2_syr2 syr2 + COMPONENTS blas + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS LAYOUTS DEVICES +) + KOKKOSKERNELS_GENERATE_ETI(Blas3_gemm gemm COMPONENTS blas HEADER_LIST ETI_HEADERS diff --git a/blas/eti/generated_specializations_cpp/syr2/KokkosBlas2_syr2_eti_spec_inst.cpp.in b/blas/eti/generated_specializations_cpp/syr2/KokkosBlas2_syr2_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..669b5fd1aa --- /dev/null +++ b/blas/eti/generated_specializations_cpp/syr2/KokkosBlas2_syr2_eti_spec_inst.cpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosKernels_config.h" +#include "KokkosBlas2_syr2_spec.hpp" + +namespace KokkosBlas { +namespace Impl { +@BLAS2_SYR2_ETI_INST_BLOCK@ +} //IMPL +} //Kokkos diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas2_syr2_eti_spec_avail.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas2_syr2_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..9e7a01653e --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas2_syr2_eti_spec_avail.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBLAS2_SYR2_ETI_SPEC_AVAIL_HPP_ + +namespace KokkosBlas { +namespace Impl { +@BLAS2_SYR2_ETI_AVAIL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/blas/impl/KokkosBlas2_syr2_impl.hpp b/blas/impl/KokkosBlas2_syr2_impl.hpp new file mode 100644 index 0000000000..69284e9547 --- /dev/null +++ b/blas/impl/KokkosBlas2_syr2_impl.hpp @@ -0,0 +1,369 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_IMPL_HPP_ +#define KOKKOSBLAS2_SYR2_IMPL_HPP_ + +#include "KokkosKernels_config.h" +#include "Kokkos_Core.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" +#include "Kokkos_ArithTraits.hpp" + +namespace KokkosBlas { +namespace Impl { + +// Functor for the thread parallel version of SYR2. +// This functor parallelizes over rows of the input matrix A. +template +struct ThreadParallelSYR2 { + using AlphaCoeffType = typename AViewType::non_const_value_type; + using XComponentType = typename XViewType::non_const_value_type; + using YComponentType = typename YViewType::non_const_value_type; + using AComponentType = typename AViewType::non_const_value_type; + + ThreadParallelSYR2(const AlphaCoeffType& alpha, const XViewType& x, + const YViewType& y, const AViewType& A) + : alpha_(alpha), x_(x), y_(y), A_(A) { + // Nothing to do + } + + KOKKOS_INLINE_FUNCTION void operator()(const IndexType& i) const { + if (alpha_ == Kokkos::ArithTraits::zero()) { + // Nothing to do + } else if ((x_(i) == Kokkos::ArithTraits::zero()) && + (y_(i) == Kokkos::ArithTraits::zero())) { + // Nothing to do + } else { + const XComponentType x_fixed(x_(i)); + const YComponentType y_fixed(y_(i)); + const IndexType N(A_.extent(1)); + + if constexpr (tJustTranspose) { + if (x_fixed != Kokkos::ArithTraits::zero()) { + for (IndexType j = 0; j < N; ++j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_fixed * y_(j)); + } + } + } + if (y_fixed != Kokkos::ArithTraits::zero()) { + for (IndexType j = 0; j < N; ++j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * y_fixed * x_(j)); + } + } + } + } else { + if (x_fixed != Kokkos::ArithTraits::zero()) { + for (IndexType j = 0; j < N; ++j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType( + alpha_ * x_fixed * + Kokkos::ArithTraits::conj(y_(j))); + } + } + } + if (y_fixed != Kokkos::ArithTraits::zero()) { + for (IndexType j = 0; j < N; ++j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType( + Kokkos::ArithTraits::conj(alpha_) * y_fixed * + Kokkos::ArithTraits::conj(x_(j))); + } + } + } + } + } + } + + private: + AlphaCoeffType alpha_; + typename XViewType::const_type x_; + typename YViewType::const_type y_; + AViewType A_; +}; + +// Thread parallel version of SYR2. +template +void threadParallelSyr2(const ExecutionSpace& space, + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, + const AViewType& A) { + static_assert(std::is_integral::value, + "IndexType must be an integer"); + + using AlphaCoeffType = typename AViewType::non_const_value_type; + + if (x.extent(0) == 0) { + // no entries to update + } else if (y.extent(0) == 0) { + // no entries to update + } else if (alpha == Kokkos::ArithTraits::zero()) { + // no entries to update + } else { + Kokkos::RangePolicy rangePolicy(space, 0, + A.extent(0)); + ThreadParallelSYR2 + functor(alpha, x, y, A); + Kokkos::parallel_for("KokkosBlas::syr2[threadParallel]", rangePolicy, + functor); + } +} + +struct TeamParallelSYR2_LayoutLeftTag {}; +struct TeamParallelSYR2_LayoutRightTag {}; + +// --------------------------------------------------------------------------------------------- + +// Functor for the team parallel version of SYR2, designed for +// performance on GPUs. The kernel depends on the layout of A. +template +struct TeamParallelSYR2 { + using AlphaCoeffType = typename AViewType::non_const_value_type; + using XComponentType = typename XViewType::non_const_value_type; + using YComponentType = typename YViewType::non_const_value_type; + using AComponentType = typename AViewType::non_const_value_type; + + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + + TeamParallelSYR2(const AlphaCoeffType& alpha, const XViewType& x, + const YViewType& y, const AViewType& A) + : alpha_(alpha), x_(x), y_(y), A_(A) { + // Nothing to do + } + + public: + // LayoutLeft version: one team per column + KOKKOS_INLINE_FUNCTION void operator()(TeamParallelSYR2_LayoutLeftTag, + const member_type& team) const { + if (alpha_ == Kokkos::ArithTraits::zero()) { + // Nothing to do + } else { + const IndexType j(team.league_rank()); + if ((x_(j) == Kokkos::ArithTraits::zero()) && + (y_(j) == Kokkos::ArithTraits::zero())) { + // Nothing to do + } else { + const IndexType M(A_.extent(0)); + if constexpr (tJustTranspose) { + const XComponentType x_fixed(x_(j)); + const YComponentType y_fixed(y_(j)); + if (y_fixed != Kokkos::ArithTraits::zero()) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_(i) * y_fixed); + } + }); + } + if (x_fixed != Kokkos::ArithTraits::zero()) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * y_(i) * x_fixed); + } + }); + } + } else { + const XComponentType x_fixed( + Kokkos::ArithTraits::conj(x_(j))); + const YComponentType y_fixed( + Kokkos::ArithTraits::conj(y_(j))); + if (y_fixed != Kokkos::ArithTraits::zero()) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_(i) * y_fixed); + } + }); + } + if (x_fixed != Kokkos::ArithTraits::zero()) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType( + Kokkos::ArithTraits::conj(alpha_) * + y_(i) * x_fixed); + } + }); + } + } + } + } + } + + // LayoutRight version: one team per row + KOKKOS_INLINE_FUNCTION void operator()(TeamParallelSYR2_LayoutRightTag, + const member_type& team) const { + if (alpha_ == Kokkos::ArithTraits::zero()) { + // Nothing to do + } else { + const IndexType i(team.league_rank()); + if ((x_(i) == Kokkos::ArithTraits::zero()) && + (y_(i) == Kokkos::ArithTraits::zero())) { + // Nothing to do + } else { + const IndexType N(A_.extent(1)); + const XComponentType x_fixed(x_(i)); + const YComponentType y_fixed(y_(i)); + if constexpr (tJustTranspose) { + if (x_fixed != Kokkos::ArithTraits::zero()) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_fixed * y_(j)); + } + }); + } + if (y_fixed != Kokkos::ArithTraits::zero()) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * y_fixed * x_(j)); + } + }); + } + } else { + if (x_fixed != Kokkos::ArithTraits::zero()) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType( + alpha_ * x_fixed * + Kokkos::ArithTraits::conj(y_(j))); + } + }); + } + if (y_fixed != Kokkos::ArithTraits::zero()) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType( + Kokkos::ArithTraits::conj(alpha_) * + y_fixed * + Kokkos::ArithTraits::conj(x_(j))); + } + }); + } + } + } + } + } + + private: + AlphaCoeffType alpha_; + typename XViewType::const_type x_; + typename YViewType::const_type y_; + AViewType A_; +}; + +// Team parallel version of SYR2. +template +void teamParallelSyr2(const ExecutionSpace& space, + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, + const AViewType& A) { + static_assert(std::is_integral::value, + "IndexType must be an integer"); + + using AlphaCoeffType = typename AViewType::non_const_value_type; + + if (x.extent(0) == 0) { + // no entries to update + return; + } else if (y.extent(0) == 0) { + // no entries to update + return; + } else if (alpha == Kokkos::ArithTraits::zero()) { + // no entries to update + return; + } + + constexpr bool isLayoutLeft = + std::is_same::value; + using layout_tag = + typename std::conditional::type; + using TeamPolicyType = Kokkos::TeamPolicy; + TeamPolicyType teamPolicy; + if (isLayoutLeft) { + // LayoutLeft: one team per column + teamPolicy = TeamPolicyType(space, A.extent(1), Kokkos::AUTO); + } else { + // LayoutRight: one team per row + teamPolicy = TeamPolicyType(space, A.extent(0), Kokkos::AUTO); + } + + TeamParallelSYR2 + functor(alpha, x, y, A); + Kokkos::parallel_for("KokkosBlas::syr2[teamParallel]", teamPolicy, functor); +} + +// --------------------------------------------------------------------------------------------- + +// generalSyr2Impl(): +// - use thread parallel code (rangePolicy) if execution space is CPU; +// - use team parallel code (teamPolicy) if execution space is GPU. +// +// The 'enable_if' makes sure unused kernels are not instantiated. + +template ()>::type* = nullptr> +void generalSyr2Impl(const ExecutionSpace& space, + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, + const AViewType& A) { + threadParallelSyr2(space, alpha, x, y, A); +} + +template ()>::type* = nullptr> +void generalSyr2Impl(const ExecutionSpace& space, + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, + const AViewType& A) { + teamParallelSyr2(space, alpha, x, y, A); +} + +} // namespace Impl +} // namespace KokkosBlas + +#endif // KOKKOSBLAS2_SYR2_IMPL_HPP_ diff --git a/blas/impl/KokkosBlas2_syr2_spec.hpp b/blas/impl/KokkosBlas2_syr2_spec.hpp new file mode 100644 index 0000000000..01637ba1d4 --- /dev/null +++ b/blas/impl/KokkosBlas2_syr2_spec.hpp @@ -0,0 +1,180 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_SPEC_HPP_ +#define KOKKOSBLAS2_SYR2_SPEC_HPP_ + +#include "KokkosKernels_config.h" +#include "Kokkos_Core.hpp" + +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +#include +#endif + +namespace KokkosBlas { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct syr2_eti_spec_avail { + enum : bool { value = false }; +}; +} // namespace Impl +} // namespace KokkosBlas + +// +// Macro for declaration of full specialization availability +// KokkosBlas::Impl::SYR2. This is NOT for users!!! All the declarations of full +// specializations go in this header file. We may spread out definitions (see +// _INST macro below) across one or more .cpp files. +// +#define KOKKOSBLAS2_SYR2_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct syr2_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +// Include the actual specialization declarations +#include +#include + +namespace KokkosBlas { +namespace Impl { + +// +// syr2 +// + +// Implementation of KokkosBlas::syr2. +template < + class ExecutionSpace, class XViewType, class YViewType, class AViewType, + bool tpl_spec_avail = syr2_tpl_spec_avail::value, + bool eti_spec_avail = syr2_eti_spec_avail::value> +struct SYR2 { + static void syr2(const ExecutionSpace& space, const char trans[], + const char uplo[], + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + { + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + ? "KokkosBlas::syr2[ETI]" + : "KokkosBlas::syr2[noETI]"); + + typedef typename AViewType::size_type size_type; + const size_type numRows = A.extent(0); + const size_type numCols = A.extent(1); + + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); + bool justUp = (uplo[0] == 'U') || (uplo[0] == 'u'); + + // Prefer int as the index type, but use a larsyr2 type if needed. + if ((numRows < static_cast(INT_MAX)) && + (numCols < static_cast(INT_MAX))) { + if (justTranspose) { + if (justUp) { + generalSyr2Impl(space, alpha, x, y, A); + } else { + generalSyr2Impl(space, alpha, x, y, A); + } + } else { + if (justUp) { + generalSyr2Impl(space, alpha, x, y, A); + } else { + generalSyr2Impl(space, alpha, x, y, A); + } + } + } else { + if (justTranspose) { + if (justUp) { + generalSyr2Impl(space, alpha, x, y, A); + } else { + generalSyr2Impl(space, alpha, x, y, A); + } + } else { + if (justUp) { + generalSyr2Impl(space, alpha, x, y, A); + } else { + generalSyr2Impl(space, alpha, x, y, A); + } + } + } + + Kokkos::Profiling::popRegion(); + } +#else + ; +#endif // if !defined(KOKKOSKERNELS_ETI_ONLY) || + // KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +}; + +} // namespace Impl +} // namespace KokkosBlas + +// +// Macro for declaration of full specialization of KokkosBlas::Impl::SYR2. +// This is NOT for users!!! +// All the declarations of full specializations go in this header file. +// We may spread out definitions (see _DEF macro below) across one or more .cpp +// files. +// +#define KOKKOSBLAS2_SYR2_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; + +#define KOKKOSBLAS2_SYR2_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; + +#include + +#endif // KOKKOSBLAS2_SYR2_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas2_syr_impl.hpp b/blas/impl/KokkosBlas2_syr_impl.hpp index 439ed588db..685ca75997 100644 --- a/blas/impl/KokkosBlas2_syr_impl.hpp +++ b/blas/impl/KokkosBlas2_syr_impl.hpp @@ -94,7 +94,7 @@ void threadParallelSyr(const ExecutionSpace& space, A.extent(0)); ThreadParallelSYR functor(alpha, x, A); - Kokkos::parallel_for("KokkosBlas::syr[thredParallel]", rangePolicy, + Kokkos::parallel_for("KokkosBlas::syr[threadParallel]", rangePolicy, functor); } } diff --git a/blas/src/KokkosBlas2_syr2.hpp b/blas/src/KokkosBlas2_syr2.hpp new file mode 100644 index 0000000000..c9a2f7b2c5 --- /dev/null +++ b/blas/src/KokkosBlas2_syr2.hpp @@ -0,0 +1,236 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_HPP_ +#define KOKKOSBLAS2_SYR2_HPP_ + +#include +#include + +namespace KokkosBlas { + +/// \brief Rank-1 update (just lower portion or just upper portion) of a +/// matrix A that is: +/// - symmetric, A += alpha * x * y^T + alpha * y * x^T, or +/// - Hermitian, A += alpha * x * y^H + conj(alpha) * y * x^H. +/// +/// Important note 1: this routine encapsulates the syr2() and her2() +/// routines specified in BLAS documentations. It has the purpose of +/// updating a symmetric (or Hermitian) matrix A in such a way that +/// it continues to be symmetric (or Hermitian). +/// +/// Important note 2: however, this routine will honor all parameters +/// passed to it, even if A is not symmetric or not Hermitian. +/// Moreover, this routine will always compute either the lower +/// portion or the upper portion (per user's request) of the final +/// matrix A. So, in order to obtain meaningful results, the user +/// must make sure to follow the conditions specified in the +/// "important note 1" above. +/// +/// Important note 3: if TPL is enabled, this routine will call the +/// third party library BLAS routines whenever the parameters passed +/// are consistent with the parameters expected by the corresponding +/// TPL routine. If not, then this routine will route the execution +/// to the kokkos-kernels implementation, thus honoring all +/// parameters passed, as stated in the "important note 2" above. +/// +/// Important note 4: Regarding parameter types: +/// - If A has components of real type (float or double), then: +/// - alpha must be of real type as well, +/// - components of x must be of real type as well, and +/// - components of y must be of real type as well. +/// - If A has components of complex type (complex or +/// complex), then: +/// - alpha must be of complex type as well (it may have zero +/// imaginary part, no problem), +/// - components of x may be of real type or complex type, and +/// - components of y may be of real type or complex type. +/// +/// \tparam ExecutionSpace The type of execution space +/// \tparam XViewType Input vector, as a 1-D Kokkos::View +/// \tparam YViewType Input vector, as a 1-D Kokkos::View +/// \tparam AViewType Input/Output matrix, as a 2-D Kokkos::View +/// +/// \param space [in] Execution space instance on which to run the kernel. +/// This may contain information about which stream to +/// run on. +/// \param trans [in] "T" or "t" for transpose, "H" or "h" for Hermitian. +/// Only the first character is taken into account. +/// \param uplo [in] "U" or "u" for upper portion, "L" or "l" for lower +/// portion. Only the first character is taken into +/// account. +/// \param alpha [in] Input coefficient of x * x^{T,H} +/// \param x [in] Input vector, as a 1-D Kokkos::View +/// \param y [in] Input vector, as a 1-D Kokkos::View +/// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View +template +void syr2(const ExecutionSpace& space, const char trans[], const char uplo[], + const typename AViewType::const_value_type& alpha, const XViewType& x, + const YViewType& y, const AViewType& A) { + static_assert( + Kokkos::SpaceAccessibility::accessible, + "AViewType memory space must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "XViewType memory space must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "YViewType memory space must be accessible from ExecutionSpace"); + + static_assert(Kokkos::is_view::value, + "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "YViewType must be a Kokkos::View."); + + static_assert(static_cast(AViewType::rank()) == 2, + "AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank()) == 1, + "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank()) == 1, + "YViewType must have rank 1."); + + // Check compatibility of dimensions at run time. + if ((A.extent(0) == A.extent(1)) && (A.extent(0) == x.extent(0)) && + (A.extent(0) == y.extent(0))) { + // Ok + } else { + std::ostringstream os; + os << "KokkosBlas::syr2: Dimensions of A, x: " + << "A is " << A.extent(0) << " by " << A.extent(1) << ", x has size " + << x.extent(0) << ", y has size " << y.extent(0); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if ((trans[0] == 'T') || (trans[0] == 't') || (trans[0] == 'H') || + (trans[0] == 'h')) { + // Ok + } else { + std::ostringstream os; + os << "KokkosBlas2::syr2(): invalid trans[0] = '" << trans[0] + << "'. It must be equalt to 'T' or 't' or 'H' or 'h'"; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if ((uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || + (uplo[0] == 'l')) { + // Ok + } else { + std::ostringstream oss; + oss << "KokkosBlas2::syr2(): invalid uplo[0] = " << uplo[0] + << "'. It must be equalt to 'U' or 'u' or 'L' or 'l'"; + throw std::runtime_error(oss.str()); + } + + if ((A.extent(0) == 0) || (A.extent(1) == 0)) { + return; + } + + using ALayout = typename AViewType::array_layout; + + // Minimize the number of Impl::SYR2 instantiations, by standardizing + // on particular View specializations for its template parameters. + typedef Kokkos::View::array_layout, + typename XViewType::device_type, + Kokkos::MemoryTraits > + XVT; + + typedef Kokkos::View::array_layout, + typename YViewType::device_type, + Kokkos::MemoryTraits > + YVT; + + typedef Kokkos::View > + AVT; + + Impl::SYR2::syr2(space, trans, uplo, alpha, x, + y, A); +} + +/// \brief Rank-1 update (just lower portion or just upper portion) of a +/// matrix A that is: +/// - symmetric, A += alpha * x * y^T + alpha * y * x^T, or +/// - Hermitian, A += alpha * x * y^H + conj(alpha) * y * x^H. +/// +/// Important note 1: this routine encapsulates the syr2() and her2() +/// routines specified in BLAS documentations. It has the purpose of +/// updating a symmetric (or Hermitian) matrix A in such a way that +/// it continues to be symmetric (or Hermitian). +/// +/// Important note 2: however, this routine will honor all parameters +/// passed to it, even if A is not symmetric or not Hermitian. +/// Moreover, this routine will always compute either the lower +/// portion or the upper portion (per user's request) of the final +/// matrix A. So, in order to obtain meaningful results, the user +/// must make sure to follow the conditions specified in the +/// "important note 1" above. +/// +/// Important note 3: if TPL is enabled, this routine will call the +/// third party library BLAS routines whenever the parameters passed +/// are consistent with the parameters expected by the corresponding +/// TPL routine. If not, then this routine will route the execution +/// to the kokkos-kernels implementation, thus honoring all +/// parameters passed, as stated in the "important note 2" above. +/// +/// Important note 4: Regarding parameter types: +/// - If A has components of real type (float or double), then: +/// - alpha must be of real type as well, +/// - components of x must be of real type as well, and +/// - components of y must be of real type as well. +/// - If A has components of complex type (complex or +/// complex), then: +/// - alpha must be of complex type as well (it may have zero +/// imaginary part, no problem), +/// - components of x may be of real type or complex type, and +/// - components of y may be of real type or complex type. +/// +/// \tparam XViewType Input vector, as a 1-D Kokkos::View +/// \tparam YViewType Input vector, as a 1-D Kokkos::View +/// \tparam AViewType Input/Output matrix, as a 2-D Kokkos::View +/// +/// \param trans [in] "T" or "t" for transpose, "H" or "h" for Hermitian. +/// Only the first character is taken into account. +/// \param uplo [in] "U" or "u" for upper portion, "L" or "l" for lower +/// portion. Only the first character is taken into +/// account. +/// \param alpha [in] Input coefficient of x * x^{T,H} +/// \param x [in] Input vector, as a 1-D Kokkos::View +/// \param y [in] Input vector, as a 1-D Kokkos::View +/// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View +template +void syr2(const char trans[], const char uplo[], + const typename AViewType::const_value_type& alpha, const XViewType& x, + const YViewType& y, const AViewType& A) { + const typename AViewType::execution_space space = + typename AViewType::execution_space(); + syr2( + space, trans, uplo, alpha, x, y, A); +} + +} // namespace KokkosBlas + +#endif // KOKKOSBLAS2_SYR2_HPP_ diff --git a/blas/tpls/KokkosBlas2_syr2_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_syr2_tpl_spec_avail.hpp new file mode 100644 index 0000000000..59fb154d35 --- /dev/null +++ b/blas/tpls/KokkosBlas2_syr2_tpl_spec_avail.hpp @@ -0,0 +1,205 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_HPP_ +#define KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_HPP_ + +namespace KokkosBlas { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct syr2_tpl_spec_avail { + enum : bool { value = false }; +}; + +// Generic Host side BLAS (could be MKL or whatever) +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS + +#define KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + template <> \ + struct syr2_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +#ifdef KOKKOS_ENABLE_SERIAL +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Serial, Kokkos::HostSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, + Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Serial, + Kokkos::HostSpace) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::OpenMP, Kokkos::HostSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, + Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::OpenMP, + Kokkos::HostSpace) +#endif + +#endif + +// cuBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS + +#define KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + template <> \ + struct syr2_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, + Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, + Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace) + +#endif + +// rocBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS + +#define KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + template <> \ + struct syr2_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, + Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace) + +#endif +} // namespace Impl +} // namespace KokkosBlas + +#endif // KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_HPP_ diff --git a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl.hpp new file mode 100644 index 0000000000..66ba81b685 --- /dev/null +++ b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl.hpp @@ -0,0 +1,35 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_TPL_SPEC_DECL_HPP_ +#define KOKKOSBLAS2_SYR2_TPL_SPEC_DECL_HPP_ + +// BLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS +#include +#endif + +// cuBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS +#include +#endif + +// rocBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +#include +#endif + +#endif diff --git a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_blas.hpp new file mode 100644 index 0000000000..8561675c72 --- /dev/null +++ b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_blas.hpp @@ -0,0 +1,317 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_TPL_SPEC_DECL_BLAS_HPP_ +#define KOKKOSBLAS2_SYR2_TPL_SPEC_DECL_BLAS_HPP_ + +#include "KokkosBlas_Host_tpl.hpp" + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); + +#define KOKKOSBLAS2_DSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_BLAS,double]"); \ + KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + HostBlas::syr2(uplo[0], N, alpha, X.data(), one, Y.data(), \ + one, A.data(), LDA); \ + } else { \ + /* blasDsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_SSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_BLAS,float]"); \ + KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + HostBlas::syr2(uplo[0], N, alpha, X.data(), one, Y.data(), \ + one, A.data(), LDA); \ + } else { \ + /* blasSsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_ZSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr2[TPL_BLAS,complex"); \ + KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + /* No blasZsyr2() => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } else { \ + if (A_is_ll) { \ + HostBlas>::zher2( \ + uplo[0], N, alpha, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + /* blasZher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_CSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr2[TPL_BLAS,complex"); \ + KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + /* No blasCsyr2() => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } else { \ + if (A_is_ll) { \ + HostBlas>::cher2( \ + uplo[0], N, alpha, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + /* blasCher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#ifdef KOKKOS_ENABLE_SERIAL +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) +#endif + +} // namespace Impl +} // namespace KokkosBlas + +#endif diff --git a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp new file mode 100644 index 0000000000..ca98fedf0d --- /dev/null +++ b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp @@ -0,0 +1,372 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_TPL_SPEC_DECL_CUBLAS_HPP_ +#define KOKKOSBLAS2_SYR2_TPL_SPEC_DECL_CUBLAS_HPP_ + +#include + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uploChar) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ + cublasFillMode_t fillMode = (uploChar == 'L' || uploChar == 'l') \ + ? CUBLAS_FILL_MODE_LOWER \ + : CUBLAS_FILL_MODE_UPPER; + +#define KOKKOSBLAS2_DSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_CUBLAS,double]"); \ + KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasDsyr2(s.handle, fillMode, N, &alpha, X.data(), one, \ + Y.data(), one, A.data(), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasDsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_SSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_CUBLAS,float]"); \ + KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSsyr2(s.handle, fillMode, N, &alpha, X.data(), one, \ + Y.data(), one, A.data(), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasSsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_ZSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr2[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZsyr2( \ + s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasZsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } else { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZher2( \ + s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasZher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_CSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr2[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasCsyr2(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasCsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } else { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasCher2(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasCher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) + +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) + +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) + +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) + +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, false) + +} // namespace Impl +} // namespace KokkosBlas + +#endif diff --git a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp new file mode 100644 index 0000000000..e6dfef7c6d --- /dev/null +++ b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp @@ -0,0 +1,336 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_TPL_SPEC_DECL_ROCBLAS_HPP_ +#define KOKKOSBLAS2_SYR2_TPL_SPEC_DECL_ROCBLAS_HPP_ + +#include + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uploChar) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ + rocblas_fill fillMode = (uploChar == 'L' || uploChar == 'l') \ + ? rocblas_fill_lower \ + : rocblas_fill_upper; + +#define KOKKOSBLAS2_DSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_ROCBLAS,double]"); \ + KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_dsyr2(s.handle, fillMode, N, &alpha, X.data(), one, \ + Y.data(), one, A.data(), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_dsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_SSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_ROCBLAS,float]"); \ + KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_ssyr2(s.handle, fillMode, N, &alpha, X.data(), one, \ + Y.data(), one, A.data(), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_ssyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_ZSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr2[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zsyr2( \ + s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_zsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zher2( \ + s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_zher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_CSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr2[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_csyr2( \ + s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_csyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cher2( \ + s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_cher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) + +KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) + +KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) + +KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) + +} // namespace Impl +} // namespace KokkosBlas + +#endif diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index b85f6109e8..a7be0d31ab 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -272,6 +272,35 @@ void F77_BLAS_MANGLE(zher, ZHER)(const char*, int*, const double*, const std::complex*, int*, std::complex*, int*); +/// +/// Syr2 +/// +void F77_BLAS_MANGLE(ssyr2, SSYR2)(const char*, int*, const float*, + const float*, const int*, const float*, int*, + float*, int*); +void F77_BLAS_MANGLE(dsyr2, DSYR2)(const char*, int*, const double*, + const double*, const int*, const double*, + int*, double*, int*); +// Although there is a cgeru, there is no csyr2u +// Although there is a zgeru, there is no zsyr2u +// Although there is a cgerc, there is no csyr2c, but there is cher2 (see below) +// Although there is a zgerc, there is no zsyr2c, but there is zher2 (see below) + +/// +/// Her2 +/// + +void F77_BLAS_MANGLE(cher2, CHER2)(const char*, int*, + const std::complex*, + const std::complex*, int*, + const std::complex*, int*, + std::complex*, int*); +void F77_BLAS_MANGLE(zher2, ZHER2)(const char*, int*, + const std::complex*, + const std::complex*, int*, + const std::complex*, int*, + std::complex*, int*); + /// /// Trsv /// @@ -499,6 +528,12 @@ void F77_BLAS_MANGLE(zscal, #define F77_FUNC_CHER F77_BLAS_MANGLE(cher, CHER) #define F77_FUNC_ZHER F77_BLAS_MANGLE(zher, ZHER) +#define F77_FUNC_SSYR2 F77_BLAS_MANGLE(ssyr2, SSYR2) +#define F77_FUNC_DSYR2 F77_BLAS_MANGLE(dsyr2, DSYR2) + +#define F77_FUNC_CHER2 F77_BLAS_MANGLE(cher2, CHER2) +#define F77_FUNC_ZHER2 F77_BLAS_MANGLE(zher2, ZHER2) + #define F77_FUNC_STRSV F77_BLAS_MANGLE(strsv, STRSV) #define F77_FUNC_DTRSV F77_BLAS_MANGLE(dtrsv, DTRSV) #define F77_FUNC_CTRSV F77_BLAS_MANGLE(ctrsv, CTRSV) @@ -611,6 +646,12 @@ void HostBlas::syr(const char uplo, int n, const float alpha, F77_FUNC_SSYR(&uplo, &n, &alpha, x, &incx, a, &lda); } template <> +void HostBlas::syr2(const char uplo, int n, const float alpha, + const float* x, int incx, const float* y, int incy, + float* a, int lda) { + F77_FUNC_SSYR2(&uplo, &n, &alpha, x, &incx, y, &incy, a, &lda); +} +template <> void HostBlas::trsv(const char uplo, const char transa, const char diag, int m, const float* a, int lda, /* */ float* b, int ldb) { @@ -735,6 +776,12 @@ void HostBlas::syr(const char uplo, int n, const double alpha, F77_FUNC_DSYR(&uplo, &n, &alpha, x, &incx, a, &lda); } template <> +void HostBlas::syr2(const char uplo, int n, const double alpha, + const double* x, int incx, const double* y, + int incy, double* a, int lda) { + F77_FUNC_DSYR2(&uplo, &n, &alpha, x, &incx, y, &incy, a, &lda); +} +template <> void HostBlas::trsv(const char uplo, const char transa, const char diag, int m, const double* a, int lda, /* */ double* b, int ldb) { @@ -889,6 +936,15 @@ void HostBlas >::cher( (std::complex*)a, &lda); } template <> +void HostBlas >::cher2( + const char uplo, int n, const std::complex alpha, + const std::complex* x, int incx, const std::complex* y, + int incy, std::complex* a, int lda) { + F77_FUNC_CHER2(&uplo, &n, &alpha, (const std::complex*)x, &incx, + (const std::complex*)y, &incy, (std::complex*)a, + &lda); +} +template <> void HostBlas >::trsv(const char uplo, const char transa, const char diag, int m, const std::complex* a, int lda, @@ -1067,6 +1123,15 @@ void HostBlas >::zher( (std::complex*)a, &lda); } template <> +void HostBlas >::zher2( + const char uplo, int n, const std::complex alpha, + const std::complex* x, int incx, const std::complex* y, + int incy, std::complex* a, int lda) { + F77_FUNC_ZHER2(&uplo, &n, &alpha, (const std::complex*)x, &incx, + (const std::complex*)y, &incy, + (std::complex*)a, &lda); +} +template <> void HostBlas >::trsv(const char uplo, const char transa, const char diag, int m, const std::complex* a, diff --git a/blas/tpls/KokkosBlas_Host_tpl.hpp b/blas/tpls/KokkosBlas_Host_tpl.hpp index 6f6c34dc25..3b0c7f366e 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.hpp +++ b/blas/tpls/KokkosBlas_Host_tpl.hpp @@ -76,6 +76,9 @@ struct HostBlas { static void syr(const char uplo, int n, const T alpha, const T *x, int incx, T *a, int lda); + static void syr2(const char uplo, int n, const T alpha, const T *x, int incx, + const T *y, int incy, T *a, int lda); + template static void cher(const char uplo, int n, const tAlpha alpha, const T *x, int incx, T *a, int lda); @@ -84,6 +87,12 @@ struct HostBlas { static void zher(const char uplo, int n, const tAlpha alpha, const T *x, int incx, T *a, int lda); + static void cher2(const char uplo, int n, const T alpha, const T *x, int incx, + const T *y, int incy, T *a, int lda); + + static void zher2(const char uplo, int n, const T alpha, const T *x, int incx, + const T *y, int incy, T *a, int lda); + static void trsv(const char uplo, const char transa, const char diag, int m, const T *a, int lda, /* */ T *b, int ldb); diff --git a/blas/unit_test/Test_Blas.hpp b/blas/unit_test/Test_Blas.hpp index 1f4f130e8b..1abd288b0f 100644 --- a/blas/unit_test/Test_Blas.hpp +++ b/blas/unit_test/Test_Blas.hpp @@ -63,6 +63,7 @@ #include "Test_Blas2_gemv.hpp" #include "Test_Blas2_ger.hpp" #include "Test_Blas2_syr.hpp" +#include "Test_Blas2_syr2.hpp" // Serial Blas 2 #include "Test_Blas2_serial_gemv.hpp" diff --git a/blas/unit_test/Test_Blas2_syr2.hpp b/blas/unit_test/Test_Blas2_syr2.hpp new file mode 100644 index 0000000000..080c106b9f --- /dev/null +++ b/blas/unit_test/Test_Blas2_syr2.hpp @@ -0,0 +1,1963 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +// ********************************************************************** +// The tests executed by the code below cover many combinations for +// the operations: +// --> A += alpha * x * y^T + alpha * y * x^T, or +// --> A += alpha * x * y^H + conj(alpha) * y * x^H +// 01) Type of 'x' components: float, double, complex, ... +// 02) Type of 'y' components: float, double, complex, ... +// 03) Type of 'A' components: float, double, complex, ... +// 04) Execution space: serial, threads, OpenMP, Cuda, ... +// 05) Layout of 'x' +// 06) Layout of 'y' +// 07) Layout of 'A' +// 08) Dimension of 'A' +// 09) Options 'const' or 'non const' for x view, when calling syr2() +// 10) Options 'const' or 'non const' for y view, when calling syr2() +// 11) Usage of analytical results in the tests +// 12) Options 'T' or 'H' when calling syr2() +// 13) Options 'U' or 'L' when calling syr2() +// +// Choices (01)-(05) are selected in the routines TEST_F() at the +// very bottom of the file, when calling test_syr2<...>(). +// +// Choices (06)-(13) are selected in routine test_syr2<...>(), +// when calling the method test() of class Test::Syr2Tester<...>. +// +// The class Test::Syr2Tester<...> represents the "core" of the test +// logic, where all calculations, comparisons, and success/failure +// decisions are performed. +// +// A high level explanation of method Test::SyrTester<...>::test() +// is given by the 7 steps named "Step 1 of 7" to "Step 7 of 7" +// in the code below. +// ********************************************************************** + +#include +#include +#include +#include +#include + +namespace Test { + +template +class Syr2Tester { + public: + Syr2Tester(); + + ~Syr2Tester(); + + void test(const int N, const int nonConstConstCombinations, + const bool useAnalyticalResults = false, + const bool useHermitianOption = false, + const bool useUpOption = false); + + private: + typedef Kokkos::View _ViewTypeX; + typedef Kokkos::View _ViewTypeY; + typedef Kokkos::View _ViewTypeA; + + typedef typename _ViewTypeX::HostMirror _HostViewTypeX; + typedef typename _ViewTypeY::HostMirror _HostViewTypeY; + typedef typename _ViewTypeA::HostMirror _HostViewTypeA; + typedef Kokkos::View + _ViewTypeExpected; + + typedef Kokkos::ArithTraits _KAT_A; + typedef typename _KAT_A::mag_type _AuxType; + + void populateVariables(ScalarA& alpha, _HostViewTypeX& h_x, + _HostViewTypeY& h_y, _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected, _ViewTypeX& x, + _ViewTypeY& y, _ViewTypeA& A, + bool& expectedResultIsKnown); + + template + typename std::enable_if>::value || + std::is_same>::value, + void>::type + populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, + _HostViewTypeA& h_A, _ViewTypeExpected& h_expected); + + template + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, + _HostViewTypeA& h_A, _ViewTypeExpected& h_expected); + + template + typename std::enable_if>::value || + std::is_same>::value, + void>::type + populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, + const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla); + + template + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, + const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla); + + template + typename std::enable_if>::value || + std::is_same>::value, + void>::type + compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected); + + template + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected); + + template + typename std::enable_if>::value || + std::is_same>::value, + void>::type + compareKkSyr2AgainstReference(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_reference); + + template + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + compareKkSyr2AgainstReference(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_reference); + + template + T shrinkAngleToZeroTwoPiRange(const T input); + + template + void callKkSyr2AndCompareAgainstExpected(const ScalarA& alpha, TX& x, TY& y, + _ViewTypeA& A, + const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_expected, + const std::string& situation); + + template + void callKkGerAndCompareKkSyr2AgainstIt( + const ScalarA& alpha, TX& x, TY& y, + view_stride_adapter<_ViewTypeA, false>& org_A, + const _ViewTypeExpected& h_A_syr2, const std::string& situation); + + const bool _A_is_complex; + const bool _A_is_lr; + const bool _A_is_ll; + const bool _testIsGpu; + const bool _vanillaUsesDifferentOrderOfOps; + const _AuxType _absTol; + const _AuxType _relTol; + int _M; + int _N; + bool _useAnalyticalResults; + bool _useHermitianOption; + bool _useUpOption; + bool _kkSyr2ShouldThrowException; + bool _kkGerShouldThrowException; +}; + +template +Syr2Tester::Syr2Tester() + : _A_is_complex(std::is_same>::value || + std::is_same>::value), + _A_is_lr(std::is_same::value), + _A_is_ll(std::is_same::value), + _testIsGpu(KokkosKernels::Impl::kk_is_gpu_exec_space< + typename Device::execution_space>()) +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS + , + _vanillaUsesDifferentOrderOfOps(_A_is_lr) +#else + , + _vanillaUsesDifferentOrderOfOps(false) +#endif + , + // **************************************************************** + // Tolerances for double can be tighter than tolerances for float. + // + // In the case of calculations with float, a small amount of + // discrepancies between reference results and CUDA results are + // large enough to require 'relTol' to value 5.0e-3. The same + // calculations show no discrepancies for calculations with double. + // **************************************************************** + _absTol(std::is_same<_AuxType, float>::value ? 1.0e-6 : 1.0e-9), + _relTol(std::is_same<_AuxType, float>::value ? 5.0e-3 : 1.0e-6), + _M(-1), + _N(-1), + _useAnalyticalResults(false), + _useHermitianOption(false), + _useUpOption(false), + _kkSyr2ShouldThrowException(false), + _kkGerShouldThrowException(false) { +} + +template +Syr2Tester::~Syr2Tester() { + // Nothing to do +} + +template +void Syr2Tester::test(const int N, const int nonConstConstCombinations, + const bool useAnalyticalResults, + const bool useHermitianOption, + const bool useUpOption) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Entering Syr2Tester::test()... - - - - - - - - - - - - - - - - " + "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " + "- - - - - - - - - " + << std::endl; + + std::cout << "_A_is_complex = " << _A_is_complex + << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", _testIsGpu = " << _testIsGpu + << ", _vanillaUsesDifferentOrderOfOps = " + << _vanillaUsesDifferentOrderOfOps << ", _absTol = " << _absTol + << ", _relTol = " << _relTol + << ", nonConstConstCombinations = " << nonConstConstCombinations + << ", useAnalyticalResults = " << useAnalyticalResults + << ", useHermitianOption = " << useHermitianOption + << ", useUpOption = " << useUpOption << std::endl; +#endif + // ******************************************************************** + // Step 1 of 7: declare main types and variables + // ******************************************************************** + _M = N; + _N = N; + _useAnalyticalResults = useAnalyticalResults; + _useHermitianOption = useHermitianOption; + _useUpOption = useUpOption; + +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS + _kkSyr2ShouldThrowException = false; + + _kkGerShouldThrowException = false; + if (_A_is_complex && _useHermitianOption) { + _kkGerShouldThrowException = !_A_is_ll; + } +#endif + + bool test_x(false); + bool test_cx(false); + if (nonConstConstCombinations == 0) { + test_x = true; + } else if (nonConstConstCombinations == 1) { + test_cx = true; + } else { + test_x = true; + test_cx = true; + } + + view_stride_adapter<_ViewTypeX, false> x("X", _M); + view_stride_adapter<_ViewTypeY, false> y("Y", _N); + view_stride_adapter<_ViewTypeA, false> A("A", _M, _N); + + view_stride_adapter<_ViewTypeExpected, true> h_expected( + "expected A += alpha * x * x^{t,h}", _M, _N); + bool expectedResultIsKnown = false; + + using AlphaCoeffType = typename _ViewTypeA::non_const_value_type; + ScalarA alpha(Kokkos::ArithTraits::zero()); + + // ******************************************************************** + // Step 2 of 7: populate alpha, h_x, h_A, h_expected, x, A + // ******************************************************************** + this->populateVariables(alpha, x.h_view, y.h_view, A.h_view, + h_expected.d_view, x.d_view, y.d_view, A.d_view, + expectedResultIsKnown); + + // ******************************************************************** + // Step 3 of 7: populate h_vanilla + // ******************************************************************** + view_stride_adapter<_ViewTypeExpected, true> h_vanilla( + "vanilla = A + alpha * x * x^{t,h}", _M, _N); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr2.hpp, computing vanilla A with alpha type = " + << typeid(alpha).name() << std::endl; +#endif + this->populateVanillaValues(alpha, x.h_view, y.h_view, A.h_view, + h_vanilla.d_view); + + // ******************************************************************** + // Step 4 of 7: use h_vanilla and h_expected as appropriate + // ******************************************************************** + if (expectedResultIsKnown) { + // ****************************************************************** + // Compare h_vanilla against h_expected + // ****************************************************************** + this->compareVanillaAgainstExpected(alpha, h_vanilla.d_view, + h_expected.d_view); + } else { + // ****************************************************************** + // Copy h_vanilla to h_expected + // ****************************************************************** + Kokkos::deep_copy(h_expected.d_base, h_vanilla.d_base); + } + + // ******************************************************************** + // Step 5 of 7: test with 'non const x' + // ******************************************************************** + view_stride_adapter<_ViewTypeA, false> org_A("Org_A", _M, _N); + Kokkos::deep_copy(org_A.d_base, A.d_base); + Kokkos::deep_copy(org_A.h_view, A.h_view); + + if (test_x) { + this->callKkSyr2AndCompareAgainstExpected(alpha, x.d_view, y.d_view, + A.d_view, A.h_view, + h_expected.d_view, "non const x"); + + if ((_useAnalyticalResults == false) && // Just to save run time + (_kkGerShouldThrowException == false)) { + this->callKkGerAndCompareKkSyr2AgainstIt(alpha, x.d_view, y.d_view, org_A, + A.h_view, "non const x"); + } + } + + // ******************************************************************** + // Step 6 of 7: test with const x + // ******************************************************************** + if (test_cx) { + Kokkos::deep_copy(A.d_base, org_A.d_base); + + this->callKkSyr2AndCompareAgainstExpected( + alpha, x.d_view_const, y.d_view_const, A.d_view, A.h_view, + h_expected.d_view, "const x"); + } + + // ******************************************************************** + // Step 7 of 7: tests with invalid values on the first input parameter + // ******************************************************************** + EXPECT_ANY_THROW( + KokkosBlas::syr2(".", "U", alpha, x.d_view, y.d_view, A.d_view)) + << "Failed test: kk syr2 should have thrown an exception for mode '.'"; + EXPECT_ANY_THROW( + KokkosBlas::syr2("", "U", alpha, x.d_view, y.d_view, A.d_view)) + << "Failed test: kk syr2 should have thrown an exception for mode ''"; + EXPECT_ANY_THROW( + KokkosBlas::syr2("T", ".", alpha, x.d_view, y.d_view, A.d_view)) + << "Failed test: kk syr2 should have thrown an exception for uplo '.'"; + EXPECT_ANY_THROW( + KokkosBlas::syr2("T", "", alpha, x.d_view, y.d_view, A.d_view)) + << "Failed test: kk syr2 should have thrown an exception for uplo ''"; + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Leaving Syr2Tester::test() - - - - - - - - - - - - - - - - - - " + "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " + "- - - - - - - " + << std::endl; +#endif +} + +template +void Syr2Tester::populateVariables(ScalarA& alpha, _HostViewTypeX& h_x, + _HostViewTypeY& h_y, + _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected, + _ViewTypeX& x, _ViewTypeY& y, + _ViewTypeA& A, + bool& expectedResultIsKnown) { + expectedResultIsKnown = false; + + if (_useAnalyticalResults) { + this->populateAnalyticalValues(alpha, h_x, h_y, h_A, h_expected); + Kokkos::deep_copy(x, h_x); + Kokkos::deep_copy(y, h_y); + Kokkos::deep_copy(A, h_A); + + expectedResultIsKnown = true; + } else if (_N == 1) { + alpha = 3; + + h_x[0] = 2; + + h_y[0] = 4; + + h_A(0, 0) = 7; + + Kokkos::deep_copy(x, h_x); + Kokkos::deep_copy(y, h_y); + Kokkos::deep_copy(A, h_A); + + h_expected(0, 0) = 55; + expectedResultIsKnown = true; + } else if (_N == 2) { + alpha = 3; + + h_x[0] = -2; + h_x[1] = 9; + + h_y[0] = 5; + h_y[1] = -4; + + h_A(0, 0) = 17; + h_A(0, 1) = -43; + h_A(1, 0) = -43; + h_A(1, 1) = 101; + + Kokkos::deep_copy(x, h_x); + Kokkos::deep_copy(y, h_y); + Kokkos::deep_copy(A, h_A); + + if (_useUpOption) { + h_expected(0, 0) = -43; + h_expected(0, 1) = 116; + h_expected(1, 0) = -43; + h_expected(1, 1) = -115; + } else { + h_expected(0, 0) = -43; + h_expected(0, 1) = -43; + h_expected(1, 0) = 116; + h_expected(1, 1) = -115; + } + expectedResultIsKnown = true; + } else { + alpha = 3; + + Kokkos::Random_XorShift64_Pool rand_pool( + 13718); + + { + ScalarX randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(x, rand_pool, randStart, randEnd); + } + + { + ScalarY randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(y, rand_pool, randStart, randEnd); + } + + { + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(A, rand_pool, randStart, randEnd); + } + + Kokkos::deep_copy(h_x, x); + Kokkos::deep_copy(h_y, y); + Kokkos::deep_copy(h_A, A); + + if (_useHermitianOption && _A_is_complex) { + // **************************************************************** + // Make h_A Hermitian + // **************************************************************** + for (int i(0); i < _N; ++i) { + for (int j(i + 1); j < _N; ++j) { + h_A(i, j) = _KAT_A::conj(h_A(j, i)); + } + } + + for (int i(0); i < _N; ++i) { + h_A(i, i) = 0.5 * (h_A(i, i) + _KAT_A::conj(h_A(i, i))); + } + } else { + // **************************************************************** + // Make h_A symmetric + // **************************************************************** + for (int i(0); i < _N; ++i) { + for (int j(i + 1); j < _N; ++j) { + h_A(i, j) = h_A(j, i); + } + } + } + Kokkos::deep_copy(A, h_A); + } + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (_N <= 2) { + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + std::cout << "h_origA(" << i << "," << j << ")=" << h_A(i, j) + << std::endl; + } + } + } +#endif +} + +// Code for complex values +template +template +typename std::enable_if>::value || + std::is_same>::value, + void>::type +Syr2Tester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, + _HostViewTypeY& h_y, + _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected) { + alpha.real() = 1.4; + alpha.imag() = -2.3; + + for (int i = 0; i < _M; ++i) { + _AuxType auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); + h_x[i].real() = sin(auxI); + h_x[i].imag() = sin(auxI); + } + + for (int i = 0; i < _M; ++i) { + _AuxType auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); + h_y[i].real() = cos(auxI); + h_y[i].imag() = cos(auxI); + } + + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + _AuxType auxIpJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + _AuxType auxImJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_A(i, j).real() = sin(auxIpJ); + h_A(i, j).imag() = -sin(auxImJ); + } else { + h_A(i, j).real() = sin(auxIpJ); + h_A(i, j).imag() = sin(auxImJ); + } + } + } + } else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + _AuxType auxIpJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + h_A(i, j).real() = sin(auxIpJ); + h_A(i, j).imag() = sin(auxIpJ); + } + } + } + + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + _AuxType auxIpJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + _AuxType auxImJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + h_expected(i, j).real() = 3.8 * sin(auxIpJ); + h_expected(i, j).imag() = -5.6 * sin(auxImJ); + } else { + h_expected(i, j).real() = h_A(i, j).real(); + h_expected(i, j).imag() = h_A(i, j).imag(); + } + } + } + } else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + _AuxType auxIpJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + h_expected(i, j).real() = 5.6 * sin(auxIpJ); + h_expected(i, j).imag() = 3.8 * sin(auxIpJ); + } else { + h_expected(i, j).real() = h_A(i, j).real(); + h_expected(i, j).imag() = h_A(i, j).imag(); + } + } + } + } +} + +// Code for non-complex values +template +template +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +Syr2Tester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, + _HostViewTypeY& h_y, + _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected) { + alpha = 1.1; + + for (int i = 0; i < _M; ++i) { + _AuxType auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); + h_x[i] = sin(auxI); + } + + for (int i = 0; i < _M; ++i) { + _AuxType auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); + h_y[i] = cos(auxI); + } + + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + _AuxType auxIpJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + h_A(i, j) = .1 * sin(auxIpJ); + } + } + + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + _AuxType auxIpJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + h_expected(i, j) = 1.2 * sin(auxIpJ); + } else { + h_expected(i, j) = h_A(i, j); + } + } + } +} + +// Code for complex values +template +template +typename std::enable_if>::value || + std::is_same>::value, + void>::type +Syr2Tester::populateVanillaValues(const T& alpha, + const _HostViewTypeX& h_x, + const _HostViewTypeY& h_y, + const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla) { + if (_vanillaUsesDifferentOrderOfOps) { + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = + h_A(i, j) + alpha * _KAT_A::conj(h_y(j)) * h_x(i) + + _KAT_A::conj(alpha) * _KAT_A::conj(h_x(j)) * h_y(i); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + for (int i = 0; i < _N; ++i) { + h_vanilla(i, i).imag() = 0.; + } + } else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = + h_A(i, j) + alpha * h_x(j) * h_y(i) + alpha * h_y(j) * h_x(i); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + } + } else { + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = + h_A(i, j) + alpha * h_x(i) * _KAT_A::conj(h_y(j)) + + _KAT_A::conj(alpha) * h_y(i) * _KAT_A::conj(h_x(j)); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + for (int i = 0; i < _N; ++i) { + h_vanilla(i, i).imag() = 0.; + } + } else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = + h_A(i, j) + alpha * h_x(i) * h_y(j) + alpha * h_y(i) * h_x(j); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + } + } +} + +// Code for non-complex values +template +template +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +Syr2Tester::populateVanillaValues(const T& alpha, + const _HostViewTypeX& h_x, + const _HostViewTypeY& h_y, + const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla) { + if (_useHermitianOption) { + if (_vanillaUsesDifferentOrderOfOps) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = + h_A(i, j) + alpha * h_x(j) * _KAT_A::conj(h_y(i)) + + _KAT_A::conj(alpha) * h_y(j) * _KAT_A::conj(h_x(i)); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + } else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = + h_A(i, j) + alpha * h_x(i) * _KAT_A::conj(h_y(j)) + + _KAT_A::conj(alpha) * h_y(i) * _KAT_A::conj(h_x(j)); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + } + } else { + if (_vanillaUsesDifferentOrderOfOps) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = + h_A(i, j) + alpha * h_x(j) * h_y(i) + alpha * h_y(j) * h_x(i); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + } else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = + h_A(i, j) + alpha * h_x(i) * h_y(j) + alpha * h_y(i) * h_x(j); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + } + } +} + +template +template +T Syr2Tester::shrinkAngleToZeroTwoPiRange(const T input) { + T output(input); +#if 0 + T twoPi( 2. * Kokkos::numbers::pi ); + if (input > 0.) { + output -= std::floor( input / twoPi ) * twoPi; + } + else if (input < 0.) { + output += std::floor( -input / twoPi ) * twoPi; + } +#endif + return output; +} + +// Code for complex values +template +template +typename std::enable_if>::value || + std::is_same>::value, + void>::type +Syr2Tester:: + compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (_N <= 2) { + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i, j) + << ", h_van(" << i << "," << j << ")=" << h_vanilla(i, j) + << std::endl; + } + } + } +#endif + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); + + if (_useAnalyticalResults) { + int numErrorsRealAbs(0); + int numErrorsRealRel(0); + int numErrorsImagAbs(0); + int numErrorsImagRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRealRel(0.); + int iForMaxErrorRealRel(0); + int jForMaxErrorRealRel(0); + _AuxType maxErrorImagRel(0.); + int iForMaxErrorImagRel(0); + int jForMaxErrorImagRel(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_expected(i, j).real() - h_vanilla(i, j).real()); + errorHappened = false; + if (h_expected(i, j).real() == 0.) { + diffThreshold = _KAT_A::abs(_absTol); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRealAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j).real()); + if (maxErrorRealRel < aux) { + maxErrorRealRel = aux; + iForMaxErrorRealRel = i; + jForMaxErrorRealRel = j; + } + + diffThreshold = _KAT_A::abs(_relTol * h_expected(i, j).real()); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRealRel++; + } + } + if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).real() = " << h_expected(i, j).real() + << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() + << ", _KAT_A::abs(h_expected(i,j).real() - " + "h_vanilla(i,j).real()) = " + << diff << ", diffThreshold = " << diffThreshold + << std::endl; +#endif + } + diff = _KAT_A::abs(h_expected(i, j).imag() - h_vanilla(i, j).imag()); + errorHappened = false; + if (h_expected(i, j).imag() == 0.) { + diffThreshold = _KAT_A::abs(_absTol); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsImagAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j).imag()); + if (maxErrorImagRel < aux) { + maxErrorImagRel = aux; + iForMaxErrorImagRel = i; + jForMaxErrorImagRel = j; + } + + diffThreshold = _KAT_A::abs(_relTol * h_expected(i, j).imag()); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsImagRel++; + } + } + if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() + << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() + << ", _KAT_A::abs(h_expected(i,j).imag() - " + "h_vanilla(i,j).imag()) = " + << diff << ", diffThreshold = " << diffThreshold + << std::endl; +#endif + } + } // for j + } // for i + + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla differs too much from analytical on real components" + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_expected(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", h_vanilla(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_vanilla(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrorsReal > 0) { + std::cout << "WARNING" << msg.str() << std::endl; + } +#endif + EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) + << "Failed test" << msg.str(); + } + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla differs too much from analytical on imag components" + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_expected(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", h_vanilla(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_vanilla(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrorsImag > 0) { + std::cout << "WARNING" << msg.str() << std::endl; + } +#endif + EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) + << "Failed test" << msg.str(); + } + } else { + int numErrorsReal(0); + int numErrorsImag(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + if (h_expected(i, j).real() != h_vanilla(i, j).real()) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrorsReal == 0) { + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).real() = " + << h_expected(i, j).real() + << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() + << std::endl; + } +#endif + numErrorsReal++; + } + + if (h_expected(i, j).imag() != h_vanilla(i, j).imag()) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrorsImag == 0) { + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).imag() = " + << h_expected(i, j).imag() + << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() + << std::endl; + } +#endif + numErrorsImag++; + } + } // for j + } // for i + EXPECT_EQ(numErrorsReal, 0) + << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect on real components" + << ", numErrorsReal = " << numErrorsReal; + EXPECT_EQ(numErrorsImag, 0) + << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect on imag components" + << ", numErrorsImag = " << numErrorsImag; + } +} + +// Code for non-complex values +template +template +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +Syr2Tester:: + compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (_N <= 2) { + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i, j) + << ", h_van(" << i << "," << j << ")=" << h_vanilla(i, j) + << std::endl; + } + } + } +#endif + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); + + if (_useAnalyticalResults) { + int numErrorsAbs(0); + int numErrorsRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRel(0.); + int iForMaxErrorRel(0); + int jForMaxErrorRel(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_expected(i, j) - h_vanilla(i, j)); + errorHappened = false; + if (h_expected(i, j) == 0.) { + diffThreshold = _KAT_A::abs(_absTol); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j)); + if (maxErrorRel < aux) { + maxErrorRel = aux; + iForMaxErrorRel = i; + jForMaxErrorRel = j; + } + + diffThreshold = _KAT_A::abs(_relTol * h_expected(i, j)); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRel++; + } + } + if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j) = " << h_expected(i, j) + << ", h_vanilla(i,j) = " << h_vanilla(i, j) + << ", _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)) = " + << diff << ", diffThreshold = " << diffThreshold + << std::endl; +#endif + } + } // for j + } // for i + + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla differs too much from expected" + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_expected(i,j) = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", h_vanilla(i,j) = " + << (((_M > 0) && (_N > 0)) + ? h_vanilla(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrors(numErrorsAbs + numErrorsRel); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrors > 0) { + std::cout << "WARNING" << msg.str() << std::endl; + } +#endif + EXPECT_LE(numErrors, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } + } else { + int numErrors(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + if (h_expected(i, j) != h_vanilla(i, j)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrors == 0) { + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j) = " << h_expected(i, j) + << ", h_vanilla(i,j) = " << h_vanilla(i, j) << std::endl; + } +#endif + numErrors++; + } + } // for j + } // for i + EXPECT_EQ(numErrors, 0) + << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect" + << ", numErrors = " << numErrors; + } +} + +// Code for complex values +template +template +typename std::enable_if>::value || + std::is_same>::value, + void>::type +Syr2Tester:: + compareKkSyr2AgainstReference(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_reference) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (_N <= 2) { + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + std::cout << "h_exp(" << i << "," << j << ")=" << h_reference(i, j) + << ", h_A(" << i << "," << j << ")=" << h_A(i, j) + << std::endl; + } + } + } +#endif + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); + + int numErrorsRealAbs(0); + int numErrorsRealRel(0); + int numErrorsImagAbs(0); + int numErrorsImagRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRealRel(0.); + int iForMaxErrorRealRel(0); + int jForMaxErrorRealRel(0); + _AuxType maxErrorImagRel(0.); + int iForMaxErrorImagRel(0); + int jForMaxErrorImagRel(0); + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_reference(i, j).real() - h_A(i, j).real()); + errorHappened = false; + if (h_reference(i, j).real() == 0.) { + diffThreshold = _KAT_A::abs(_absTol); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRealAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_reference(i, j).real()); + if (maxErrorRealRel < aux) { + maxErrorRealRel = aux; + iForMaxErrorRealRel = i; + jForMaxErrorRealRel = j; + } + + diffThreshold = _KAT_A::abs(_relTol * h_reference(i, j).real()); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRealRel++; + } + } + if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout + << "ERROR, i = " << i << ", j = " << j + << ": h_reference(i,j).real() = " << h_reference(i, j).real() + << ", h_A(i,j).real() = " << h_A(i, j).real() + << ", _KAT_A::abs(h_reference(i,j).real() - h_A(i,j).real()) = " + << diff << ", diffThreshold = " << diffThreshold << std::endl; +#endif + } + diff = _KAT_A::abs(h_reference(i, j).imag() - h_A(i, j).imag()); + errorHappened = false; + if (h_reference(i, j).imag() == 0.) { + diffThreshold = _KAT_A::abs(_absTol); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsImagAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_reference(i, j).imag()); + if (maxErrorImagRel < aux) { + maxErrorImagRel = aux; + iForMaxErrorImagRel = i; + jForMaxErrorImagRel = j; + } + + diffThreshold = _KAT_A::abs(_relTol * h_reference(i, j).imag()); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsImagRel++; + } + } + if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout + << "ERROR, i = " << i << ", j = " << j + << ": h_reference(i,j).imag() = " << h_reference(i, j).imag() + << ", h_A(i,j).imag() = " << h_A(i, j).imag() + << ", _KAT_A::abs(h_reference(i,j).imag() - h_A(i,j).imag()) = " + << diff << ", diffThreshold = " << diffThreshold << std::endl; +#endif + } + } // for j + } // for i + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout + << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_reference(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", h_A(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_reference(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", h_A(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; + if ((_M == 2131) && (_N == 2131)) { + std::cout << "Information" + << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ", h_reference(11, 2119) = (" << h_reference(11, 2119).real() + << ", " << h_reference(11, 2119).imag() << ")" + << ", h_A(11, 2119) = (" << h_A(11, 2119).real() << ", " + << h_A(11, 2119).imag() << ")" << std::endl; + std::cout << "Information" + << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ", h_reference(710, 1065) = (" << h_reference(710, 1065).real() + << ", " << h_reference(710, 1065).imag() << ")" + << ", h_A(710, 1065) = (" << h_A(710, 1065).real() << ", " + << h_A(710, 1065).imag() << ")" << std::endl; + } +#endif + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": syr2 result is incorrect on real components" + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_reference(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", h_A(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrorsReal > 0) { + std::cout << "WARNING" << msg.str() << std::endl; + } +#endif + EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": syr2 result is incorrect on imag components" + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_reference(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", h_A(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrorsImag > 0) { + std::cout << "WARNING" << msg.str() << std::endl; + } +#endif + EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } +} + +// Code for non-complex values +template +template +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +Syr2Tester:: + compareKkSyr2AgainstReference(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_reference) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (_N <= 2) { + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + std::cout << "h_exp(" << i << "," << j << ")=" << h_reference(i, j) + << ", h_A(" << i << "," << j << ")=" << h_A(i, j) + << std::endl; + } + } + } +#endif + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); + + int numErrorsAbs(0); + int numErrorsRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRel(0.); + int iForMaxErrorRel(0); + int jForMaxErrorRel(0); + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_reference(i, j) - h_A(i, j)); + errorHappened = false; + if (h_reference(i, j) == 0.) { + diffThreshold = _KAT_A::abs(_absTol); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_reference(i, j)); + if (maxErrorRel < aux) { + maxErrorRel = aux; + iForMaxErrorRel = i; + jForMaxErrorRel = j; + } + + diffThreshold = _KAT_A::abs(_relTol * h_reference(i, j)); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRel++; + } + } + if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_reference(i,j) = " << h_reference(i, j) + << ", h_A(i,j) = " << h_A(i, j) + << ", _KAT_A::abs(h_reference(i,j) - h_A(i,j)) = " << diff + << ", diffThreshold = " << diffThreshold << std::endl; +#endif + } + } // for j + } // for i +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel + << ", h_reference(i,j) = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", h_A(i,j) = " + << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; +#endif + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": syr2 result is incorrect" + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_reference(i,j) = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", h_A(i,j) = " + << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrors(numErrorsAbs + numErrorsRel); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrors > 0) { + std::cout << "WARNING" << msg.str() << std::endl; + } +#endif + EXPECT_LE(numErrors, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } +} + +template +template +void Syr2Tester:: + callKkSyr2AndCompareAgainstExpected(const ScalarA& alpha, TX& x, TY& y, + _ViewTypeA& A, + const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_expected, + const std::string& situation) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr2, '" << situation << "', alpha = " << alpha + << std::endl; + std::cout << "In Test_Blas2_syr2.hpp, right before calling KokkosBlas::syr2()" + << ": ViewTypeA = " << typeid(_ViewTypeA).name() + << ", _kkSyr2ShouldThrowException = " << _kkSyr2ShouldThrowException + << std::endl; +#endif + std::string mode = _useHermitianOption ? "H" : "T"; + std::string uplo = _useUpOption ? "U" : "L"; + bool gotStdException(false); + bool gotUnknownException(false); + try { + KokkosBlas::syr2(mode.c_str(), uplo.c_str(), alpha, x, y, A); + Kokkos::fence(); + } catch (const std::exception& e) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr2, '" << situation + << "': caught exception, e.what() = " << e.what() << std::endl; +#endif + gotStdException = true; + } catch (...) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr2, '" << situation + << "': caught unknown exception" << std::endl; +#endif + gotUnknownException = true; + } + + EXPECT_EQ(gotUnknownException, false) + << "Failed test, '" << situation + << "': unknown exception should not have happened"; + + EXPECT_EQ(gotStdException, _kkSyr2ShouldThrowException) + << "Failed test, '" << situation << "': kk syr2() should" + << (_kkSyr2ShouldThrowException ? " " : " not ") + << "have thrown a std::exception"; + + if ((gotStdException == false) && (gotUnknownException == false)) { + Kokkos::deep_copy(h_A, A); + this->compareKkSyr2AgainstReference(alpha, h_A, h_expected); + } +} + +template +template +void Syr2Tester:: + callKkGerAndCompareKkSyr2AgainstIt( + const ScalarA& alpha, TX& x, TY& y, + view_stride_adapter<_ViewTypeA, false>& org_A, + const _ViewTypeExpected& h_A_syr2, const std::string& situation) { + view_stride_adapter<_ViewTypeA, false> A_ger("A_ger", _M, _N); + Kokkos::deep_copy(A_ger.d_base, org_A.d_base); + + // ******************************************************************** + // Call ger() + // ******************************************************************** +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr2, '" << situation << "', alpha = " << alpha + << std::endl; + std::cout << "In Test_Blas2_syr2.hpp, right before calling KokkosBlas::ger()" + << ": ViewTypeA = " << typeid(_ViewTypeA).name() + << ", _kkGerShouldThrowException = " << _kkGerShouldThrowException + << std::endl; +#endif + std::string mode = _useHermitianOption ? "H" : "T"; + bool gotStdException(false); + bool gotUnknownException(false); + try { + KokkosBlas::ger(mode.c_str(), alpha, x, y, A_ger.d_view); + Kokkos::fence(); + } catch (const std::exception& e) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr2, '" << situation + << "', ger() call 1: caught exception, e.what() = " << e.what() + << std::endl; +#endif + gotStdException = true; + } catch (...) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr2, '" << situation + << "', ger() call 1: caught unknown exception" << std::endl; +#endif + gotUnknownException = true; + } + + EXPECT_EQ(gotUnknownException, false) + << "Failed test, '" << situation + << "': unknown exception should not have happened for ger() call 1"; + + EXPECT_EQ(gotStdException, false) + << "Failed test, '" << situation + << "': kk ger() 1 should not have thrown a std::exception"; + + // ******************************************************************** + // Call ger() again + // ******************************************************************** +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout + << "In Test_Blas2_syr2.hpp, right before calling KokkosBlas::ger() again"; +#endif + try { + if (_useHermitianOption) { + KokkosBlas::ger(mode.c_str(), _KAT_A::conj(alpha), y, x, A_ger.d_view); + } else { + KokkosBlas::ger(mode.c_str(), alpha, y, x, A_ger.d_view); + } + Kokkos::fence(); + } catch (const std::exception& e) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr2, '" << situation + << "', ger() call 2: caught exception, e.what() = " << e.what() + << std::endl; +#endif + gotStdException = true; + } catch (...) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr2, '" << situation + << "', ger() call 2: caught unknown exception" << std::endl; +#endif + gotUnknownException = true; + } + + EXPECT_EQ(gotUnknownException, false) + << "Failed test, '" << situation + << "': unknown exception should not have happened for ger() call 2"; + + EXPECT_EQ(gotStdException, false) + << "Failed test, '" << situation + << "': kk ger() 2 should not have thrown a std::exception"; + + // ******************************************************************** + // Prepare h_ger_reference to be compared against h_A_syr2 + // ******************************************************************** + view_stride_adapter<_ViewTypeExpected, true> h_ger_reference( + "h_ger_reference", _M, _N); + Kokkos::deep_copy(h_ger_reference.d_base, A_ger.d_base); + + std::string uplo = _useUpOption ? "U" : "L"; + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + // Keep h_ger_reference as already computed + } else { + h_ger_reference.d_view(i, j) = org_A.h_view(i, j); + } + } + } + if (_useHermitianOption && _A_is_complex) { + for (int i(0); i < _N; ++i) { + h_ger_reference.d_view(i, i) = + 0.5 * (h_ger_reference.d_view(i, i) + + _KAT_A::conj(h_ger_reference.d_view(i, i))); + } + } + + // ******************************************************************** + // Compare + // ******************************************************************** + this->compareKkSyr2AgainstReference(alpha, h_A_syr2, h_ger_reference.d_view); +} + +} // namespace Test + +template +#ifdef HAVE_KOKKOSKERNELS_DEBUG +int test_syr2(const std::string& caseName) { + std::cout << "+==============================================================" + "============" + << std::endl; + std::cout << "Starting " << caseName << "..." << std::endl; +#else +int test_syr2(const std::string& /*caseName*/) { +#endif + bool xBool = std::is_same::value || + std::is_same::value || + std::is_same>::value || + std::is_same>::value; + bool yBool = std::is_same::value || + std::is_same::value || + std::is_same>::value || + std::is_same>::value; + bool aBool = std::is_same::value || + std::is_same::value || + std::is_same>::value || + std::is_same>::value; + bool useAnalyticalResults = xBool && yBool && aBool; + +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "+--------------------------------------------------------------" + "------------" + << std::endl; + std::cout << "Starting " << caseName << " for LAYOUTLEFT ..." << std::endl; +#endif + if (true) { + Test::Syr2Tester + tester; + tester.test(0, 0); + tester.test(1, 0); + tester.test(2, 0); + tester.test(13, 0); + tester.test(1024, 0); + + if (useAnalyticalResults) { + tester.test(1024, 0, true, false, false); + tester.test(1024, 0, true, false, true); + tester.test(1024, 0, true, true, false); + tester.test(1024, 0, true, true, true); + } + + tester.test(2, 0, false, false, true); + tester.test(50, 0, false, false, true); + tester.test(2, 0, false, true, false); + tester.test(50, 0, false, true, false); + tester.test(2, 0, false, true, true); + tester.test(50, 0, false, true, true); + + tester.test(50, 4); + tester.test(2131, 0); + } + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Finished " << caseName << " for LAYOUTLEFT" << std::endl; + std::cout << "+--------------------------------------------------------------" + "------------" + << std::endl; +#endif +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "+--------------------------------------------------------------" + "------------" + << std::endl; + std::cout << "Starting " << caseName << " for LAYOUTRIGHT ..." << std::endl; +#endif + if (true) { + Test::Syr2Tester + tester; + tester.test(0, 0); + tester.test(1, 0); + tester.test(2, 0); + tester.test(13, 0); + tester.test(1024, 0); + + if (useAnalyticalResults) { + tester.test(1024, 0, true, false, false); + tester.test(1024, 0, true, false, true); + tester.test(1024, 0, true, true, false); + tester.test(1024, 0, true, true, true); + } + + tester.test(2, 0, false, false, true); + tester.test(50, 0, false, false, true); + tester.test(2, 0, false, true, false); + tester.test(50, 0, false, true, false); + tester.test(2, 0, false, true, true); + tester.test(50, 0, false, true, true); + + tester.test(50, 4); + tester.test(2131, 0); + } + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Finished " << caseName << " for LAYOUTRIGHT" << std::endl; + std::cout << "+--------------------------------------------------------------" + "------------" + << std::endl; +#endif +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "+--------------------------------------------------------------" + "------------" + << std::endl; + std::cout << "Starting " << caseName << " for LAYOUTSTRIDE ..." << std::endl; +#endif + if (true) { + Test::Syr2Tester + tester; + tester.test(0, 0); + tester.test(1, 0); + tester.test(2, 0); + tester.test(13, 0); + tester.test(1024, 0); + + if (useAnalyticalResults) { + tester.test(1024, 0, true, false, false); + tester.test(1024, 0, true, false, true); + tester.test(1024, 0, true, true, false); + tester.test(1024, 0, true, true, true); + } + + tester.test(2, 0, false, false, true); + tester.test(50, 0, false, false, true); + tester.test(2, 0, false, true, false); + tester.test(50, 0, false, true, false); + tester.test(2, 0, false, true, true); + tester.test(50, 0, false, true, true); + + tester.test(50, 4); + tester.test(2131, 0); + } + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Finished " << caseName << " for LAYOUTSTRIDE" << std::endl; + std::cout << "+--------------------------------------------------------------" + "------------" + << std::endl; +#endif +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "+--------------------------------------------------------------" + "------------" + << std::endl; + std::cout << "Starting " << caseName << " for MIXED LAYOUTS ..." << std::endl; +#endif + if (true) { + Test::Syr2Tester + tester; + tester.test(1, 0); + tester.test(2, 0); + tester.test(1024, 0); + + if (useAnalyticalResults) { + tester.test(1024, 0, true, false, true); + tester.test(1024, 0, true, true, true); + } + + tester.test(2, 0, false, false, true); + tester.test(50, 0, false, false, true); + tester.test(2, 0, false, true, true); + tester.test(50, 0, false, true, true); + } + + if (true) { + Test::Syr2Tester + tester; + tester.test(1024, 0); + } + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Finished " << caseName << " for MIXED LAYOUTS" << std::endl; + std::cout << "+--------------------------------------------------------------" + "------------" + << std::endl; +#endif +#endif + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Finished " << caseName << std::endl; + std::cout << "+==============================================================" + "============" + << std::endl; +#endif + return 1; +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, syr2_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_float"); + test_syr2("test case syr2_float"); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, syr2_complex_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_complex_float"); + test_syr2, Kokkos::complex, + Kokkos::complex, TestDevice>("test case syr2_complex_float"); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, syr2_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_double"); + test_syr2("test case syr2_double"); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, syr2_complex_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_complex_double"); + test_syr2, Kokkos::complex, + Kokkos::complex, TestDevice>( + "test case syr2_complex_double"); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, syr2_int) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_int"); + test_syr2("test case syr2_int"); + Kokkos::Profiling::popRegion(); +} +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +TEST_F(TestCategory, syr2_int_float_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_int_float_double"); + test_syr2("test case syr2_mixed_types"); + Kokkos::Profiling::popRegion(); +} +#endif