Skip to content

Commit

Permalink
Add KOKKOSKERNELS_FORCE_SIMD macro (Fix #1040)
Browse files Browse the repository at this point in the history
to attempt to force vectorization of a standard for loop, using the
pragmas that are available. It won't use "#pragma omp simd" together with
either "#pragma vector always" or "#pragma ivdep", because in the OneAPI
compilers (dpcpp and icpx) those can't be used together on the same loop.
  • Loading branch information
brian-kelley committed Feb 3, 2022
1 parent 4b9e29b commit ddf68f4
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 153 deletions.
20 changes: 20 additions & 0 deletions src/KokkosKernels_Macros.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,26 @@
#endif
#endif

// Macro to place before an ordinary loop to force vectorization, based
// on the pragmas that are supported by the compiler. "Force" means to
// override the compiler's heuristics and always vectorize.
// This respects the fact that "omp simd" is incompatible with
// "vector always" and "ivdep" in the Intel OneAPI toolchain.
#ifdef KOKKOSKERNELS_ENABLE_OMP_SIMD
#define KOKKOSKERNELS_FORCE_SIMD _Pragma("omp simd")
#else
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) && defined(KOKKOS_ENABLE_PRAGMA_VECTOR)
#define KOKKOSKERNELS_FORCE_SIMD _Pragma("ivdep") _Pragma("vector always")
#elif defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#define KOKKOSKERNELS_FORCE_SIMD _Pragma("ivdep")
#elif defined(KOKKOS_ENABLE_PRAGMA_VECTOR)
#define KOKKOSKERNELS_FORCE_SIMD _Pragma("vector always")
#else
// No macros available to suggest vectorization
#define KOKKOSKERNELS_FORCE_SIMD
#endif
#endif

// Macro that tells GCC not to worry if a variable isn't being used.
// Generalized attributes were not implemented in GCC until 4.8:
//
Expand Down
121 changes: 13 additions & 108 deletions src/batched/dense/KokkosBatched_Vector_SIMD.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#include <Kokkos_Complex.hpp>
#include <KokkosBatched_Vector.hpp>
#include "KokkosKernels_Macros.hpp"

#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
#undef __KOKKOSBATCHED_ENABLE_AVX__
Expand Down Expand Up @@ -38,56 +39,24 @@ class Vector<SIMD<T>, l> {
public:
KOKKOS_INLINE_FUNCTION Vector() {
// NOTE Not meant to be instantiated for CUDA
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_VECTOR)
#pragma vector always
#endif
#ifdef KOKKOSKERNELS_ENABLE_OMP_SIMD
#pragma omp simd
#endif
KOKKOSKERNELS_FORCE_SIMD
for (int i = 0; i < vector_length; ++i) _data[i] = 0;
}
template <typename ArgValueType>
KOKKOS_INLINE_FUNCTION Vector(const ArgValueType &val) {
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_VECTOR)
#pragma vector always
#endif
#ifdef KOKKOSKERNELS_ENABLE_OMP_SIMD
#pragma omp simd
#endif
KOKKOSKERNELS_FORCE_SIMD
for (int i = 0; i < vector_length; ++i) _data[i] = val;
}
template <typename ArgValueType>
KOKKOS_INLINE_FUNCTION Vector(
const Vector<SIMD<ArgValueType>, vector_length> &b) {
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_VECTOR)
#pragma vector always
#endif
#ifdef KOKKOSKERNELS_ENABLE_OMP_SIMD
#pragma omp simd
#endif
KOKKOSKERNELS_FORCE_SIMD
for (int i = 0; i < vector_length; ++i) _data[i] = b[i];
}

KOKKOS_INLINE_FUNCTION
type &loadAligned(const value_type *p) {
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_VECTOR)
#pragma vector always
#endif
#ifdef KOKKOSKERNELS_ENABLE_OMP_SIMD
#pragma omp simd
#endif
KOKKOSKERNELS_FORCE_SIMD
for (int i = 0; i < vector_length; ++i) _data[i] = p[i];
return *this;
}
Expand All @@ -97,15 +66,7 @@ class Vector<SIMD<T>, l> {

KOKKOS_INLINE_FUNCTION
void storeAligned(value_type *p) const {
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_VECTOR)
#pragma vector always
#endif
#ifdef KOKKOSKERNELS_ENABLE_OMP_SIMD
#pragma omp simd
#endif
KOKKOSKERNELS_FORCE_SIMD
for (int i = 0; i < vector_length; ++i) p[i] = _data[i];
}

Expand Down Expand Up @@ -568,31 +529,15 @@ class Vector<SIMD<double>, 4> {
template <typename ArgValueType>
inline Vector(const ArgValueType &val) {
auto d = reinterpret_cast<value_type *>(&_data);
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_VECTOR)
#pragma vector always
#endif
#ifdef KOKKOSKERNELS_ENABLE_OMP_SIMD
#pragma omp simd
#endif
KOKKOSKERNELS_FORCE_SIMD
for (int i = 0; i < vector_length; ++i) d[i] = val;
}

template <typename ArgValueType>
inline Vector(const Vector<SIMD<ArgValueType>, vector_length> &b) {
auto dd = reinterpret_cast<value_type *>(&_data);
auto bb = reinterpret_cast<ArgValueType *>(&b._data);
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_VECTOR)
#pragma vector always
#endif
#ifdef KOKKOSKERNELS_ENABLE_OMP_SIMD
#pragma omp simd
#endif
KOKKOSKERNELS_FORCE_SIMD
for (int i = 0; i < vector_length; ++i) dd[i] = bb[i];
}

Expand Down Expand Up @@ -669,15 +614,7 @@ class Vector<SIMD<Kokkos::complex<double> >, 2> {
inline Vector(const Vector<SIMD<ArgValueType>, vector_length> &b) {
auto dd = reinterpret_cast<value_type *>(&_data);
auto bb = reinterpret_cast<ArgValueType *>(&b._data);
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_VECTOR)
#pragma vector always
#endif
#ifdef KOKKOSKERNELS_ENABLE_OMP_SIMD
#pragma omp simd
#endif
KOKKOSKERNELS_FORCE_SIMD
for (int i = 0; i < vector_length; ++i) dd[i] = bb[i];
}

Expand Down Expand Up @@ -745,30 +682,14 @@ class Vector<SIMD<double>, 8> {
template <typename ArgValueType>
inline Vector(const ArgValueType &val) {
auto d = reinterpret_cast<value_type *>(&_data);
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_VECTOR)
#pragma vector always
#endif
#ifdef KOKKOSKERNELS_ENABLE_OMP_SIMD
#pragma omp simd
#endif
KOKKOSKERNELS_FORCE_SIMD
for (int i = 0; i < vector_length; ++i) d[i] = val;
}
template <typename ArgValueType>
inline Vector(const Vector<SIMD<ArgValueType>, vector_length> &b) {
auto dd = reinterpret_cast<value_type *>(&_data);
auto bb = reinterpret_cast<ArgValueType *>(&b._data);
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_VECTOR)
#pragma vector always
#endif
#ifdef KOKKOSKERNELS_ENABLE_OMP_SIMD
#pragma omp simd
#endif
KOKKOSKERNELS_FORCE_SIMD
for (int i = 0; i < vector_length; ++i) dd[i] = bb[i];
}

Expand Down Expand Up @@ -834,30 +755,14 @@ class Vector<SIMD<Kokkos::complex<double> >, 4> {
template <typename ArgValueType>
inline Vector(const ArgValueType &val) {
auto d = reinterpret_cast<value_type *>(&_data);
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_VECTOR)
#pragma vector always
#endif
#ifdef KOKKOSKERNELS_ENABLE_OMP_SIMD
#pragma omp simd
#endif
KOKKOSKERNELS_FORCE_SIMD
for (int i = 0; i < vector_length; ++i) d[i] = val;
}
template <typename ArgValueType>
inline Vector(const Vector<SIMD<ArgValueType>, vector_length> &b) {
auto dd = reinterpret_cast<value_type *>(&_data);
auto bb = reinterpret_cast<value_type *>(&b._data);
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_VECTOR)
#pragma vector always
#endif
#ifdef KOKKOSKERNELS_ENABLE_OMP_SIMD
#pragma omp simd
#endif
KOKKOSKERNELS_FORCE_SIMD
for (int i = 0; i < vector_length; ++i) dd[i] = bb[i];
}

Expand Down
50 changes: 5 additions & 45 deletions src/batched/dense/impl/KokkosBatched_Vector_SIMD_Arith.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,7 @@ KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l)
operator+(const Vector<SIMD<T>, l> &a, const Vector<SIMD<T>, l> &b) {
Vector<SIMD<T>, l> r_val;
if (std::is_fundamental<T>::value) {
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_VECTOR)
#pragma vector always
#endif
#ifdef KOKKOSKERNELS_ENABLE_OMP_SIMD
#pragma omp simd
#endif
KOKKOSKERNELS_FORCE_SIMD
for (int i = 0; i < l; ++i) r_val[i] = a[i] + b[i];
} else {
for (int i = 0; i < l; ++i) r_val[i] = a[i] + b[i];
Expand Down Expand Up @@ -254,15 +246,7 @@ KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l)
operator-(const Vector<SIMD<T>, l> &a, const Vector<SIMD<T>, l> &b) {
Vector<SIMD<T>, l> r_val;
if (std::is_fundamental<T>::value) {
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_VECTOR)
#pragma vector always
#endif
#ifdef KOKKOSKERNELS_ENABLE_OMP_SIMD
#pragma omp simd
#endif
KOKKOSKERNELS_FORCE_SIMD
for (int i = 0; i < l; ++i) r_val[i] = a[i] - b[i];
} else {
for (int i = 0; i < l; ++i) r_val[i] = a[i] - b[i];
Expand Down Expand Up @@ -314,15 +298,7 @@ KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l)
operator-(const Vector<SIMD<T>, l> &a) {
Vector<SIMD<T>, l> r_val;
if (std::is_fundamental<T>::value) {
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_VECTOR)
#pragma vector always
#endif
#ifdef KOKKOSKERNELS_ENABLE_OMP_SIMD
#pragma omp simd
#endif
KOKKOSKERNELS_FORCE_SIMD
for (int i = 0; i < l; ++i) r_val[i] = -a[i];
} else {
for (int i = 0; i < l; ++i) r_val[i] = -a[i];
Expand Down Expand Up @@ -499,15 +475,7 @@ KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l)
operator*(const Vector<SIMD<T>, l> &a, const Vector<SIMD<T>, l> &b) {
Vector<SIMD<T>, l> r_val;
if (std::is_fundamental<T>::value) {
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_VECTOR)
#pragma vector always
#endif
#ifdef KOKKOSKERNELS_ENABLE_OMP_SIMD
#pragma omp simd
#endif
KOKKOSKERNELS_FORCE_SIMD
for (int i = 0; i < l; ++i) r_val[i] = a[i] * b[i];
} else {
for (int i = 0; i < l; ++i) r_val[i] = a[i] * b[i];
Expand Down Expand Up @@ -772,15 +740,7 @@ KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l)
operator/(const Vector<SIMD<T>, l> &a, const Vector<SIMD<T>, l> &b) {
Vector<SIMD<T>, l> r_val;
if (std::is_fundamental<T>::value) {
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_VECTOR)
#pragma vector always
#endif
#ifdef KOKKOSKERNELS_ENABLE_OMP_SIMD
#pragma omp simd
#endif
KOKKOSKERNELS_FORCE_SIMD
for (int i = 0; i < l; ++i) r_val[i] = a[i] / b[i];
} else {
for (int i = 0; i < l; ++i) r_val[i] = a[i] / b[i];
Expand Down

0 comments on commit ddf68f4

Please sign in to comment.