Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix performance for CUDA >= 9.2 (master) #1327

Merged
merged 22 commits into from
Jul 8, 2019
9 changes: 5 additions & 4 deletions include/gridtools/common/array.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "../meta/macros.hpp"
#include "../meta/repeat.hpp"
#include "defs.hpp"
#include "generic_metafunctions/const_ref.hpp"
#include "generic_metafunctions/utility.hpp"
#include "gt_assert.hpp"
#include "host_device.hpp"
Expand Down Expand Up @@ -117,13 +118,13 @@ namespace gridtools {
}

template <size_t I, typename T, size_t D>
static GT_FUNCTION GT_CONSTEXPR const T &get(const array<T, D> &arr) noexcept {
static GT_FUNCTION GT_CONSTEXPR const_ref<T> get(const array<T, D> &arr) noexcept {
GT_STATIC_ASSERT(I < D, "index is out of bounds");
return arr.m_array[I];
}

template <size_t I, typename T, size_t D>
static GT_FUNCTION GT_CONSTEXPR T &&get(array<T, D> &&arr) noexcept {
static GT_FUNCTION GT_CONSTEXPR T get(array<T, D> &&arr) noexcept {
GT_STATIC_ASSERT(I < D, "index is out of bounds");
return wstd::move(arr.m_array[I]);
}
Expand Down Expand Up @@ -187,13 +188,13 @@ namespace gridtools {
}

template <size_t I, typename T, size_t D>
GT_FUNCTION GT_CONSTEXPR const T &get(const array<T, D> &arr) noexcept {
GT_FUNCTION GT_CONSTEXPR const_ref<T> get(const array<T, D> &arr) noexcept {
GT_STATIC_ASSERT(I < D, "index is out of bounds");
return arr.m_array[I];
}

template <size_t I, typename T, size_t D>
GT_FUNCTION GT_CONSTEXPR T &&get(array<T, D> &&arr) noexcept {
GT_FUNCTION GT_CONSTEXPR T get(array<T, D> &&arr) noexcept {
GT_STATIC_ASSERT(I < D, "index is out of bounds");
return wstd::move(get<I>(arr));
}
Expand Down
4 changes: 0 additions & 4 deletions include/gridtools/common/defs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,7 @@
@brief global definitions
*/

#ifdef __CUDA_ARCH__
#define GT_CONSTEXPR
#else
#define GT_CONSTEXPR constexpr
#endif

#define GT_RESTRICT __restrict__

Expand Down
31 changes: 31 additions & 0 deletions include/gridtools/common/generic_metafunctions/const_ref.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/*
* GridTools
*
* Copyright (c) 2014-2019, ETH Zurich
* All rights reserved.
*
* Please, refer to the LICENSE file in the root directory.
* SPDX-License-Identifier: BSD-3-Clause
*/

#pragma once

#include <type_traits>

#include "../../meta/macros.hpp"
#include "../../meta/type_traits.hpp"

namespace gridtools {
    namespace lazy {
        /**
         *  Metafunction that selects the type used to hand out a read-only `Value`.
         *
         *  General case: the const-qualified type turned into an lvalue reference
         *  (for a plain object type this is `Value const &`).
         */
        template <class Value, class Enable = void>
        struct const_ref {
            using type = std::add_lvalue_reference_t<std::add_const_t<Value>>;
        };

        /**
         *  Special case: `Value` is not a reference, is trivially copy constructible and is not larger than a
         *  pointer to it. Then the result is the const-qualified type itself, without a reference.
         *
         *  If the condition can not be formed at all (e.g. `sizeof` is not applicable to `Value`), this
         *  specialization silently drops out and the general case is used.
         */
        template <class Value>
        struct const_ref<Value,
            std::enable_if_t<!std::is_reference<Value>::value && std::is_trivially_copy_constructible<Value>::value &&
                             sizeof(Value) <= sizeof(std::add_pointer_t<Value>)>> {
            using type = std::add_const_t<Value>;
        };
    } // namespace lazy

    /// Eager form of `lazy::const_ref`: names the selected type directly.
    template <class T>
    using const_ref = typename lazy::const_ref<T>::type;
} // namespace gridtools
19 changes: 7 additions & 12 deletions include/gridtools/common/gt_assert.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
* SPDX-License-Identifier: BSD-3-Clause
*/
#pragma once
#include <cassert>
#include <stdexcept>

/** \ingroup common
Expand All @@ -16,19 +17,13 @@
@{
*/

#ifdef __CUDACC__
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
// we take the cuda assert for arch greater than 2.x
#include <assert.h>
#else
#undef assert
#define assert(e)
#endif
#else
#include <cassert>
#endif

#ifdef __CUDA_ARCH__
#if __CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ == 2
// we define this macro to an empty string for CUDA 9.2 because in certain cases, CUDA 9.2 tries to compile device
// instantiations of certain constexpr function templates, which can lead to compile-time errors like "cannot use an
// entity undefined in device code".
#define __PRETTY_FUNCTION__ ""
#endif
#define GT_ASSERT_OR_THROW(cond, msg) assert(cond)
#else
#define GT_ASSERT_OR_THROW(cond, msg) \
Expand Down
9 changes: 5 additions & 4 deletions include/gridtools/common/pair.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <utility>

#include "defs.hpp"
#include "generic_metafunctions/const_ref.hpp"
#include "generic_metafunctions/utility.hpp"
#include "host_device.hpp"

Expand Down Expand Up @@ -129,30 +130,30 @@ namespace gridtools {
template <>
struct pair_get<0> {
template <typename T1, typename T2>
static GT_CONSTEXPR GT_FUNCTION const T1 &const_get(const pair<T1, T2> &p) noexcept {
static GT_CONSTEXPR GT_FUNCTION const_ref<T1> const_get(const pair<T1, T2> &p) noexcept {
return p.first;
}
template <typename T1, typename T2>
static GT_CONSTEXPR GT_FUNCTION T1 &get(pair<T1, T2> &p) noexcept {
return p.first;
}
template <typename T1, typename T2>
static GT_CONSTEXPR GT_FUNCTION T1 &&move_get(pair<T1, T2> &&p) noexcept {
static GT_CONSTEXPR GT_FUNCTION T1 move_get(pair<T1, T2> &&p) noexcept {
return wstd::move(p.first);
}
};
template <>
struct pair_get<1> {
template <typename T1, typename T2>
static GT_CONSTEXPR GT_FUNCTION const T2 &const_get(const pair<T1, T2> &p) noexcept {
static GT_CONSTEXPR GT_FUNCTION const_ref<T2> const_get(const pair<T1, T2> &p) noexcept {
return p.second;
}
template <typename T1, typename T2>
static GT_CONSTEXPR GT_FUNCTION T2 &get(pair<T1, T2> &p) noexcept {
return p.second;
}
template <typename T1, typename T2>
static GT_CONSTEXPR GT_FUNCTION T2 &&move_get(pair<T1, T2> &&p) noexcept {
static GT_CONSTEXPR GT_FUNCTION T2 move_get(pair<T1, T2> &&p) noexcept {
return wstd::move(p.second);
}
};
Expand Down
17 changes: 9 additions & 8 deletions include/gridtools/common/tuple.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

#include "../meta/type_traits.hpp"
#include "defs.hpp"
#include "generic_metafunctions/const_ref.hpp"
#include "generic_metafunctions/utility.hpp"
#include "host_device.hpp"

Expand Down Expand Up @@ -50,7 +51,7 @@ namespace gridtools {

struct tuple_leaf_getter {
template <size_t I, class T>
static GT_CONSTEXPR GT_FUNCTION T const &get(tuple_leaf<I, T, false> const &obj) noexcept {
static GT_CONSTEXPR GT_FUNCTION const_ref<T> get(tuple_leaf<I, T, false> const &obj) noexcept {
return obj.m_value;
}

Expand All @@ -60,12 +61,12 @@ namespace gridtools {
}

template <size_t I, class T>
static GT_CONSTEXPR GT_FUNCTION T &&get(tuple_leaf<I, T, false> &&obj) noexcept {
static GT_CONSTEXPR GT_FUNCTION T get(tuple_leaf<I, T, false> &&obj) noexcept {
return static_cast<T &&>(get<I>(obj));
}

template <size_t I, class T>
static GT_CONSTEXPR GT_FUNCTION T const &get(tuple_leaf<I, T, true> const &obj) noexcept {
static GT_CONSTEXPR GT_FUNCTION const_ref<T> get(tuple_leaf<I, T, true> const &obj) noexcept {
return obj;
}

Expand All @@ -75,7 +76,7 @@ namespace gridtools {
}

template <size_t I, class T>
static GT_CONSTEXPR GT_FUNCTION T &&get(tuple_leaf<I, T, true> &&obj) noexcept {
static GT_CONSTEXPR GT_FUNCTION T get(tuple_leaf<I, T, true> &&obj) noexcept {
return static_cast<T &&>(obj);
}
};
Expand Down Expand Up @@ -171,7 +172,7 @@ namespace gridtools {
tuple &operator=(tuple const &) = default;
tuple &operator=(tuple &&) = default;

GT_CONSTEXPR GT_FUNCTION tuple(Ts const &... args) noexcept : m_impl(args...) {}
GT_CONSTEXPR GT_FUNCTION tuple(const_ref<Ts>... args) noexcept : m_impl(args...) {}

template <class... Args,
std::enable_if_t<sizeof...(Ts) == sizeof...(Args) &&
Expand Down Expand Up @@ -205,7 +206,7 @@ namespace gridtools {
T m_value;
struct getter {
template <size_t I, std::enable_if_t<I == 0, int> = 0>
static GT_CONSTEXPR GT_FUNCTION T const &get(tuple const &obj) noexcept {
static GT_CONSTEXPR GT_FUNCTION const_ref<T> get(tuple const &obj) noexcept {
return obj.m_value;
}

Expand All @@ -215,7 +216,7 @@ namespace gridtools {
}

template <size_t I, std::enable_if_t<I == 0, int> = 0>
static GT_CONSTEXPR GT_FUNCTION T &&get(tuple &&obj) noexcept {
static GT_CONSTEXPR GT_FUNCTION T get(tuple &&obj) noexcept {
return static_cast<T &&>(obj.m_value);
}
};
Expand All @@ -232,7 +233,7 @@ namespace gridtools {
tuple &operator=(tuple const &) = default;
tuple &operator=(tuple &&) = default;

GT_CONSTEXPR GT_FUNCTION tuple(T const &arg) noexcept : m_value(arg) {}
GT_CONSTEXPR GT_FUNCTION tuple(const_ref<T> arg) noexcept : m_value(arg) {}

template <class Arg, std::enable_if_t<std::is_constructible<T, Arg &&>::value, int> = 0>
GT_CONSTEXPR GT_FUNCTION tuple(Arg &&arg) noexcept : m_value(wstd::forward<Arg>(arg)) {}
Expand Down
32 changes: 15 additions & 17 deletions include/gridtools/common/tuple_util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -253,10 +253,7 @@ namespace gridtools {
enum class ref_kind { rvalue, lvalue, const_lvalue };

template <class>
struct get_ref_kind;

template <class T>
struct get_ref_kind<T &&> : std::integral_constant<ref_kind, ref_kind::rvalue> {};
struct get_ref_kind : std::integral_constant<ref_kind, ref_kind::rvalue> {};

template <class T>
struct get_ref_kind<T &> : std::integral_constant<ref_kind, ref_kind::lvalue> {};
Expand All @@ -269,7 +266,9 @@ namespace gridtools {
struct add_ref;

template <class T>
struct add_ref<ref_kind::rvalue, T> : std::add_rvalue_reference<T> {};
struct add_ref<ref_kind::rvalue, T> {
using type = T;
};

template <class T>
struct add_ref<ref_kind::lvalue, T> : std::add_lvalue_reference<T> {};
Expand Down Expand Up @@ -419,8 +418,7 @@ namespace gridtools {
template <class Tup,
class... Tups,
class Is = meta::make_indices<size<std::decay_t<Tup>>>,
class Res =
from_types<Tup, get_results_t<Is, get_accessors<Tup &&>, get_accessors<Tups &&>...>>>
class Res = from_types<Tup, get_results_t<Is, get_accessors<Tup>, get_accessors<Tups>...>>>
GT_TARGET GT_FORCE_INLINE GT_CONSTEXPR Res operator()(Tup &&tup, Tups &&... tups) const {
using generators_t = meta::transform<get_transform_index_generator, Is>;
return generate_f<generators_t, Res>{}(
Expand Down Expand Up @@ -517,7 +515,7 @@ namespace gridtools {
meta::make_indices_for<InnerTup>>;

template <class Tup,
class Accessors = meta::transform<get_accessors, get_accessors<Tup &&>>,
class Accessors = meta::transform<get_accessors, get_accessors<Tup>>,
class First = meta::first<to_types<Tup>>,
class Res = from_types<First, meta::flatten<Accessors>>>
GT_TARGET GT_FORCE_INLINE GT_CONSTEXPR Res operator()(Tup &&tup) const {
Expand All @@ -534,7 +532,7 @@ namespace gridtools {
using get_drop_front_generator = get_nth_f<N + I::value>;

template <class Tup,
class Accessors = get_accessors<Tup &&>,
class Accessors = get_accessors<Tup>,
class Res = from_types<Tup, meta::drop_front_c<N, Accessors>>>
GT_TARGET GT_FORCE_INLINE GT_CONSTEXPR Res operator()(Tup &&tup) const {
using generators =
Expand All @@ -558,7 +556,7 @@ namespace gridtools {
struct push_back_f {
template <class Tup,
class... Args,
class Accessors = get_accessors<Tup &&>,
class Accessors = get_accessors<Tup>,
class Res = from_types<Tup, meta::push_back<Accessors, Args &&...>>>
GT_TARGET GT_FORCE_INLINE GT_CONSTEXPR Res operator()(Tup &&tup, Args &&... args) const {
return push_back_impl_f<std::make_index_sequence<size<Accessors>::value>, Res>{}(
Expand All @@ -581,7 +579,7 @@ namespace gridtools {
struct push_front_f {
template <class Tup,
class... Args,
class Accessors = get_accessors<Tup &&>,
class Accessors = get_accessors<Tup>,
class Res = from_types<Tup, meta::push_front<Accessors, Args &&...>>>
GT_TARGET GT_FORCE_INLINE GT_CONSTEXPR Res operator()(Tup &&tup, Args &&... args) const {
return push_front_impl_f<std::make_index_sequence<size<Accessors>::value>, Res>{}(
Expand Down Expand Up @@ -634,7 +632,7 @@ namespace gridtools {
size_t N,
class State,
class Tup,
class AllAccessors = get_accessors<Tup &&>,
class AllAccessors = get_accessors<Tup>,
class Accessors = meta::drop_front_c<I, AllAccessors>,
class Res = meta::lfold<meta_fun, State &&, Accessors>,
std::enable_if_t<(I + 4 < N), int> = 0>
Expand All @@ -651,15 +649,15 @@ namespace gridtools {

template <class State,
class Tup,
class Accessors = get_accessors<Tup &&>,
class Accessors = get_accessors<Tup>,
class Res = meta::lfold<meta_fun, State &&, Accessors>>
GT_TARGET GT_FORCE_INLINE GT_CONSTEXPR Res operator()(State &&state, Tup &&tup) const {
return impl<0, size<std::decay_t<Tup>>::value>(
wstd::forward<State>(state), wstd::forward<Tup>(tup));
}

template <class Tup,
class AllAccessors = get_accessors<Tup &&>,
class AllAccessors = get_accessors<Tup>,
class StateAccessor = meta::first<AllAccessors>,
class Accessors = meta::drop_front_c<1, AllAccessors>,
class Res = meta::lfold<meta_fun, StateAccessor, Accessors>>
Expand Down Expand Up @@ -753,7 +751,7 @@ namespace gridtools {

template <class Tup,
class First = meta::first<to_types<Tup>>,
class Accessors = meta::transform<get_accessors, get_accessors<Tup &&>>,
class Accessors = meta::transform<get_accessors, get_accessors<Tup>>,
class Types = meta::transpose<Accessors>,
class InnerTuples = meta::transform<get_inner_tuple_f<Tup>::template apply, Types>,
class Res = from_types<First, InnerTuples>>
Expand All @@ -774,7 +772,7 @@ namespace gridtools {
};

template <class Tup,
class Accessors = get_accessors<Tup &&>,
class Accessors = get_accessors<Tup>,
class Res = from_types<Tup, meta::reverse<Accessors>>>
GT_TARGET GT_FORCE_INLINE GT_CONSTEXPR Res operator()(Tup &&tup) const {
using n_t = size<std::decay_t<Tup>>;
Expand Down Expand Up @@ -813,7 +811,7 @@ namespace gridtools {
meta::if_c<I::value == N, insert_val_generator_f, insert_tup_generator_f<I::value - 1>>>;

template <class Tup,
class Accessors = get_accessors<Tup &&>,
class Accessors = get_accessors<Tup>,
class Types = meta::insert_c<N, Accessors, Val>,
class Res = from_types<Tup, Types>>
GT_TARGET GT_FORCE_INLINE GT_CONSTEXPR Res operator()(Tup &&tup) const {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,19 @@ namespace gridtools {
return arg;
}

// Applies `eval` to a non-arithmetic argument and returns exactly what that call yields (`decltype(auto)`).
// The overload is disabled via `enable_if` when `Arg` is an arithmetic type.
//
// How `arg` is passed depends on the compiler:
// - intel compiler 18.0 segfaults if this is a value, so for `__INTEL_COMPILER <= 1800` the argument is taken by
//   const lvalue reference and passed on to `eval` unchanged.
// - On the other hand, nvcc performs much worse in the dycore if it is an lvalue reference, so in all other cases
//   the argument is taken by value and moved into `eval`.
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER <= 1800)
// Intel (<= 18.0) variant: argument by const reference.
template <class Eval, class Arg, std::enable_if_t<!std::is_arithmetic<Arg>::value, int> = 0>
GT_FUNCTION GT_CONSTEXPR decltype(auto) apply_eval(Eval &eval, Arg const &arg) {
return eval(arg);
}
#else
// Default variant: argument by value, forwarded to `eval` as an rvalue.
template <class Eval, class Arg, std::enable_if_t<!std::is_arithmetic<Arg>::value, int> = 0>
GT_FUNCTION GT_CONSTEXPR decltype(auto) apply_eval(Eval &eval, Arg arg) {
return eval(wstd::move(arg));
}
#endif

template <class Eval, class Op, class Arg>
GT_FUNCTION GT_CONSTEXPR auto value(Eval &eval, expr<Op, Arg> const &arg) {
Expand Down
Loading