Fix bug in the construct_at optimization #608

Merged
2 commits, merged Oct 26, 2023
@@ -21,6 +21,10 @@
#include "test_macros.h"
#include "test_iterators.h"

#if defined(TEST_COMPILER_MSVC)
#pragma warning(disable:4244)
#endif // TEST_COMPILER_MSVC

struct Foo {
__host__ __device__ constexpr Foo() { }
__host__ __device__ constexpr Foo(int a, char b, double c) : a_(a), b_(b), c_(c) { }
@@ -121,6 +125,14 @@ __host__ __device__ constexpr bool test()
assert(res == &with_special_move_assignment);
}

// ensure that we can construct despite narrowing conversions
{
int i = 0;
int* res = cuda::std::construct_at(&i, 2.0);
assert(res == &i);
assert(*res == 2);
}
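The test above exercises the fix directly. As a standalone illustration (plain C++17, not part of this diff), the underlying issue is that braced initialization rejects the narrowing conversion that placement new performs without complaint:

#include <cassert>
#include <new>

int main() {
  int i = 0;
  // i = int{2.0};                // ill-formed: narrowing conversion inside braces
  int* res = ::new (&i) int(2.0); // OK: direct-initialization converts 2.0 to 2
  assert(*res == 2);
  return 0;
}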

#if 0 // we do not support std::allocator
{
cuda::std::allocator<Counted> a;
@@ -12,107 +12,154 @@
#define _LIBCUDACXX___MEMORY_CONSTRUCT_AT_H

#ifndef __cuda_std__
#include <__config>
# include <__config>
#endif //__cuda_std__

#include "../__assert"
#include "../__concepts/__concept_macros.h"
#include "../__iterator/access.h"
#include "../__memory/addressof.h"
#include "../__memory/voidify.h"
#include "../__type_traits/enable_if.h"
#include "../__type_traits/integral_constant.h"
#include "../__type_traits/is_arithmetic.h"
#include "../__type_traits/is_array.h"
#include "../__type_traits/is_constant_evaluated.h"
#include "../__type_traits/is_trivially_move_assignable.h"
#include "../__type_traits/is_trivially_constructible.h"
#include "../__type_traits/void_t.h"
#include "../__utility/declval.h"
#include "../__utility/forward.h"
#include "../__utility/move.h"

#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_HEADER)
#pragma GCC system_header
# pragma GCC system_header
#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv
_CCCL_IMPLICIT_SYSTEM_HEADER
#endif // !_CCCL_COMPILER_NVHPC

#ifdef _LIBCUDACXX_COMPILER_CLANG_CUDA
#include <new>
# include <new>
#endif // _LIBCUDACXX_COMPILER_CLANG_CUDA

#if defined(__cuda_std__) && _LIBCUDACXX_STD_VER > 17 // need to backfill ::std::construct_at
#ifndef _LIBCUDACXX_COMPILER_NVRTC
#include <memory>
#endif // _LIBCUDACXX_COMPILER_NVRTC

#ifndef __cpp_lib_constexpr_dynamic_alloc
namespace std {
template <class _Tp, class... _Args, class = decltype(::new(_CUDA_VSTD::declval<void*>()) _Tp(_CUDA_VSTD::declval<_Args>()...))>
_LIBCUDACXX_INLINE_VISIBILITY constexpr _Tp* construct_at(_Tp* __location, _Args&&... __args) {
#if defined(_LIBCUDACXX_ADDRESSOF)
# ifndef _LIBCUDACXX_COMPILER_NVRTC
# include <memory>
# endif // _LIBCUDACXX_COMPILER_NVRTC

# ifndef __cpp_lib_constexpr_dynamic_alloc
namespace std
{
template <class _Tp,
class... _Args,
class = decltype(::new(_CUDA_VSTD::declval<void*>()) _Tp(_CUDA_VSTD::declval<_Args>()...))>
_LIBCUDACXX_INLINE_VISIBILITY constexpr _Tp* construct_at(_Tp* __location, _Args&&... __args)
{
# if defined(_LIBCUDACXX_ADDRESSOF)
return ::new (_CUDA_VSTD::__voidify(*__location)) _Tp(_CUDA_VSTD::forward<_Args>(__args)...);
#else
return ::new (const_cast<void*>(static_cast<const volatile void*>(__location))) _Tp(_CUDA_VSTD::forward<_Args>(__args)...);
#endif
# else
return ::new (const_cast<void*>(static_cast<const volatile void*>(__location)))
_Tp(_CUDA_VSTD::forward<_Args>(__args)...);
# endif
}
} // namespace std
#endif // __cpp_lib_constexpr_dynamic_alloc
# endif // __cpp_lib_constexpr_dynamic_alloc
#endif // __cuda_std__ && _LIBCUDACXX_STD_VER > 17
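
The backfill above exists because, during constant evaluation, a placement new-expression is not permitted; C++20 blesses std::construct_at specifically for that purpose, which is why the constexpr branches below route through it. A minimal sketch of the distinction (plain C++20, independent of this header):

#include <memory>

constexpr int make_two()
{
  int storage = 0;
  // ::new (&storage) int(2);       // rejected in a constant expression: placement new is not constexpr
  ::std::construct_at(&storage, 2); // OK: construct_at is usable in constant evaluation
  return storage;
}
static_assert(make_two() == 2, "");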

_LIBCUDACXX_BEGIN_NAMESPACE_STD

// There is a performance issue with placement new, where EDG-based compilers insert a superfluous nullptr check
// Because this is a noticeable performance regression, we specialize for trivially constructible types
// Because this is a noticeable performance regression, we specialize it for certain types
// This is possible because we are calling ::new ignoring any user defined overloads of operator placement new
namespace __detail
{
// We cannot allow narrowing conversions between arithmetic types as the assignment will give errors
template <class _To, class...>
struct __is_narrowing_impl : false_type
{};

template <class _To, class _From>
struct __is_narrowing_impl<_To, _From> : true_type
{};

// This is a bit hacky, but we rely on the fact that arithmetic types cannot have more than one argument to their constructor
template <class _To, class _From>
struct __is_narrowing_impl<_To, _From, __void_t<decltype(_To{_CUDA_VSTD::declval<_From>()})>> : false_type
{};

template <class _Tp, class... _Args>
using __is_narrowing = _If<_LIBCUDACXX_TRAIT(is_arithmetic, _Tp), __is_narrowing_impl<_Tp, _Args...>, false_type>;

// The destination type must be trivially constructible from the arguments and also trivially assignable, because we
// technically move assign in the optimization
template <class _Tp, class... _Args>
struct __can_optimize_construct_at
: integral_constant<bool,
_LIBCUDACXX_TRAIT(is_trivially_constructible, _Tp, _Args...)
&& _LIBCUDACXX_TRAIT(is_trivially_move_assignable, _Tp)
&& !__is_narrowing<_Tp, _Args...>::value>
{};
} // namespace __detail
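
The `__is_narrowing` machinery above relies on list-initialization being ill-formed for narrowing conversions, so the `__void_t` specialization SFINAEs away exactly when `_To{from}` would narrow. The same idiom in standalone form (standard-library names only; a sketch of the technique, not the library's exact layering):

#include <type_traits>
#include <utility>

template <class To, class From, class = void>
struct is_narrowing : std::true_type {}; // assume narrowing unless the braced form compiles

template <class To, class From>
struct is_narrowing<To, From, std::void_t<decltype(To{std::declval<From>()})>> : std::false_type {};

static_assert(is_narrowing<int, double>::value, "int{double} narrows");
static_assert(!is_narrowing<int, short>::value, "int{short} does not narrow");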

// construct_at
#if _LIBCUDACXX_STD_VER > 17

_LIBCUDACXX_DISABLE_EXEC_CHECK
template <class _Tp, class... _Args, class = decltype(::new(_CUDA_VSTD::declval<void*>()) _Tp(_CUDA_VSTD::declval<_Args>()...))>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR_AFTER_CXX17
__enable_if_t<!is_trivially_constructible_v<_Tp, _Args...> ||
!is_trivially_move_assignable_v<_Tp>, _Tp*>
construct_at(_Tp* __location, _Args&&... __args) {
template <class _Tp,
class... _Args,
class = decltype(::new(_CUDA_VSTD::declval<void*>()) _Tp(_CUDA_VSTD::declval<_Args>()...))>
_LIBCUDACXX_INLINE_VISIBILITY
_LIBCUDACXX_CONSTEXPR_AFTER_CXX17 __enable_if_t<!__detail::__can_optimize_construct_at<_Tp, _Args...>::value, _Tp*>
construct_at(_Tp* __location, _Args&&... __args)
{
_LIBCUDACXX_ASSERT(__location != nullptr, "null pointer given to construct_at");
#if defined(__cuda_std__)
# if defined(__cuda_std__)
// Need to go through `std::construct_at` as that is the explicitly blessed function
if (__libcpp_is_constant_evaluated()) {
if (__libcpp_is_constant_evaluated())
{
return ::std::construct_at(__location, _CUDA_VSTD::forward<_Args>(__args)...);
}
#endif // __cuda_std__
# endif // __cuda_std__
return ::new (_CUDA_VSTD::__voidify(*__location)) _Tp(_CUDA_VSTD::forward<_Args>(__args)...);
}

_LIBCUDACXX_DISABLE_EXEC_CHECK
template <class _Tp, class... _Args, class = decltype(::new(_CUDA_VSTD::declval<void*>()) _Tp(_CUDA_VSTD::declval<_Args>()...))>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR_AFTER_CXX17
__enable_if_t<is_trivially_constructible_v<_Tp, _Args...> &&
is_trivially_move_assignable_v<_Tp>, _Tp*>
construct_at(_Tp* __location, _Args&&... __args) {
template <class _Tp,
class... _Args,
class = decltype(::new(_CUDA_VSTD::declval<void*>()) _Tp(_CUDA_VSTD::declval<_Args>()...))>
_LIBCUDACXX_INLINE_VISIBILITY
_LIBCUDACXX_CONSTEXPR_AFTER_CXX17 __enable_if_t<__detail::__can_optimize_construct_at<_Tp, _Args...>::value, _Tp*>
construct_at(_Tp* __location, _Args&&... __args)
{
_LIBCUDACXX_ASSERT(__location != nullptr, "null pointer given to construct_at");
#if defined(__cuda_std__)
# if defined(__cuda_std__)
// Need to go through `std::construct_at` as that is the explicitly blessed function
if (__libcpp_is_constant_evaluated()) {
if (__libcpp_is_constant_evaluated())
{
return ::std::construct_at(__location, _CUDA_VSTD::forward<_Args>(__args)...);
}
*__location = _Tp{_CUDA_VSTD::forward<_Args>(__args)...};
return __location;
#else // ^^^ __cuda_std__ ^^^ / vvv !__cuda_std__ vvv
# else // ^^^ __cuda_std__ ^^^ / vvv !__cuda_std__ vvv
// NVCC always considers construction + move assignment, other compilers are smarter using copy construction
// So rather than adding all kinds of workarounds simply fall back to the correct implementation for libcxx mode
return ::new (_CUDA_VSTD::__voidify(*__location)) _Tp(_CUDA_VSTD::forward<_Args>(__args)...);
#endif // !__cuda_std__
# endif // !__cuda_std__
}
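
The overload above is the optimized path: for types that pass `__can_optimize_construct_at`, construction is replaced by a braced move assignment, avoiding the superfluous null check EDG emits for placement new. A condensed sketch of that dispatch (illustrative names, plain C++17; the narrowing check is omitted here, so calling it with a narrowing argument such as `(int*, double)` reproduces the compile error this PR fixes):

#include <new>
#include <type_traits>
#include <utility>

template <class T, class... Args>
constexpr bool can_optimize = std::is_trivially_constructible_v<T, Args...>
                           && std::is_trivially_move_assignable_v<T>;

template <class T, class... Args>
T* construct_at_sketch(T* loc, Args&&... args)
{
  if constexpr (can_optimize<T, Args...>)
  {
    // fast path: plain assignment, no placement new and hence no injected null check
    *loc = T{std::forward<Args>(args)...}; // braced init is why narrowing arguments must be excluded from the gate
    return loc;
  }
  else
  {
    return ::new (static_cast<void*>(loc)) T(std::forward<Args>(args)...);
  }
}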

#endif // _LIBCUDACXX_STD_VER > 17

_LIBCUDACXX_DISABLE_EXEC_CHECK
template <class _Tp, class... _Args>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR_AFTER_CXX17
__enable_if_t<!_LIBCUDACXX_TRAIT(is_trivially_constructible, _Tp, _Args...) || !_LIBCUDACXX_TRAIT(is_trivially_move_assignable, _Tp), _Tp*>
__construct_at(_Tp* __location, _Args&&... __args) {
_LIBCUDACXX_INLINE_VISIBILITY
_LIBCUDACXX_CONSTEXPR_AFTER_CXX17 __enable_if_t<!__detail::__can_optimize_construct_at<_Tp, _Args...>::value, _Tp*>
__construct_at(_Tp* __location, _Args&&... __args)
{
_LIBCUDACXX_ASSERT(__location != nullptr, "null pointer given to construct_at");
#if defined(__cuda_std__) && _LIBCUDACXX_STD_VER > 17
// Need to go through `std::construct_at` as that is the explicitly blessed function
if (__libcpp_is_constant_evaluated()) {
if (__libcpp_is_constant_evaluated())
{
return ::std::construct_at(__location, _CUDA_VSTD::forward<_Args>(__args)...);
}
#endif // __cuda_std__ && _LIBCUDACXX_STD_VER > 17
@@ -121,13 +168,15 @@ __construct_at(_Tp* __location, _Args&&... __args) {

_LIBCUDACXX_DISABLE_EXEC_CHECK
template <class _Tp, class... _Args>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR_AFTER_CXX17
__enable_if_t<_LIBCUDACXX_TRAIT(is_trivially_constructible, _Tp, _Args...) && _LIBCUDACXX_TRAIT(is_trivially_move_assignable, _Tp), _Tp*>
__construct_at(_Tp* __location, _Args&&... __args) {
_LIBCUDACXX_INLINE_VISIBILITY
_LIBCUDACXX_CONSTEXPR_AFTER_CXX17 __enable_if_t<__detail::__can_optimize_construct_at<_Tp, _Args...>::value, _Tp*>
__construct_at(_Tp* __location, _Args&&... __args)
{
_LIBCUDACXX_ASSERT(__location != nullptr, "null pointer given to construct_at");
#if defined(__cuda_std__) && _LIBCUDACXX_STD_VER > 17
// Need to go through `std::construct_at` as that is the explicitly blessed function
if (__libcpp_is_constant_evaluated()) {
if (__libcpp_is_constant_evaluated())
{
return ::std::construct_at(__location, _CUDA_VSTD::forward<_Args>(__args)...);
}
#endif // __cuda_std__ && _LIBCUDACXX_STD_VER > 17
@@ -140,72 +189,81 @@ __construct_at(_Tp* __location, _Args&&... __args) {
// The internal functions are available regardless of the language version (with the exception of the `__destroy_at`
// taking an array).
template <class _ForwardIterator>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR_AFTER_CXX17
_ForwardIterator __destroy(_ForwardIterator, _ForwardIterator);
_LIBCUDACXX_INLINE_VISIBILITY
_LIBCUDACXX_CONSTEXPR_AFTER_CXX17 _ForwardIterator __destroy(_ForwardIterator, _ForwardIterator);

template <class _Tp, __enable_if_t<!is_array<_Tp>::value, int> = 0>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR_AFTER_CXX17
void __destroy_at(_Tp* __loc) {
_LIBCUDACXX_ASSERT(__loc != nullptr, "null pointer given to destroy_at");
__loc->~_Tp();
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR_AFTER_CXX17 void __destroy_at(_Tp* __loc)
{
_LIBCUDACXX_ASSERT(__loc != nullptr, "null pointer given to destroy_at");
__loc->~_Tp();
}

#if _LIBCUDACXX_STD_VER > 17
template <class _Tp, __enable_if_t<is_array<_Tp>::value, int> = 0>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR_AFTER_CXX17
void __destroy_at(_Tp* __loc) {
_LIBCUDACXX_ASSERT(__loc != nullptr, "null pointer given to destroy_at");
_CUDA_VSTD::__destroy(_CUDA_VSTD::begin(*__loc), _CUDA_VSTD::end(*__loc));
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR_AFTER_CXX17 void __destroy_at(_Tp* __loc)
{
_LIBCUDACXX_ASSERT(__loc != nullptr, "null pointer given to destroy_at");
_CUDA_VSTD::__destroy(_CUDA_VSTD::begin(*__loc), _CUDA_VSTD::end(*__loc));
}
#endif

template <class _ForwardIterator>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR_AFTER_CXX17
_ForwardIterator __destroy(_ForwardIterator __first, _ForwardIterator __last) {
for (; __first != __last; ++__first)
_CUDA_VSTD::__destroy_at(_CUDA_VSTD::addressof(*__first));
return __first;
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR_AFTER_CXX17 _ForwardIterator
__destroy(_ForwardIterator __first, _ForwardIterator __last)
{
for (; __first != __last; ++__first)
{
_CUDA_VSTD::__destroy_at(_CUDA_VSTD::addressof(*__first));
}
return __first;
}

template <class _BidirectionalIterator>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR_AFTER_CXX17
_BidirectionalIterator __reverse_destroy(_BidirectionalIterator __first, _BidirectionalIterator __last) {
while (__last != __first) {
--__last;
_CUDA_VSTD::__destroy_at(_CUDA_VSTD::addressof(*__last));
}
return __last;
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR_AFTER_CXX17 _BidirectionalIterator
__reverse_destroy(_BidirectionalIterator __first, _BidirectionalIterator __last)
{
while (__last != __first)
{
--__last;
_CUDA_VSTD::__destroy_at(_CUDA_VSTD::addressof(*__last));
}
return __last;
}

#if _LIBCUDACXX_STD_VER > 14

template <class _Tp, enable_if_t<!is_array_v<_Tp>, int> = 0>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR_AFTER_CXX17
void destroy_at(_Tp* __loc) {
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR_AFTER_CXX17 void destroy_at(_Tp* __loc)
{
_LIBCUDACXX_ASSERT(__loc != nullptr, "null pointer given to destroy_at");
__loc->~_Tp();
}

#if _LIBCUDACXX_STD_VER > 17
# if _LIBCUDACXX_STD_VER > 17
template <class _Tp, enable_if_t<is_array_v<_Tp>, int> = 0>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR_AFTER_CXX17
void destroy_at(_Tp* __loc) {
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR_AFTER_CXX17 void destroy_at(_Tp* __loc)
{
_CUDA_VSTD::__destroy_at(__loc);
}
#endif // _LIBCUDACXX_STD_VER > 17
# endif // _LIBCUDACXX_STD_VER > 17

template <class _ForwardIterator>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR_AFTER_CXX17
void destroy(_ForwardIterator __first, _ForwardIterator __last) {
(void)_CUDA_VSTD::__destroy(_CUDA_VSTD::move(__first), _CUDA_VSTD::move(__last));
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR_AFTER_CXX17 void
destroy(_ForwardIterator __first, _ForwardIterator __last)
{
(void) _CUDA_VSTD::__destroy(_CUDA_VSTD::move(__first), _CUDA_VSTD::move(__last));
}

template <class _ForwardIterator, class _Size>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR_AFTER_CXX17
_ForwardIterator destroy_n(_ForwardIterator __first, _Size __n) {
for (; __n > 0; (void)++__first, --__n)
_CUDA_VSTD::__destroy_at(_CUDA_VSTD::addressof(*__first));
return __first;
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR_AFTER_CXX17 _ForwardIterator
destroy_n(_ForwardIterator __first, _Size __n)
{
for (; __n > 0; (void) ++__first, --__n)
{
_CUDA_VSTD::__destroy_at(_CUDA_VSTD::addressof(*__first));
}
return __first;
}

#endif // _LIBCUDACXX_STD_VER > 14