diff --git a/libcudacxx/include/cuda/barrier b/libcudacxx/include/cuda/barrier index 25b14a7f18..50bfe664e8 100644 --- a/libcudacxx/include/cuda/barrier +++ b/libcudacxx/include/cuda/barrier @@ -50,7 +50,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL // capability 9.0 and above. The check for (!defined(__CUDA_MINIMUM_ARCH__)) is // necessary to prevent cudafe from ripping out the device functions before // device compilation begins. -#if (!defined(__CUDA_MINIMUM_ARCH__)) || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) +#ifdef __cccl_lib_experimental_ctk12_cp_async_exposure // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk inline _LIBCUDACXX_DEVICE @@ -288,7 +288,7 @@ void cp_async_bulk_wait_group_read() : "memory"); } -#endif // __CUDA_MINIMUM_ARCH__ +#endif // __cccl_lib_experimental_ctk12_cp_async_exposure _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h similarity index 57% rename from libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h rename to libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h index 5dcccc5d1f..54f833a814 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h @@ -1,19 +1,18 @@ -// -*- C++ -*- //===----------------------------------------------------------------------===// // // Part of libcu++, the C++ Standard Library for your entire system, // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// +#ifndef __CCCL_PTX_ISA_H_ +#define __CCCL_PTX_ISA_H_ -#ifndef _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_ -#define _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_ - -#include // __CUDA_MINIMUM_ARCH__ and friends +#include "../__cccl/compiler.h" +#include "../__cccl/system_header.h" #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header @@ -23,6 +22,8 @@ # pragma system_header #endif // no system header +#include // __CUDA_MINIMUM_ARCH__ and friends + /* * Targeting macros * @@ -31,47 +32,75 @@ */ // PTX ISA 8.3 is available from CUDA 12.3, driver r545 -#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__)) +// The first define is for future major versions of CUDACC. +// We make sure that these get the highest known PTX ISA version. 
+#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ > 12)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 830ULL +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 830ULL // PTX ISA 8.2 is available from CUDA 12.2, driver r535 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 820ULL // PTX ISA 8.1 is available from CUDA 12.1, driver r530 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 810ULL // PTX ISA 8.0 is available from CUDA 12.0, driver r525 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 800ULL // PTX ISA 7.8 is available from CUDA 11.8, driver r520 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 780ULL // PTX ISA 7.7 is available from CUDA 11.7, driver r515 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 7)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 7)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 770ULL // PTX ISA 7.6 is available from CUDA 11.6, driver r510 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 6)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 6)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 760ULL // PTX ISA 7.5 is available from CUDA 11.5, driver r495 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 5)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 5)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 750ULL // PTX ISA 7.4 is available from CUDA 11.4, driver r470 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 4)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 4)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 740ULL // PTX ISA 7.3 is available from CUDA 11.3, driver r465 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 3)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define 
__cccl_ptx_isa 730ULL // PTX ISA 7.2 is available from CUDA 11.2, driver r460 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 2)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 720ULL // PTX ISA 7.1 is available from CUDA 11.1, driver r455 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 1)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 710ULL // PTX ISA 7.0 is available from CUDA 11.0, driver r445 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 0)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 700ULL // Fallback case. Define the ISA version to be zero. This ensures that the macro is always defined. #else # define __cccl_ptx_isa 0ULL #endif -#endif // _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_ +// We define certain feature test macros depending on availability. When +// __CUDA_MINIMUM_ARCH__ is not available, we define the following features +// depending on PTX ISA. This permits checking for the feature in host code. +// When __CUDA_MINIMUM_ARCH__ is available, we only enable the feature when the +// hardware supports it. +#if ((!defined(__CUDA_MINIMUM_ARCH__)) \ + || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__)) && __cccl_ptx_isa >= 800 +# define __cccl_lib_local_barrier_arrive_tx +# define __cccl_lib_experimental_ctk12_cp_async_exposure +#endif + +#endif // __CCCL_PTX_ISA_H_ diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl_config b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl_config index bb3a5078dd..0e8612926f 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl_config +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl_config @@ -15,6 +15,7 @@ #include "__cccl/diagnostic.h" #include "__cccl/dialect.h" #include "__cccl/execution_space.h" +#include "__cccl/ptx_isa.h" #include "__cccl/system_header.h" #include "__cccl/version.h" #include "__cccl/visibility.h" diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h index 3a0bcd10ea..59b204c828 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h @@ -572,11 +572,8 @@ inline _CUDA_VSTD::uint64_t * barrier_native_handle(barrier #if defined(_CCCL_CUDA_COMPILER) -// Hide arrive_tx when CUDA architecture is insufficient. Note the -// (!defined(__CUDA_MINIMUM_ARCH__)). This is required to make sure the function -// does not get removed by cudafe, which does not define __CUDA_MINIMUM_ARCH__.
-#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) - +#if __cccl_ptx_isa >= 800 +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__(); _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_DEVICE inline barrier::arrival_token barrier_arrive_tx( barrier & __b, @@ -591,7 +588,7 @@ barrier::arrival_token barrier_arrive_tx( _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1."); barrier::arrival_token __token = {}; - NV_IF_TARGET( + NV_IF_ELSE_TARGET( // On architectures pre-sm90, arrive_tx is not supported. NV_PROVIDES_SM_90, ( // We do not check for the statespace of the barrier here. This is @@ -619,11 +616,47 @@ barrier::arrival_token barrier_arrive_tx( _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __native_handle, __arrive_count_update ); } + ),( + __cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__(); ) ); return __token; } +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_barrier_expect_tx_is_not_supported_before_SM_90__(); +_LIBCUDACXX_DEVICE inline +void barrier_expect_tx( + barrier & __b, + _CUDA_VSTD::ptrdiff_t __transaction_count_update) { + + _LIBCUDACXX_DEBUG_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); + _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative."); + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object + _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1."); + + // We do not check for the statespace of the barrier here. This is + // on purpose. This allows debugging tools like memcheck/racecheck + // to detect that we are passing a pointer with the wrong state + // space to mbarrier.arrive. If we checked for the state space here, + // and __trap() if wrong, then those tools would not be able to help + // us in release builds. In debug builds, the error would be caught + // by the asserts at the top of this function. + NV_IF_ELSE_TARGET( + // On architectures pre-sm90, expect_tx is not supported. + NV_PROVIDES_SM_90, ( + auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b)); + asm ( + "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) + : "memory"); + ),( + __cuda_ptx_barrier_expect_tx_is_not_supported_before_SM_90__(); + )); +} + +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_memcpy_async_tx_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx( _Tp* __dest, @@ -643,6 +676,7 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx( _LIBCUDACXX_DEBUG_ASSERT(__isShared(__dest), "dest must point to shared memory."); _LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "src must point to global memory."); + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b)); if (__isShared(__dest) && __isGlobal(__src)) { asm volatile( "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n" @@ -660,36 +694,13 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx( // is not yet implemented. So we trap in this case as well.
_LIBCUDACXX_UNREACHABLE(); } + ),( + __cuda_ptx_memcpy_async_tx_is_not_supported_before_SM_90__(); + )); return async_contract_fulfillment::async; } - -_LIBCUDACXX_DEVICE inline -void barrier_expect_tx( - barrier & __b, - _CUDA_VSTD::ptrdiff_t __transaction_count_update) { - - _LIBCUDACXX_DEBUG_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); - _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative."); - // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object - _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1."); - - // We do not check for the statespace of the barrier here. This is - // on purpose. This allows debugging tools like memcheck/racecheck - // to detect that we are passing a pointer with the wrong state - // space to mbarrier.arrive. If we checked for the state space here, - // and __trap() if wrong, then those tools would not be able to help - // us in release builds. In debug builds, the error would be caught - // by the asserts at the top of this function. - auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b)); - asm ( - "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" - : - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) - : "memory"); -} -#endif // __CUDA_MINIMUM_ARCH__ +#endif // __cccl_ptx_isa >= 800 #endif // _CCCL_CUDA_COMPILER _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE @@ -763,6 +774,7 @@ _CUDA_VSTD::uint64_t * __try_get_barrier_handle<::cuda::thread_scope_block, _CUD // The user is still responsible for arriving and waiting on (or otherwise // synchronizing with) the barrier or pipeline barrier to see the results of // copies from other threads participating in the synchronization object. +extern "C" _LIBCUDACXX_HOST_DEVICE void __cuda_ptx_mbarrier_complete_tx_is_not_supported_before_SM_90__(); struct __memcpy_completion_impl { template @@ -796,13 +808,17 @@ struct __memcpy_completion_impl { // bulk group to be used with shared memory barriers. _LIBCUDACXX_UNREACHABLE(); case __completion_mechanism::__mbarrier_complete_tx: +#if __cccl_ptx_isa >= 800 // Pre-sm90, the mbarrier_complete_tx completion mechanism is not available. NV_IF_TARGET(NV_PROVIDES_SM_90, ( // Only perform the expect_tx operation with the leader thread if (__group.thread_rank() == 0) { ::cuda::device::barrier_expect_tx(__barrier, __size); } + ),( + __cuda_ptx_mbarrier_complete_tx_is_not_supported_before_SM_90__(); )); +#endif // __cccl_ptx_isa >= 800 return async_contract_fulfillment::async; case __completion_mechanism::__sync: // sync: In this case, we do not need to do anything. The user will have @@ -929,11 +945,13 @@ struct __memcpy_completion_impl { * 5.
normal synchronous copy (fallback) ***********************************************************************/ -#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) +#if __cccl_ptx_isa >= 800 +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__(); template inline __device__ void __cp_async_bulk_shared_global(const _Group &__g, char * __dest, const char * __src, size_t __size, uint64_t *__bar_handle) { // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( if (__g.thread_rank() == 0) { asm volatile( "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n" @@ -944,10 +962,13 @@ void __cp_async_bulk_shared_global(const _Group &__g, char * __dest, const char "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__bar_handle))) : "memory"); } + ),( + __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__(); + )); } -#endif // __CUDA_MINIMUM_ARCH__ +#endif // __cccl_ptx_isa >= 800 -#if (defined(__CUDA_MINIMUM_ARCH__) && 800 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__(); template inline __device__ void __cp_async_shared_global(char * __dest, const char * __src) { @@ -959,6 +980,7 @@ void __cp_async_shared_global(char * __dest, const char * __src) { static_assert(_Copy_size == 4 || _Copy_size == 8 || _Copy_size == 16, "cp.async.shared.global requires a copy size of 4, 8, or 16."); #endif // _CCCL_STD_VER >= 2017 + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( asm volatile( "cp.async.ca.shared.global [%0], [%1], %2, %2;" : @@ -966,6 +988,9 @@ void __cp_async_shared_global(char * __dest, const char * __src) { "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))), "n"(_Copy_size) : "memory"); + ),( + __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__(); + )); } template <> @@ -973,6 +998,7 @@ inline __device__ void __cp_async_shared_global<16>(char * __dest, const char * __src) { // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async // When copying 16 bytes, it is possible to skip L1 cache (.cg). 
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( asm volatile( "cp.async.cg.shared.global [%0], [%1], %2, %2;" : @@ -980,6 +1006,9 @@ void __cp_async_shared_global<16>(char * __dest, const char * __src) { "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))), "n"(16) : "memory"); + ),( + __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__(); + )); } template @@ -1002,7 +1031,6 @@ void __cp_async_shared_global_mechanism(_Group __g, char * __dest, const char * __cp_async_shared_global<__copy_size>(__dest + __offset, __src + __offset); } } -#endif // __CUDA_MINIMUM_ARCH__ template struct __copy_chunk { @@ -1083,6 +1111,7 @@ __completion_mechanism __dispatch_memcpy_async_any_to_any(_Group const & __group template<_CUDA_VSTD::size_t _Align, typename _Group> _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_DEVICE inline __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & __group, char * __dest_char, char const * __src_char, _CUDA_VSTD::size_t __size, uint32_t __allowed_completions, uint64_t* __bar_handle) { +#if __cccl_ptx_isa >= 800 NV_IF_TARGET(NV_PROVIDES_SM_90, ( const bool __can_use_complete_tx = __allowed_completions & uint32_t(__completion_mechanism::__mbarrier_complete_tx); _LIBCUDACXX_DEBUG_ASSERT(__can_use_complete_tx == (nullptr != __bar_handle), "Pass non-null bar_handle if and only if can_use_complete_tx."); @@ -1094,6 +1123,7 @@ __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & _ } // Fallthrough to SM 80.. )); +#endif // __cccl_ptx_isa >= 800 NV_IF_TARGET(NV_PROVIDES_SM_80, ( if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (_Align >= 4) { diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h index c16965d23b..6936733858 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -26,7 +26,6 @@ #include // __CUDA_MINIMUM_ARCH__ and friends -#include "../__cuda/ptx/ptx_isa_target_macros.h" #include "../__cuda/ptx/ptx_dot_variants.h" #include "../__cuda/ptx/ptx_helper_functions.h" #include "../__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h" diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index dbd97e2b63..0666be92e6 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -26,7 +26,6 @@ #include "ptx_dot_variants.h" #include "ptx_helper_functions.h" -#include "ptx_isa_target_macros.h" #include "../../cstdint" _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/version b/libcudacxx/include/cuda/std/detail/libcxx/include/version index c6d15ae72f..61be53c671 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/version +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/version @@ -223,16 +223,6 @@ __cpp_lib_void_t 201411L #endif #endif // __cuda_std__ -// We define certain feature test macros depending on availability. When -// __CUDA_MINIMUM_ARCH__ is not available, we define the following features -// unconditionally. 
This permits checking for the feature in host code. When -// __CUDA_MINIMUM_ARCH__ is available, we only enable the feature when the -// hardware supports it. -#if (!defined(__CUDA_MINIMUM_ARCH__)) || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) -# define __cccl_lib_local_barrier_arrive_tx -# define __cccl_lib_experimental_ctk12_cp_async_exposure -#endif - // We unconditionally define `__cccl_lib_meow` so that there is only one place to set the value #if _CCCL_STD_VER > 2011 # define __cccl_lib_chrono_udls 201304L diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp index 8f82188e41..69ed6bb41c 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cta.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cta.pass.cpp index 0d2f5ca841..55038a41f1 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cta.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cta.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp index bcf6dcf622..eded8d4e2a 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_feature_test.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_feature_test.pass.cpp index 1a40fcb78a..8729995fef 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_feature_test.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_feature_test.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp index b31278ead5..58b84d97de 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp @@ -28,9 +28,9 @@ int main(int, char**){ // barrier_arrive_tx should fail on SM70 and SM80, because it is hidden. 
auto token = cuda::device::barrier_arrive_tx(bar, 1, 0); -#if defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__ +#ifdef __cccl_lib_local_barrier_arrive_tx static_assert(false, "Fail manually for SM90 and up."); -#endif // __CUDA_MINIMUM_ARCH__ +#endif // __cccl_lib_local_barrier_arrive_tx )); return 0; } diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_thread.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_thread.pass.cpp index a6ac3803bd..8dcb6645a3 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_thread.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_thread.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_warp.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_warp.pass.cpp index 4cc310d958..14aa35116a 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_warp.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_warp.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp index d32aea8572..db061cb4f6 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_feature_test.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_feature_test.pass.cpp index e98ee5e6aa..72b0667fbf 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_feature_test.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_feature_test.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_ptx_compiles.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_ptx_compiles.pass.cpp index eb4e9b876d..8b77ee210b 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_ptx_compiles.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_ptx_compiles.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor.pass.cpp index fb58b58315..7d68e887de 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) 
diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp index 6d890edc9b..a289c71c2d 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: c++11 // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp index e394515666..26e04f8d96 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: c++11 // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp index b10c2bfc26..88f45093c4 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: c++11 // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp index 8e7886db6a..0ba064cf93 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: c++11 // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp index 8be14c21c1..af1f27454d 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: c++11 // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h index 8d29fa6df1..6ac6e40db1 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h @@ -7,9 +7,6 @@ // SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
// //===----------------------------------------------------------------------===// -// -// UNSUPPORTED: libcpp-has-no-threads -// UNSUPPORTED: pre-sm-90 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_cta.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_cta.pass.cpp index 771eb6fde7..57b871e424 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_cta.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_cta.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp index 6da34c14af..c2e36ce997 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_thread.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_thread.pass.cpp index 5ec98c26ee..9a347e0e7d 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_thread.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_thread.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_warp.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_warp.pass.cpp index bf626a902b..b917e9e036 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_warp.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_warp.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp index 6614bfb51f..bd82a594ac 100644 --- a/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 //
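For reference, a minimal usage sketch of the feature-test macro introduced in __cccl/ptx_isa.h above: device code can guard the sm_90 transaction-count APIs on __cccl_lib_local_barrier_arrive_tx instead of inspecting __CUDA_MINIMUM_ARCH__ directly. This sketch is not part of the patch; the kernel name, buffer size, and the assumption of a 16-byte-aligned global source pointer are illustrative only.

#include <cuda/barrier>
#include <cuda/std/utility> // cuda::std::move

// Hypothetical demo kernel (not part of this patch).
__global__ void tx_copy_demo(const int* gmem) // assumed global and 16-byte aligned
{
#ifdef __cccl_lib_local_barrier_arrive_tx
  using barrier_t = cuda::barrier<cuda::thread_scope_block>;
  __shared__ alignas(16) int smem[128];
  __shared__ barrier_t bar;
  if (threadIdx.x == 0) {
    init(&bar, blockDim.x);
  }
  __syncthreads();

  if (threadIdx.x == 0) {
    // Kick off the bulk copy and tell the barrier how many bytes to expect.
    cuda::device::memcpy_async_tx(smem, gmem, cuda::aligned_size_t<16>(sizeof(smem)), bar);
    cuda::device::barrier_expect_tx(bar, sizeof(smem));
  }
  // Every thread arrives once; the phase completes when the copy has landed.
  auto token = bar.arrive(1);
  bar.wait(cuda::std::move(token));
#endif // __cccl_lib_local_barrier_arrive_tx
}

When the macro is not defined (host-only translation units on older toolkits, or device passes below sm_90), the guarded body compiles away. If pre-sm_90 device code does reach these functions, the NV_IF_ELSE_TARGET fallbacks added in this diff call the undefined __cuda_ptx_*_is_not_supported_before_SM_90__ stubs, turning the mistake into a device-link error rather than invalid PTX.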