From e11885c41d35e0a48d91abf07d200c400bdf0cfa Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Wed, 21 Feb 2024 00:50:43 -0800 Subject: [PATCH 1/6] Fix ptx usage to account for PTX ISA availability (#1359) Currently we guard these instructions based only on the available architecture. However, it is also valid to compile with an old toolkit for a new machine, in which case the architecture check passes even though the toolkit cannot emit the required PTX. Consequently, we need to strengthen our checks to also account for the available PTX ISA version; a sketch of the resulting guard pattern follows the test updates below. --- libcudacxx/include/cuda/barrier | 4 +- .../ptx_isa.h} | 68 ++++++++---- .../std/detail/libcxx/include/__cccl_config | 2 + .../detail/libcxx/include/__cuda/barrier.h | 104 +++++++++++------- .../std/detail/libcxx/include/__cuda/ptx.h | 1 - ..._and_communication_instructions_mbarrier.h | 9 +- .../cuda/std/detail/libcxx/include/version | 26 ++--- .../barrier/arrive_tx_cluster.runfail.cpp | 1 + .../cuda/barrier/arrive_tx_cta.pass.cpp | 1 + .../cuda/barrier/arrive_tx_device.runfail.cpp | 1 + .../barrier/arrive_tx_feature_test.pass.cpp | 1 + ...arrive_tx_static_asserts_pre_sm90.fail.cpp | 4 +- .../cuda/barrier/arrive_tx_thread.pass.cpp | 1 + .../cuda/barrier/arrive_tx_warp.pass.cpp | 1 + .../cuda/barrier/cp_async_bulk.pass.cpp | 1 + .../cp_async_bulk_feature_test.pass.cpp | 1 + .../cp_async_bulk_ptx_compiles.pass.cpp | 1 + .../barrier/cp_async_bulk_tensor.pass.cpp | 1 + .../barrier/cp_async_bulk_tensor_1d.pass.cpp | 1 + .../barrier/cp_async_bulk_tensor_2d.pass.cpp | 1 + .../barrier/cp_async_bulk_tensor_3d.pass.cpp | 1 + .../barrier/cp_async_bulk_tensor_4d.pass.cpp | 1 + .../barrier/cp_async_bulk_tensor_5d.pass.cpp | 1 + .../barrier/cp_async_bulk_tensor_generic.h | 3 - .../cuda/barrier/expect_tx_cta.pass.cpp | 1 + .../cuda/barrier/expect_tx_device.runfail.cpp | 1 + .../cuda/barrier/expect_tx_thread.pass.cpp | 1 + .../cuda/barrier/expect_tx_warp.pass.cpp | 1 + .../memcpy_async/memcpy_async_tx.pass.cpp | 1 + 29 files changed, 152 insertions(+), 89 deletions(-) rename libcudacxx/include/cuda/std/detail/libcxx/include/{__cuda/ptx/ptx_isa_target_macros.h => __cccl/ptx_isa.h} (58%) diff --git a/libcudacxx/include/cuda/barrier b/libcudacxx/include/cuda/barrier index 15182816c0..3c21d0c9b1 100644 --- a/libcudacxx/include/cuda/barrier +++ b/libcudacxx/include/cuda/barrier @@ -40,7 +40,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL // capability 9.0 and above. The check for (!defined(__CUDA_MINIMUM_ARCH__)) is // necessary to prevent cudafe from ripping out the device functions before // device compilation begins.
-#if (!defined(__CUDA_MINIMUM_ARCH__)) || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) +#ifdef __cccl_lib_experimental_ctk12_cp_async_exposure // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk inline _LIBCUDACXX_DEVICE @@ -278,7 +278,7 @@ void cp_async_bulk_wait_group_read() : "memory"); } -#endif // __CUDA_MINIMUM_ARCH__ +#endif // __cccl_lib_experimental_ctk12_cp_async_exposure _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h similarity index 58% rename from libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h rename to libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h index 5dcccc5d1f..d612c9e4d1 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h @@ -1,19 +1,15 @@ -// -*- C++ -*- //===----------------------------------------------------------------------===// // // Part of libcu++, the C++ Standard Library for your entire system, // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// - -#ifndef _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_ -#define _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_ - -#include // __CUDA_MINIMUM_ARCH__ and friends +#ifndef __CCCL_PTX_ISA_H_ +#define __CCCL_PTX_ISA_H_ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header @@ -23,6 +19,8 @@ # pragma system_header #endif // no system header +#include // __CUDA_MINIMUM_ARCH__ and friends + /* * Targeting macros * @@ -31,47 +29,75 @@ */ // PTX ISA 8.3 is available from CUDA 12.3, driver r545 -#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__)) +// The first define is for future major versions of CUDACC. +// We make sure that these get the highest known PTX ISA version. 
+#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ > 12)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 830ULL +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 830ULL // PTX ISA 8.2 is available from CUDA 12.2, driver r535 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 820ULL // PTX ISA 8.1 is available from CUDA 12.1, driver r530 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 810ULL // PTX ISA 8.0 is available from CUDA 12.0, driver r525 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 800ULL // PTX ISA 7.8 is available from CUDA 11.8, driver r520 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 780ULL // PTX ISA 7.7 is available from CUDA 11.7, driver r515 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 7)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 7)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 770ULL // PTX ISA 7.6 is available from CUDA 11.6, driver r510 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 6)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 6)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 760ULL // PTX ISA 7.5 is available from CUDA 11.5, driver r495 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 5)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 5)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 750ULL // PTX ISA 7.4 is available from CUDA 11.4, driver r470 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 4)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 4)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 740ULL // PTX ISA 7.3 is available from CUDA 11.3, driver r465 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 3)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define 
__cccl_ptx_isa 730ULL // PTX ISA 7.2 is available from CUDA 11.2, driver r460 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 2)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 720ULL // PTX ISA 7.1 is available from CUDA 11.1, driver r455 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 1)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 710ULL // PTX ISA 7.0 is available from CUDA 11.0, driver r445 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 0)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 700ULL // Fallback case. Define the ISA version to be zero. This ensures that the macro is always defined. #else # define __cccl_ptx_isa 0ULL #endif -#endif // _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_ +// We define certain feature test macros depending on availability. When +// __CUDA_MINIMUM_ARCH__ is not available, we define the following features +// depending on PTX ISA. This permits checking for the feature in host code. +// When __CUDA_MINIMUM_ARCH__ is available, we only enable the feature when the +// hardware supports it. +#if (!defined(__CUDA_MINIMUM_ARCH__)) \ + || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) && __cccl_ptx_isa >= 800 +# define __cccl_lib_local_barrier_arrive_tx +# define __cccl_lib_experimental_ctk12_cp_async_exposure +#endif + +#endif // __CCCL_PTX_ISA_H_ diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl_config b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl_config index afdabbbb11..c1d2a381a5 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl_config +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl_config @@ -187,6 +187,8 @@ # define _CCCL_NV_DIAG_DEFAULT(_WARNING) #endif // other compilers +#include "__cccl/ptx_isa.h" +#include "__cccl/version.h" #include "__cccl/visibility.h" #endif // __CCCL_CONFIG diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h index d73d1c2dbb..d5015594f6 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h @@ -572,11 +572,8 @@ inline _CUDA_VSTD::uint64_t * barrier_native_handle(barrier #if defined(_CCCL_CUDA_COMPILER) -// Hide arrive_tx when CUDA architecture is insufficient. Note the -// (!defined(__CUDA_MINIMUM_ARCH__)). This is required to make sure the function -// does not get removed by cudafe, which does not define __CUDA_MINIMUM_ARCH__. 
-#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) - +#if __cccl_ptx_isa >= 800 +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__(); _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_DEVICE inline barrier::arrival_token barrier_arrive_tx( barrier & __b, @@ -591,7 +588,7 @@ barrier::arrival_token barrier_arrive_tx( _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1."); barrier::arrival_token __token = {}; - NV_IF_TARGET( + NV_IF_ELSE_TARGET( // On architectures pre-sm90, arrive_tx is not supported. NV_PROVIDES_SM_90, ( // We do not check for the statespace of the barrier here. This is @@ -619,11 +616,47 @@ barrier::arrival_token barrier_arrive_tx( _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __native_handle, __arrive_count_update ); } + ),( + __cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__(); ) ); return __token; } +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_barrier_expect_tx_is_not_supported_before_SM_90__(); +_LIBCUDACXX_DEVICE inline +void barrier_expect_tx( + barrier & __b, + _CUDA_VSTD::ptrdiff_t __transaction_count_update) { + + _LIBCUDACXX_DEBUG_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); + _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative."); + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object + _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1."); + + // We do not check for the statespace of the barrier here. This is + // on purpose. This allows debugging tools like memcheck/racecheck + // to detect that we are passing a pointer with the wrong state + // space to mbarrier.arrive. If we checked for the state space here, + // and __trap() if wrong, then those tools would not be able to help + // us in release builds. In debug builds, the error would be caught + // by the asserts at the top of this function. + NV_IF_ELSE_TARGET( + // On architectures pre-sm90, expect_tx is not supported. + NV_PROVIDES_SM_90, ( + auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b)); + asm ( + "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) + : "memory"); + ),( + __cuda_ptx_barrier_expect_tx_is_not_supported_before_SM_90__(); + )); +} + +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_memcpy_async_tx_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx( _Tp* __dest, @@ -643,6 +676,7 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx( _LIBCUDACXX_DEBUG_ASSERT(__isShared(__dest), "dest must point to shared memory."); _LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "src must point to global memory."); + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b)); if (__isShared(__dest) && __isGlobal(__src)) { asm volatile( @@ -660,36 +694,13 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx( // is not yet implemented. So we trap in this case as well.
_LIBCUDACXX_UNREACHABLE(); } + ),( + __cuda_ptx_memcpy_async_tx_is_not_supported_before_SM_90__(); + )); return async_contract_fulfillment::async; } - -_LIBCUDACXX_DEVICE inline -void barrier_expect_tx( - barrier & __b, - _CUDA_VSTD::ptrdiff_t __transaction_count_update) { - - _LIBCUDACXX_DEBUG_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); - _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative."); - // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object - _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1."); - - // We do not check for the statespace of the barrier here. This is - // on purpose. This allows debugging tools like memcheck/racecheck - // to detect that we are passing a pointer with the wrong state - // space to mbarrier.arrive. If we checked for the state space here, - // and __trap() if wrong, then those tools would not be able to help - // us in release builds. In debug builds, the error would be caught - // by the asserts at the top of this function. - auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b)); - asm ( - "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" - : - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) - : "memory"); -} -#endif // __CUDA_MINIMUM_ARCH__ +#endif // __cccl_ptx_isa >= 800 #endif // _CCCL_CUDA_COMPILER _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE @@ -763,6 +774,7 @@ _CUDA_VSTD::uint64_t * __try_get_barrier_handle<::cuda::thread_scope_block, _CUD // The user is still responsible for arriving and waiting on (or otherwise // synchronizing with) the barrier or pipeline barrier to see the results of // copies from other threads participating in the synchronization object. +extern "C" _LIBCUDACXX_HOST_DEVICE void __cuda_ptx_mbarrier_complete_tx_is_not_supported_before_SM_90__(); struct __memcpy_completion_impl { template @@ -796,13 +808,17 @@ struct __memcpy_completion_impl { // bulk group to be used with shared memory barriers. _LIBCUDACXX_UNREACHABLE(); case __completion_mechanism::__mbarrier_complete_tx: +#if __cccl_ptx_isa >= 800 // Pre-sm90, the mbarrier_complete_tx completion mechanism is not available. NV_IF_TARGET(NV_PROVIDES_SM_90, ( // Only perform the expect_tx operation with the leader thread if (__group.thread_rank() == 0) { ::cuda::device::barrier_expect_tx(__barrier, __size); } + ),( + __cuda_ptx_mbarrier_complete_tx_is_not_supported_before_SM_90__(); )); +#endif // __cccl_ptx_isa >= 800 return async_contract_fulfillment::async; case __completion_mechanism::__sync: // sync: In this case, we do not need to do anything. The user will have @@ -929,11 +945,13 @@ struct __memcpy_completion_impl { * 5. 
normal synchronous copy (fallback) ***********************************************************************/ -#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) +#if __cccl_ptx_isa >= 800 +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__(); template inline __device__ void __cp_async_bulk_shared_global(const _Group &__g, char * __dest, const char * __src, size_t __size, uint64_t *__bar_handle) { // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( if (__g.thread_rank() == 0) { asm volatile( "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n" @@ -944,10 +962,13 @@ void __cp_async_bulk_shared_global(const _Group &__g, char * __dest, const char "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__bar_handle))) : "memory"); } + ),( + __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__(); + )); } -#endif // __CUDA_MINIMUM_ARCH__ +#endif // __cccl_ptx_isa >= 800 -#if (defined(__CUDA_MINIMUM_ARCH__) && 800 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__(); template inline __device__ void __cp_async_shared_global(char * __dest, const char * __src) { @@ -959,6 +980,7 @@ void __cp_async_shared_global(char * __dest, const char * __src) { static_assert(_Copy_size == 4 || _Copy_size == 8 || _Copy_size == 16, "cp.async.shared.global requires a copy size of 4, 8, or 16."); #endif // _LIBCUDACXX_STD_VER >= 17 + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( asm volatile( "cp.async.ca.shared.global [%0], [%1], %2, %2;" : @@ -966,6 +988,9 @@ void __cp_async_shared_global(char * __dest, const char * __src) { "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))), "n"(_Copy_size) : "memory"); + ),( + __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__(); + )); } template <> @@ -973,6 +998,7 @@ inline __device__ void __cp_async_shared_global<16>(char * __dest, const char * __src) { // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async // When copying 16 bytes, it is possible to skip L1 cache (.cg). 
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( asm volatile( "cp.async.cg.shared.global [%0], [%1], %2, %2;" : @@ -980,6 +1006,9 @@ void __cp_async_shared_global<16>(char * __dest, const char * __src) { "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))), "n"(16) : "memory"); + ),( + __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__(); + )); } template @@ -1002,7 +1031,6 @@ void __cp_async_shared_global_mechanism(_Group __g, char * __dest, const char * __cp_async_shared_global<__copy_size>(__dest + __offset, __src + __offset); } } -#endif // __CUDA_MINIMUM_ARCH__ template struct __copy_chunk { @@ -1083,6 +1111,7 @@ __completion_mechanism __dispatch_memcpy_async_any_to_any(_Group const & __group template<_CUDA_VSTD::size_t _Align, typename _Group> _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_DEVICE inline __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & __group, char * __dest_char, char const * __src_char, _CUDA_VSTD::size_t __size, uint32_t __allowed_completions, uint64_t* __bar_handle) { +#if __cccl_ptx_isa >= 800 NV_IF_TARGET(NV_PROVIDES_SM_90, ( const bool __can_use_complete_tx = __allowed_completions & uint32_t(__completion_mechanism::__mbarrier_complete_tx); _LIBCUDACXX_DEBUG_ASSERT(__can_use_complete_tx == (nullptr != __bar_handle), "Pass non-null bar_handle if and only if can_use_complete_tx."); @@ -1094,6 +1123,7 @@ __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & _ } // Fallthrough to SM 80.. )); +#endif // __cccl_ptx_isa >= 800 NV_IF_TARGET(NV_PROVIDES_SM_80, ( if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (_Align >= 4) { diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h index 61d904fb5f..20ffafde10 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -26,7 +26,6 @@ #include // __CUDA_MINIMUM_ARCH__ and friends -#include "../__cuda/ptx/ptx_isa_target_macros.h" #include "../__cuda/ptx/ptx_dot_variants.h" #include "../__cuda/ptx/ptx_helper_functions.h" #include "../__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h" diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index 6b42a15128..6f7f1d7358 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -12,11 +12,6 @@ #ifndef _CUDA_PTX_PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ #define _CUDA_PTX_PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ -#include "ptx_dot_variants.h" -#include "ptx_helper_functions.h" -#include "ptx_isa_target_macros.h" -#include "../../cstdint" - #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -25,6 +20,10 @@ # pragma system_header #endif // no system header +#include "ptx_dot_variants.h" +#include "ptx_helper_functions.h" +#include "../../cstdint" + _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX /* diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/version 
b/libcudacxx/include/cuda/std/detail/libcxx/include/version index 449993d7ef..721044b9f1 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/version +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/version @@ -203,6 +203,14 @@ __cpp_lib_void_t 201411L #include <__config> #endif // __cuda_std__ +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + // If we are replacing the stl we need to define our own feature test macros. // However, if we are in __cuda_std__ we need to define our own symbols to not // conflict with the host stl. @@ -215,24 +223,6 @@ __cpp_lib_void_t 201411L #endif #endif // __cuda_std__ -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -// We define certain feature test macros depending on availability. When -// __CUDA_MINIMUM_ARCH__ is not available, we define the following features -// unconditionally. This permits checking for the feature in host code. When -// __CUDA_MINIMUM_ARCH__ is available, we only enable the feature when the -// hardware supports it. -#if (!defined(__CUDA_MINIMUM_ARCH__)) || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) -# define __cccl_lib_local_barrier_arrive_tx -# define __cccl_lib_experimental_ctk12_cp_async_exposure -#endif - // We unconditionally define `__cccl_lib_meow` so that there is only one place to set the value #if _LIBCUDACXX_STD_VER > 11 # define __cccl_lib_chrono_udls 201304L diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp index 8f82188e41..69ed6bb41c 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cta.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cta.pass.cpp index 0d2f5ca841..55038a41f1 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cta.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cta.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp index bcf6dcf622..eded8d4e2a 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_feature_test.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_feature_test.pass.cpp index 1a40fcb78a..8729995fef 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_feature_test.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_feature_test.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: 
libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp index b31278ead5..58b84d97de 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp @@ -28,9 +28,9 @@ int main(int, char**){ // barrier_arrive_tx should fail on SM70 and SM80, because it is hidden. auto token = cuda::device::barrier_arrive_tx(bar, 1, 0); -#if defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__ +#ifdef __cccl_lib_local_barrier_arrive_tx static_assert(false, "Fail manually for SM90 and up."); -#endif // __CUDA_MINIMUM_ARCH__ +#endif // __cccl_lib_local_barrier_arrive_tx )); return 0; } diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_thread.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_thread.pass.cpp index a6ac3803bd..8dcb6645a3 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_thread.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_thread.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_warp.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_warp.pass.cpp index 4cc310d958..14aa35116a 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_warp.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_warp.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp index d32aea8572..db061cb4f6 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_feature_test.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_feature_test.pass.cpp index e98ee5e6aa..72b0667fbf 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_feature_test.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_feature_test.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_ptx_compiles.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_ptx_compiles.pass.cpp index eb4e9b876d..8b77ee210b 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_ptx_compiles.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_ptx_compiles.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor.pass.cpp index fb58b58315..7d68e887de 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: 
libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp index 6d890edc9b..a289c71c2d 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: c++11 // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp index e394515666..26e04f8d96 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: c++11 // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp index b10c2bfc26..88f45093c4 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: c++11 // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp index 8e7886db6a..0ba064cf93 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: c++11 // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp index 8be14c21c1..af1f27454d 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: c++11 // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) 
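The pattern applied above to every guarded instruction has two independent gates: `#if __cccl_ptx_isa >= 800` asks whether the toolkit can assemble the PTX at all, and `NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, ...)` asks whether the target architecture may execute it; the `extern "C"` declaration of a deliberately undefined symbol turns any call that survives to an unsupported architecture into a link-time error instead of letting cudafe silently drop the function. A minimal sketch of the pattern, condensed from the barrier.h changes above — `example_op` and its inline PTX are hypothetical placeholders, not part of this patch:

#if __cccl_ptx_isa >= 800 // gate 1: the toolkit can emit the required PTX
extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_example_op_is_not_supported_before_SM_90__();

_LIBCUDACXX_DEVICE inline void example_op()
{
  NV_IF_ELSE_TARGET(
    // gate 2: the target architecture can execute the instruction
    NV_PROVIDES_SM_90, (
      asm volatile("// hypothetical SM-90 instruction" ::: "memory");
    ),(
      // Reaching this on pre-SM-90 fails at link time via the undefined symbol,
      // which is far easier to diagnose than silently removed device code.
      __cuda_ptx_example_op_is_not_supported_before_SM_90__();
    ));
}
#endif // __cccl_ptx_isa >= 800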
diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h index 8d29fa6df1..6ac6e40db1 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h @@ -7,9 +7,6 @@ // SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// -// -// UNSUPPORTED: libcpp-has-no-threads -// UNSUPPORTED: pre-sm-90 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_cta.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_cta.pass.cpp index 771eb6fde7..57b871e424 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_cta.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_cta.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp index 6da34c14af..c2e36ce997 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_thread.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_thread.pass.cpp index 5ec98c26ee..9a347e0e7d 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_thread.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_thread.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_warp.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_warp.pass.cpp index bf626a902b..b917e9e036 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_warp.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_warp.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp index 6614bfb51f..bd82a594ac 100644 --- a/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // From a4234b24c245c06e19e9dec0bff3268dbfe50bc7 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Wed, 21 Feb 2024 11:53:39 +0100 Subject: [PATCH 2/6] Do not use VLAs in `cp_async_bulk_tensor_*` tests VLAs are a compiler extension and are correctly errored out by some compilers. 
As we always know the exact size of the array anyway, just switch to a `cuda::std::array` Fixes nvbug4476664 --- .../barrier/cp_async_bulk_tensor_1d.pass.cpp | 19 ++-- .../barrier/cp_async_bulk_tensor_2d.pass.cpp | 19 ++-- .../barrier/cp_async_bulk_tensor_3d.pass.cpp | 19 ++-- .../barrier/cp_async_bulk_tensor_4d.pass.cpp | 19 ++-- .../barrier/cp_async_bulk_tensor_5d.pass.cpp | 19 ++-- .../barrier/cp_async_bulk_tensor_generic.h | 90 +++++++++---------- 6 files changed, 96 insertions(+), 89 deletions(-) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp index a289c71c2d..dad966ded7 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp @@ -17,6 +17,9 @@ // +#include +#include + #include "cp_async_bulk_tensor_generic.h" // Define the size of contiguous tensor in global and shared memory. @@ -26,21 +29,20 @@ // offset. // // We have a separate variable for host and device because a constexpr -// std::initializer_list cannot be shared between host and device as some of its +// cuda::std::array cannot be shared between host and device as some of its // member functions take a const reference, which is unsupported by nvcc. - constexpr std::initializer_list GMEM_DIMS {256}; -__device__ constexpr std::initializer_list GMEM_DIMS_DEV{256}; - constexpr std::initializer_list SMEM_DIMS {32}; -__device__ constexpr std::initializer_list SMEM_DIMS_DEV{32}; + constexpr cuda::std::array GMEM_DIMS {256}; +__device__ constexpr cuda::std::array GMEM_DIMS_DEV{256}; + constexpr cuda::std::array SMEM_DIMS {32}; +__device__ constexpr cuda::std::array SMEM_DIMS_DEV{32}; -__device__ constexpr std::initializer_list TEST_SMEM_COORDS[] = { +__device__ constexpr cuda::std::array TEST_SMEM_COORDS[] = { {0}, {4}, {8} }; constexpr size_t gmem_len = tensor_len(GMEM_DIMS); -constexpr size_t smem_len = tensor_len(SMEM_DIMS); __device__ int gmem_tensor[gmem_len]; @@ -54,9 +56,10 @@ int main(int, char**) ), NV_IS_DEVICE, ( for (auto smem_coord : TEST_SMEM_COORDS) { - test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); + test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); } ) ); + (void)SMEM_DIMS; return 0; } diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp index 26e04f8d96..4d55b4fbbc 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp @@ -17,6 +17,9 @@ // +#include +#include + #include "cp_async_bulk_tensor_generic.h" // Define the size of contiguous tensor in global and shared memory. @@ -26,14 +29,14 @@ // offset. // // We have a separate variable for host and device because a constexpr -// std::initializer_list cannot be shared between host and device as some of its +// cuda::std::array cannot be shared between host and device as some of its // member functions take a const reference, which is unsupported by nvcc.
- constexpr std::initializer_list GMEM_DIMS {8, 11}; -__device__ constexpr std::initializer_list GMEM_DIMS_DEV{8, 11}; - constexpr std::initializer_list SMEM_DIMS {4, 2}; -__device__ constexpr std::initializer_list SMEM_DIMS_DEV{4, 2}; + constexpr cuda::std::array GMEM_DIMS {8, 11}; +__device__ constexpr cuda::std::array GMEM_DIMS_DEV{8, 11}; + constexpr cuda::std::array SMEM_DIMS {4, 2}; +__device__ constexpr cuda::std::array SMEM_DIMS_DEV{4, 2}; -__device__ constexpr std::initializer_list TEST_SMEM_COORDS[] = { +__device__ constexpr cuda::std::array TEST_SMEM_COORDS[] = { {0, 0}, {4, 1}, {4, 5}, @@ -41,7 +44,6 @@ __device__ constexpr std::initializer_list TEST_SMEM_COORDS[] = { }; constexpr size_t gmem_len = tensor_len(GMEM_DIMS); -constexpr size_t smem_len = tensor_len(SMEM_DIMS); __device__ int gmem_tensor[gmem_len]; @@ -55,9 +57,10 @@ int main(int, char**) ), NV_IS_DEVICE, ( for (auto smem_coord : TEST_SMEM_COORDS) { - test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); + test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); } ) ); + (void)SMEM_DIMS; return 0; } diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp index 88f45093c4..051486be45 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp @@ -17,6 +17,9 @@ // +#include +#include + #include "cp_async_bulk_tensor_generic.h" // Define the size of contiguous tensor in global and shared memory. @@ -26,21 +29,20 @@ // offset. // // We have a separate variable for host and device because a constexpr -// std::initializer_list cannot be shared between host and device as some of its +// cuda::std::array cannot be shared between host and device as some of its // member functions take a const reference, which is unsupported by nvcc. - constexpr std::initializer_list GMEM_DIMS {8, 11, 13}; -__device__ constexpr std::initializer_list GMEM_DIMS_DEV{8, 11, 13}; - constexpr std::initializer_list SMEM_DIMS {4, 2, 4}; -__device__ constexpr std::initializer_list SMEM_DIMS_DEV{4, 2, 4}; + constexpr cuda::std::array GMEM_DIMS {8, 11, 13}; +__device__ constexpr cuda::std::array GMEM_DIMS_DEV{8, 11, 13}; + constexpr cuda::std::array SMEM_DIMS {4, 2, 4}; +__device__ constexpr cuda::std::array SMEM_DIMS_DEV{4, 2, 4}; -__device__ constexpr std::initializer_list TEST_SMEM_COORDS[] = { +__device__ constexpr cuda::std::array TEST_SMEM_COORDS[] = { {0, 0, 0}, {4, 1, 3}, {4, 5, 1} }; constexpr size_t gmem_len = tensor_len(GMEM_DIMS); -constexpr size_t smem_len = tensor_len(SMEM_DIMS); __device__ int gmem_tensor[gmem_len]; @@ -54,9 +56,10 @@ int main(int, char**) ), NV_IS_DEVICE, ( for (auto smem_coord : TEST_SMEM_COORDS) { - test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); + test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); } ) ); + (void)SMEM_DIMS; return 0; } diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp index 0ba064cf93..78f8da4a87 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp @@ -17,6 +17,9 @@ // +#include +#include + #include "cp_async_bulk_tensor_generic.h" // Define the size of contiguous tensor in global and shared memory. 
@@ -26,14 +29,14 @@ // offset. // // We have a separate variable for host and device because a constexpr -// std::initializer_list cannot be shared between host and device as some of its +// cuda::std::array cannot be shared between host and device as some of its // member functions take a const reference, which is unsupported by nvcc. - constexpr std::initializer_list GMEM_DIMS {8, 11, 13, 3}; -__device__ constexpr std::initializer_list GMEM_DIMS_DEV{8, 11, 13, 3}; - constexpr std::initializer_list SMEM_DIMS {4, 2, 4, 1}; -__device__ constexpr std::initializer_list SMEM_DIMS_DEV{4, 2, 4, 1}; + constexpr cuda::std::array GMEM_DIMS {8, 11, 13, 3}; +__device__ constexpr cuda::std::array GMEM_DIMS_DEV{8, 11, 13, 3}; + constexpr cuda::std::array SMEM_DIMS {4, 2, 4, 1}; +__device__ constexpr cuda::std::array SMEM_DIMS_DEV{4, 2, 4, 1}; -__device__ constexpr std::initializer_list TEST_SMEM_COORDS[] = { +__device__ constexpr cuda::std::array TEST_SMEM_COORDS[] = { {0, 0, 0, 0}, {4, 1, 3, 0}, {4, 8, 7, 2}, @@ -41,7 +44,6 @@ __device__ constexpr std::initializer_list TEST_SMEM_COORDS[] = { }; constexpr size_t gmem_len = tensor_len(GMEM_DIMS); -constexpr size_t smem_len = tensor_len(SMEM_DIMS); __device__ int gmem_tensor[gmem_len]; @@ -55,9 +57,10 @@ int main(int, char**) ), NV_IS_DEVICE, ( for (auto smem_coord : TEST_SMEM_COORDS) { - test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); + test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); } ) ); + (void)SMEM_DIMS; return 0; } diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp index af1f27454d..5069803d2a 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp @@ -17,6 +17,9 @@ // +#include +#include + #include "cp_async_bulk_tensor_generic.h" // Define the size of contiguous tensor in global and shared memory. @@ -26,21 +29,20 @@ // offset. // // We have a separate variable for host and device because a constexpr -// std::initializer_list cannot be shared between host and device as some of its +// cuda::std::array cannot be shared between host and device as some of its // member functions take a const reference, which is unsupported by nvcc. 
- constexpr std::initializer_list GMEM_DIMS {8, 11, 13, 3, 3}; -__device__ constexpr std::initializer_list GMEM_DIMS_DEV{8, 11, 13, 3, 3}; - constexpr std::initializer_list SMEM_DIMS {4, 2, 4, 1, 1}; -__device__ constexpr std::initializer_list SMEM_DIMS_DEV{4, 2, 4, 1, 1}; + constexpr cuda::std::array GMEM_DIMS {8, 11, 13, 3, 3}; +__device__ constexpr cuda::std::array GMEM_DIMS_DEV{8, 11, 13, 3, 3}; + constexpr cuda::std::array SMEM_DIMS {4, 2, 4, 1, 1}; +__device__ constexpr cuda::std::array SMEM_DIMS_DEV{4, 2, 4, 1, 1}; -__device__ constexpr std::initializer_list TEST_SMEM_COORDS[] = { +__device__ constexpr cuda::std::array TEST_SMEM_COORDS[] = { {0, 0, 0, 0, 0}, {4, 1, 3, 0, 1}, {4, 5, 1, 1, 2} }; constexpr size_t gmem_len = tensor_len(GMEM_DIMS); -constexpr size_t smem_len = tensor_len(SMEM_DIMS); __device__ int gmem_tensor[gmem_len]; @@ -54,9 +56,10 @@ int main(int, char**) ), NV_IS_DEVICE, ( for (auto smem_coord : TEST_SMEM_COORDS) { - test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); + test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); } ) ); + (void)SMEM_DIMS; return 0; } diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h index 6ac6e40db1..83be47f005 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h @@ -14,6 +14,7 @@ #define TEST_CP_ASYNC_BULK_TENSOR_GENERIC_H_ #include +#include #include // cuda::std::move #include "test_macros.h" // TEST_NV_DIAG_SUPPRESS @@ -37,24 +38,26 @@ namespace cde = cuda::device::experimental; */ // Compute the total number of elements in a tensor -constexpr __host__ __device__ int tensor_len(std::initializer_list dims) { - int len = 1; +template +constexpr __host__ __device__ int tensor_len(cuda::std::array dims) { + T len = 1; for (int d : dims) { len *= d; } - return len; + return static_cast(len); } // Function to convert: // a linear index into a shared memory tensor // into // a linear index into a global memory tensor. 
+template inline __device__ int smem_lin_idx_to_gmem_lin_idx( int smem_lin_idx, - std::initializer_list smem_coord, - std::initializer_list smem_dims, - std::initializer_list gmem_dims) { + cuda::std::array smem_coord, + cuda::std::array smem_dims, + cuda::std::array gmem_dims) { assert(smem_coord.size() == smem_dims.size()); assert(smem_coord.size() == gmem_dims.size()); @@ -70,38 +73,36 @@ int smem_lin_idx_to_gmem_lin_idx( return gmem_lin_idx; } +template __device__ inline void cp_tensor_global_to_shared( CUtensorMap* tensor_map, - std::initializer_list indices, + cuda::std::array indices, void *smem, barrier &bar) { - const int* idxs = indices.begin(); - switch (indices.size()) { - case 1: cde::cp_async_bulk_tensor_1d_global_to_shared(smem, tensor_map, idxs[0], bar); break; - case 2: cde::cp_async_bulk_tensor_2d_global_to_shared(smem, tensor_map, idxs[0], idxs[1], bar); break; - case 3: cde::cp_async_bulk_tensor_3d_global_to_shared(smem, tensor_map, idxs[0], idxs[1], idxs[2], bar); break; - case 4: cde::cp_async_bulk_tensor_4d_global_to_shared(smem, tensor_map, idxs[0], idxs[1], idxs[2], idxs[3], bar); break; - case 5: cde::cp_async_bulk_tensor_5d_global_to_shared(smem, tensor_map, idxs[0], idxs[1], idxs[2], idxs[3], idxs[4], bar); break; + case 1: cde::cp_async_bulk_tensor_1d_global_to_shared(smem, tensor_map, indices[0], bar); break; + case 2: cde::cp_async_bulk_tensor_2d_global_to_shared(smem, tensor_map, indices[0], indices[1], bar); break; + case 3: cde::cp_async_bulk_tensor_3d_global_to_shared(smem, tensor_map, indices[0], indices[1], indices[2], bar); break; + case 4: cde::cp_async_bulk_tensor_4d_global_to_shared(smem, tensor_map, indices[0], indices[1], indices[2], indices[3], bar); break; + case 5: cde::cp_async_bulk_tensor_5d_global_to_shared(smem, tensor_map, indices[0], indices[1], indices[2], indices[3], indices[4], bar); break; default: assert(false && "Wrong number of dimensions."); } } +template __device__ inline void cp_tensor_shared_to_global( CUtensorMap* tensor_map, - std::initializer_list indices, + cuda::std::array indices, void *smem) { - const int* idxs = indices.begin(); - switch (indices.size()) { - case 1: cde::cp_async_bulk_tensor_1d_shared_to_global(tensor_map, idxs[0], smem); break; - case 2: cde::cp_async_bulk_tensor_2d_shared_to_global(tensor_map, idxs[0], idxs[1], smem); break; - case 3: cde::cp_async_bulk_tensor_3d_shared_to_global(tensor_map, idxs[0], idxs[1], idxs[2], smem); break; - case 4: cde::cp_async_bulk_tensor_4d_shared_to_global(tensor_map, idxs[0], idxs[1], idxs[2], idxs[3], smem); break; - case 5: cde::cp_async_bulk_tensor_5d_shared_to_global(tensor_map, idxs[0], idxs[1], idxs[2], idxs[3], idxs[4], smem); break; + case 1: cde::cp_async_bulk_tensor_1d_shared_to_global(tensor_map, indices[0], smem); break; + case 2: cde::cp_async_bulk_tensor_2d_shared_to_global(tensor_map, indices[0], indices[1], smem); break; + case 3: cde::cp_async_bulk_tensor_3d_shared_to_global(tensor_map, indices[0], indices[1], indices[2], smem); break; + case 4: cde::cp_async_bulk_tensor_4d_shared_to_global(tensor_map, indices[0], indices[1], indices[2], indices[3], smem); break; + case 5: cde::cp_async_bulk_tensor_5d_shared_to_global(tensor_map, indices[0], indices[1], indices[2], indices[3], indices[4], smem); break; default: assert(false && "Wrong number of dimensions."); } @@ -129,10 +130,10 @@ __constant__ fake_cutensormap global_fake_tensor_map; * 5. It writes the tile back to global memory * 6. It checks that all the values in global are properly modified. 
*/ -template -__device__ void test(std::initializer_list smem_coord, - std::initializer_list smem_dims, - std::initializer_list gmem_dims, +template +__device__ void test(cuda::std::array smem_coord, + cuda::std::array smem_dims, + cuda::std::array gmem_dims, int* gmem_tensor, int gmem_len) { @@ -208,33 +209,24 @@ PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled() { #endif #ifndef TEST_COMPILER_NVRTC -template -CUtensorMap map_encode(T *tensor_ptr, std::initializer_list gmem_dims, std::initializer_list smem_dims) { +template +CUtensorMap map_encode(T *tensor_ptr, const cuda::std::array& gmem_dims, const cuda::std::array& smem_dims) { // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html CUtensorMap tensor_map{}; - assert(gmem_dims.size() == smem_dims.size()); - // rank is the number of dimensions of the array. - int rank = gmem_dims.size(); - uint64_t size[rank]; - for (int i = 0; i < rank; ++i) { - size[i] = gmem_dims.begin()[i]; - } + // The stride is the number of bytes to traverse from the first element of one row to the next. // It must be a multiple of 16. - uint64_t stride[rank - 1]; - int base_stride = sizeof(T); - for (int i = 0; i < rank - 1; ++i) { - base_stride *= gmem_dims.begin()[i]; + uint64_t stride[num_dims - 1]; + uint64_t base_stride = sizeof(T); + for (size_t i = 0; i < num_dims - 1; ++i) { + base_stride *= gmem_dims[i]; stride[i] = base_stride; } - // The box_size is the size of the shared memory buffer that is used as the - // destination of a TMA transfer. Casting from int -> uint32_t. - const uint32_t *box_size = reinterpret_cast(smem_dims.begin()); // The distance between elements in units of sizeof(element). A stride of 2 // can be used to load only the real component of a complex-valued tensor, for instance. - uint32_t elem_stride[rank]; // = {1, .., 1}; - for (int i = 0; i < rank; ++i) { + uint32_t elem_stride[num_dims]; // = {1, .., 1}; + for (size_t i = 0; i < num_dims; ++i) { elem_stride[i] = 1; } @@ -245,11 +237,11 @@ CUtensorMap map_encode(T *tensor_ptr, std::initializer_list gmem_dims, std: CUresult res = cuTensorMapEncodeTiled( &tensor_map, // CUtensorMap *tensorMap, CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_INT32, - rank, // cuuint32_t tensorRank, + num_dims, // cuuint32_t tensorRank, tensor_ptr, // void *globalAddress, - size, // const cuuint64_t *globalDim, + gmem_dims.data(), // const cuuint64_t *globalDim, stride, // const cuuint64_t *globalStrides, - box_size, // const cuuint32_t *boxDim, + smem_dims.data(), // const cuuint32_t *boxDim, elem_stride, // const cuuint32_t *elementStrides, CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE, CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_NONE, @@ -261,8 +253,8 @@ CUtensorMap map_encode(T *tensor_ptr, std::initializer_list gmem_dims, std: return tensor_map; } -template -void init_tensor_map(const T& gmem_tensor_symbol, std::initializer_list gmem_dims, std::initializer_list smem_dims) { +template +void init_tensor_map(const T& gmem_tensor_symbol, const cuda::std::array& gmem_dims, const cuda::std::array& smem_dims) { // Get pointer to gmem_tensor to create tensor map. 
int * tensor_ptr = nullptr; auto code = cudaGetSymbolAddress((void**)&tensor_ptr, gmem_tensor_symbol); From 868d9009891e27784b208f5fb4943cb5db7f69a9 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Wed, 21 Feb 2024 14:41:11 +0100 Subject: [PATCH 3/6] Use proper shared memory size Authored-by: Allard Hendriksen --- .../libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp | 4 ++-- .../libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp | 4 ++-- .../libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp | 4 ++-- .../libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp | 4 ++-- .../libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp | 4 ++-- .../libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp index dad966ded7..e6c3a13818 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp @@ -43,6 +43,7 @@ __device__ constexpr cuda::std::array TEST_SMEM_COORDS[] = { }; constexpr size_t gmem_len = tensor_len(GMEM_DIMS); +constexpr size_t smem_len = tensor_len(SMEM_DIMS); __device__ int gmem_tensor[gmem_len]; @@ -56,10 +57,9 @@ int main(int, char**) ), NV_IS_DEVICE, ( for (auto smem_coord : TEST_SMEM_COORDS) { - test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); + test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); } ) ); - (void)SMEM_DIMS; return 0; } diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp index 4d55b4fbbc..4f2e21b33e 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp @@ -44,6 +44,7 @@ __device__ constexpr cuda::std::array TEST_SMEM_COORDS[] = { }; constexpr size_t gmem_len = tensor_len(GMEM_DIMS); +constexpr size_t smem_len = tensor_len(SMEM_DIMS); __device__ int gmem_tensor[gmem_len]; @@ -57,10 +58,9 @@ int main(int, char**) ), NV_IS_DEVICE, ( for (auto smem_coord : TEST_SMEM_COORDS) { - test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); + test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); } ) ); - (void)SMEM_DIMS; return 0; } diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp index 051486be45..925f4084a5 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp @@ -43,6 +43,7 @@ __device__ constexpr cuda::std::array TEST_SMEM_COORDS[] = { }; constexpr size_t gmem_len = tensor_len(GMEM_DIMS); +constexpr size_t smem_len = tensor_len(SMEM_DIMS); __device__ int gmem_tensor[gmem_len]; @@ -56,10 +57,9 @@ int main(int, char**) ), NV_IS_DEVICE, ( for (auto smem_coord : TEST_SMEM_COORDS) { - test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); + test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); } ) ); - (void)SMEM_DIMS; return 0; } diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp 
diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp
index 78f8da4a87..60d820561f 100644
--- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp
@@ -44,6 +44,7 @@ __device__ constexpr cuda::std::array<uint32_t, 4> TEST_SMEM_COORDS[] = {
 };
 
 constexpr size_t gmem_len = tensor_len(GMEM_DIMS);
+constexpr size_t smem_len = tensor_len(SMEM_DIMS);
 
 __device__ int gmem_tensor[gmem_len];
 
@@ -57,10 +58,9 @@ int main(int, char**)
     ),
     NV_IS_DEVICE, (
       for (auto smem_coord : TEST_SMEM_COORDS) {
-        test<4>(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len);
+        test<smem_len>(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len);
       }
     )
   );
-  (void)SMEM_DIMS;
   return 0;
 }
diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp
index 5069803d2a..0a395156d4 100644
--- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp
@@ -43,6 +43,7 @@ __device__ constexpr cuda::std::array<uint32_t, 5> TEST_SMEM_COORDS[] = {
 };
 
 constexpr size_t gmem_len = tensor_len(GMEM_DIMS);
+constexpr size_t smem_len = tensor_len(SMEM_DIMS);
 
 __device__ int gmem_tensor[gmem_len];
 
@@ -56,10 +57,9 @@ int main(int, char**)
     ),
     NV_IS_DEVICE, (
       for (auto smem_coord : TEST_SMEM_COORDS) {
-        test<5>(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len);
+        test<smem_len>(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len);
       }
     )
   );
-  (void)SMEM_DIMS;
   return 0;
 }
diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h
index 83be47f005..97226b7bfd 100644
--- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h
+++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h
@@ -131,7 +131,7 @@ __constant__ fake_cutensormap global_fake_tensor_map;
  * 6. It checks that all the values in global are properly modified.
  */
 template <size_t smem_len, size_t num_dims>
-__device__ void test(cuda::std::array<uint32_t, smem_len> smem_coord,
+__device__ void test(cuda::std::array<uint32_t, num_dims> smem_coord,
                      cuda::std::array<uint32_t, num_dims> smem_dims,
                      cuda::std::array<uint32_t, num_dims> gmem_dims,
                      int* gmem_tensor,
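
The generic-header fix above and the new explicit `test<smem_len>(...)` calls go together: the first template parameter of `test` presumably sizes the shared-memory staging buffer, so it must carry the box's element count rather than the tensor rank or the global length. A sketch of the assumed shape; the buffer name, alignment, and body are illustrative and not taken from the patch:

    #include <cuda/std/array>
    #include <cstdint>

    template <size_t smem_len, size_t num_dims>
    __device__ void test_sketch(cuda::std::array<uint32_t, num_dims> smem_coord)
    {
      // Staging buffer for one TMA box, sized by the explicit template argument.
      __shared__ alignas(128) int smem_buffer[smem_len];
      if (threadIdx.x < smem_len) {
        smem_buffer[threadIdx.x] = static_cast<int>(smem_coord[0]);
      }
      __syncthreads();
    }
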
-extern "C" _LIBCUDACXX_HOST_DEVICE void __cuda_ptx_mbarrier_complete_tx_is_not_supported_before_SM_90__(); struct __memcpy_completion_impl { template @@ -815,8 +814,6 @@ struct __memcpy_completion_impl { if (__group.thread_rank() == 0) { ::cuda::device::barrier_expect_tx(__barrier, __size); } - ),( - __cuda_ptx_mbarrier_complete_tx_is_not_supported_before_SM_90__(); )); #endif // __cccl_ptx_isa >= 800 return async_contract_fulfillment::async; From c93c5bcc0f131497c8b32e6c311eaea9cd46c07c Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Thu, 22 Feb 2024 21:17:18 +0100 Subject: [PATCH 5/6] Ensure runfail tests do not fail without execution --- .../test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp | 1 + .../test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp | 1 + .../test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp | 1 + 3 files changed, 3 insertions(+) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp index 69ed6bb41c..d534da2bf5 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 // UNSUPPORTED: nvcc-11 +// UNSUPPORTED: no_execute // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp index eded8d4e2a..c7282517c3 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 // UNSUPPORTED: nvcc-11 +// UNSUPPORTED: no_execute // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp index c2e36ce997..36aa9aa8ea 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 // UNSUPPORTED: nvcc-11 +// UNSUPPORTED: no_execute // From 40ea920f0ef2420d5277788306eacf12b28a2711 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 23 Feb 2024 10:14:37 +0000 Subject: [PATCH 6/6] Ensure that __cccl_ptx_isa properly guards feature flags --- .../include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h index d612c9e4d1..68e218f29f 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h @@ -94,10 +94,12 @@ // depending on PTX ISA. This permits checking for the feature in host code. // When __CUDA_MINIMUM_ARCH__ is available, we only enable the feature when the // hardware supports it. 
From c93c5bcc0f131497c8b32e6c311eaea9cd46c07c Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa
Date: Thu, 22 Feb 2024 21:17:18 +0100
Subject: [PATCH 5/6] Ensure runfail tests do not fail without execution

---
 .../test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp | 1 +
 .../test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp  | 1 +
 .../test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp  | 1 +
 3 files changed, 3 insertions(+)

diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp
index 69ed6bb41c..d534da2bf5 100644
--- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp
@@ -11,6 +11,7 @@
 // UNSUPPORTED: libcpp-has-no-threads
 // UNSUPPORTED: pre-sm-90
 // UNSUPPORTED: nvcc-11
+// UNSUPPORTED: no_execute
 
 // <cuda/barrier>
 
diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp
index eded8d4e2a..c7282517c3 100644
--- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp
@@ -11,6 +11,7 @@
 // UNSUPPORTED: libcpp-has-no-threads
 // UNSUPPORTED: pre-sm-90
 // UNSUPPORTED: nvcc-11
+// UNSUPPORTED: no_execute
 
 // <cuda/barrier>
 
diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp
index c2e36ce997..36aa9aa8ea 100644
--- a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp
@@ -11,6 +11,7 @@
 // UNSUPPORTED: libcpp-has-no-threads
 // UNSUPPORTED: pre-sm-90
 // UNSUPPORTED: nvcc-11
+// UNSUPPORTED: no_execute
 
 // <cuda/barrier>
 
From 40ea920f0ef2420d5277788306eacf12b28a2711 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa
Date: Fri, 23 Feb 2024 10:14:37 +0000
Subject: [PATCH 6/6] Ensure that __cccl_ptx_isa properly guards feature flags

---
 .../include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h
index d612c9e4d1..68e218f29f 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h
@@ -94,10 +94,12 @@
 // depending on PTX ISA. This permits checking for the feature in host code.
 // When __CUDA_MINIMUM_ARCH__ is available, we only enable the feature when the
 // hardware supports it.
+#if __cccl_ptx_isa >= 800
 #if (!defined(__CUDA_MINIMUM_ARCH__)) \
-    || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) && __cccl_ptx_isa >= 800
+    || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__)
 #  define __cccl_lib_local_barrier_arrive_tx
 #  define __cccl_lib_experimental_ctk12_cp_async_exposure
 #endif
+#endif // __cccl_ptx_isa >= 800
 
 #endif // __CCCL_PTX_ISA_H_
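
With the reordered guards, `__cccl_ptx_isa` now gates the feature macros before any architecture check, so compiling with an old toolkit for a new GPU no longer advertises instructions the compiler cannot encode. A hedged usage sketch; the kernel is illustrative, while the macro name is the one defined above:

    #include <cuda/barrier>

    __global__ void probe()
    {
    #ifdef __cccl_lib_experimental_ctk12_cp_async_exposure
      // Both PTX ISA >= 8.0 and, when the architecture is known, SM >= 90:
      // the experimental cp_async_bulk exposure may be used here.
    #endif
    }
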