diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx.h b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx.h
new file mode 100644
index 0000000000..9fcc2130dd
--- /dev/null
+++ b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx.h
@@ -0,0 +1,83 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+#ifndef TEST_ARRIVE_TX_H_
+#define TEST_ARRIVE_TX_H_
+
+#include <cuda/barrier>
+
+#include <cuda/std/utility> // cuda::std::move
+
+#include "concurrent_agents.h"
+#include "cuda_space_selector.h"
+#include "test_macros.h"
+
+// Suppress warning about barrier in shared memory
+TEST_NV_DIAG_SUPPRESS(static_var_with_dynamic_init)
+
+template <typename Barrier>
+inline __device__
+void mbarrier_complete_tx(
+    Barrier &b, int transaction_count)
+{
+    NV_DISPATCH_TARGET(
+        NV_PROVIDES_SM_90, (
+            if (__isShared(cuda::device::barrier_native_handle(b))) {
+                asm volatile(
+                    "mbarrier.complete_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
+                    :
+                    : "r"((unsigned int) __cvta_generic_to_shared(cuda::device::barrier_native_handle(b))),
+                      "r"(transaction_count)
+                    : "memory");
+            } else {
+                __trap();
+            }
+        ), NV_ANY_TARGET, (
+            // On architectures pre-SM90 (and on host), we drop the transaction count
+            // update. The barriers do not keep track of transaction counts.
+            __trap();
+        )
+    );
+}
+
+template <typename Barrier>
+__device__
+void thread(Barrier& b, int arrives_per_thread)
+{
+    constexpr int tx_count = 1;
+    auto tok = cuda::device::barrier_arrive_tx(b, arrives_per_thread, tx_count);
+    // Manually increase the transaction count of the barrier.
+    mbarrier_complete_tx(b, tx_count);
+
+    b.wait(cuda::std::move(tok));
+}
+
+__device__
+void test()
+{
+    NV_DISPATCH_TARGET(
+        NV_IS_DEVICE, (
+            // Run all threads, each arriving with arrival count 1
+            constexpr auto block = cuda::thread_scope_block;
+
+            __shared__ cuda::barrier<block> bar_1;
+            init(&bar_1, (int) blockDim.x);
+            __syncthreads();
+            thread(bar_1, 1);
+
+            // Run all threads, each arriving with arrival count 2
+            __shared__ cuda::barrier<block> bar_2;
+            init(&bar_2, (int) 2 * blockDim.x);
+            __syncthreads();
+            thread(bar_2, 2);
+        )
+    );
+}
+
+#endif // TEST_ARRIVE_TX_H_
diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_cluster.runfail.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_cluster.runfail.cpp
new file mode 100644
index 0000000000..8f82188e41
--- /dev/null
+++ b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_cluster.runfail.cpp
@@ -0,0 +1,50 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: libcpp-has-no-threads +// UNSUPPORTED: pre-sm-90 + +// + +#include +#include +#include "test_macros.h" + +// Suppress warning about barrier in shared memory +TEST_NV_DIAG_SUPPRESS(static_var_with_dynamic_init) + +int main(int, char**){ + NV_DISPATCH_TARGET( + NV_IS_HOST, ( + // When PR #416 is merged, uncomment this line: + // cuda_cluster_size = 2; + ), + NV_IS_DEVICE, ( + __shared__ cuda::barrier bar; + + if (threadIdx.x == 0) { + init(&bar, blockDim.x); + } + namespace cg = cooperative_groups; + auto cluster = cg::this_cluster(); + + cluster.sync(); + + // This test currently fails at this point because support for + // clusters has not yet been added. + cuda::barrier *remote_bar; + remote_bar = cluster.map_shared_rank(&bar, cluster.block_rank() ^ 1); + + // When PR #416 is merged, this should fail here because the barrier + // is in device memory. + auto token = cuda::device::barrier_arrive_tx(*remote_bar, 1, 0); + )); + return 0; +} diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_cta.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_cta.pass.cpp new file mode 100644 index 0000000000..9c67e23c6f --- /dev/null +++ b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_cta.pass.cpp @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: libcpp-has-no-threads +// UNSUPPORTED: pre-sm-90 + +// + +#include "arrive_tx.h" + +int main(int, char**) +{ + NV_DISPATCH_TARGET( + NV_IS_HOST, ( + // Required by concurrent_agents_launch to know how many we're + // launching. This can only be an int, because the nvrtc tests use grep + // to figure out how many threads to launch. + cuda_thread_count = 256; + ), + NV_IS_DEVICE, ( + test(); + ) + ); + + return 0; +} diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_device.runfail.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_device.runfail.cpp new file mode 100644 index 0000000000..bcf6dcf622 --- /dev/null +++ b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_device.runfail.cpp @@ -0,0 +1,39 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: libcpp-has-no-threads +// UNSUPPORTED: pre-sm-90 + +// + +#include +#include "test_macros.h" + +// Suppress warning about barrier in shared memory +TEST_NV_DIAG_SUPPRESS(static_var_with_dynamic_init) + +__device__ uint64_t bar_storage; + +int main(int, char**){ + NV_IF_TARGET( + NV_IS_DEVICE, ( + cuda::barrier *bar_ptr; + bar_ptr = reinterpret_cast *>(bar_storage); + + if (threadIdx.x == 0) { + init(bar_ptr, blockDim.x); + } + __syncthreads(); + + // Should fail because the barrier is in device memory. 
+ auto token = cuda::device::barrier_arrive_tx(*bar_ptr, 1, 0); + )); + return 0; +} diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_feature_test.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_feature_test.pass.cpp new file mode 100644 index 0000000000..1a40fcb78a --- /dev/null +++ b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_feature_test.pass.cpp @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: libcpp-has-no-threads +// UNSUPPORTED: pre-sm-90 + +// + +#include + +#ifndef __cccl_lib_local_barrier_arrive_tx +static_assert(false, "should define __cccl_lib_local_barrier_arrive_tx"); +#endif // __cccl_lib_local_barrier_arrive_tx + +int main(int, char**){ + return 0; +} diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp new file mode 100644 index 0000000000..d0102acec6 --- /dev/null +++ b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp @@ -0,0 +1,36 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: libcpp-has-no-threads +// UNSUPPORTED: nvrtc +// UNSUPPORTED: pre-sm-70 + +// + +#include + +int main(int, char**){ + NV_IF_TARGET( + NV_IS_DEVICE, ( + __shared__ cuda::barrier bar; + if (threadIdx.x == 0) { + init(&bar, blockDim.x); + } + __syncthreads(); + + // barrier_arrive_tx should fail on SM70 and SM80, because it is hidden. + auto token = cuda::device::barrier_arrive_tx(bar, 1, 0); + +#if defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__ + static_assert(false, "Fail manually for SM90 and up."); +#endif // __CUDA_MINIMUM_ARCH__ + )); + return 0; +} diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_thread.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_thread.pass.cpp new file mode 100644 index 0000000000..d1ac1d66de --- /dev/null +++ b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_thread.pass.cpp @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: libcpp-has-no-threads +// UNSUPPORTED: pre-sm-90 + +// + +#include "arrive_tx.h" + +int main(int, char**) +{ + NV_DISPATCH_TARGET( + NV_IS_HOST, ( + // Required by concurrent_agents_launch to know how many we're + // launching. This can only be an int, because the nvrtc tests use grep + // to figure out how many threads to launch. 
+ cuda_thread_count = 2; + ), + NV_IS_DEVICE, ( + test(); + ) + ); + + return 0; +} diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_warp.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_warp.pass.cpp new file mode 100644 index 0000000000..0c75474e17 --- /dev/null +++ b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_warp.pass.cpp @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: libcpp-has-no-threads +// UNSUPPORTED: pre-sm-90 + +// + +#include "arrive_tx.h" + +int main(int, char**) +{ + NV_DISPATCH_TARGET( + NV_IS_HOST, ( + // Required by concurrent_agents_launch to know how many we're + // launching. This can only be an int, because the nvrtc tests use grep + // to figure out how many threads to launch. 
+ cuda_thread_count = 32; + ), + NV_IS_DEVICE, ( + test(); + ) + ); + + return 0; +} diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier_init.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/init.pass.cpp similarity index 100% rename from libcudacxx/.upstream-tests/test/cuda/barrier_init.pass.cpp rename to libcudacxx/.upstream-tests/test/cuda/barrier/init.pass.cpp diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier_native_handle.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/native_handle.pass.cpp similarity index 100% rename from libcudacxx/.upstream-tests/test/cuda/barrier_native_handle.pass.cpp rename to libcudacxx/.upstream-tests/test/cuda/barrier/native_handle.pass.cpp diff --git a/libcudacxx/docs/extended_api/synchronization_primitives/barrier.md b/libcudacxx/docs/extended_api/synchronization_primitives/barrier.md index c20977c99b..3b057bf6c7 100644 --- a/libcudacxx/docs/extended_api/synchronization_primitives/barrier.md +++ b/libcudacxx/docs/extended_api/synchronization_primitives/barrier.md @@ -21,8 +21,9 @@ It has the same interface and semantics as [`cuda::std::barrier`], with the ## Barrier Operations -| [`cuda::barrier::init`] | Initialize a `cuda::barrier`. `(friend function)` | -| [`cuda::device::barrier_native_handle`] | Get the native handle to a `cuda::barrier`. `(function template)` | +| [`cuda::barrier::init`] | Initialize a `cuda::barrier`. `(friend function)` | +| [`cuda::device::barrier_native_handle`] | Get the native handle to a `cuda::barrier`. `(function)` | +| [`cuda::device::barrier_arrive_tx`] | Arrive on a `cuda::barrier` with transaction count update. `(function)` | ## NVCC `__shared__` Initialization Warnings @@ -60,6 +61,41 @@ Programs shall ensure that this transformation would not introduce errors, for Under CUDA Compute Capability 6 (Pascal) or prior, an object of type `cuda::barrier` or `cuda::std::barrier` may not be used. 
+## Shared memory barriers with transaction count + +In addition to the arrival count, a `cuda::barrier` object +located in shared memory supports a +[tx-count](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tracking-asynchronous-operations-by-the-mbarrier-object), +which is used for tracking the completion of some asynchronous memory operations or +transactions. +The tx-count tracks the number of asynchronous transactions, in +units specified by the asynchronous memory operation (typically bytes), that are +outstanding and yet to be complete. +This capability is exposed, starting with the Hopper architecture (CUDA Compute +Capability 9). + +The tx-count of `cuda::barrier` must be set to the total amount of asynchronous +memory operations, in units as specified by the asynchronous operations, to be +tracked by the current phase. This can be achieved with the +`cuda::device::barrier_arrive_tx` function call. Upon completion of each of the +asynchronous operations, the tx-count of the `cuda::barrier` will be updated and +thus progress the `cuda::barrier` towards the completion of the current phase. +This may complete the current phase. + +### Phase Completion of a `cuda::barrier` with tx-count support + +Modify [[thread.barrier.class]](http://eel.is/c++draft/thread.barrier.class) as follows: + +> A barrier is a thread coordination mechanism whose lifetime consists of a sequence of barrier phases, where each phase allows at most an expected number of threads to block until the expected number of threads **and the expected number of transaction-based asynchronous operations** arrive at the barrier. +> Each _barrier phase_ consists of the following steps: +> +> 1. The _expected count_ is decremented by each call to `arrive`,`arrive_and_drop`**, or `cuda::device::barrier_arrive_tx`**. +> 2. 
**The _transaction count_ is incremented by each call to `cuda::device::barrier_arrive_tx` and decremented by the completion of transaction-based asynchronous operations such as `cuda::memcpy_async_tx`.** +> 3. Exactly once after **both** the _expected count_ **and the _transaction count_** reach zero, a thread executes the _completion step_ during its call to `arrive`, `arrive_and_drop`, or `wait`, except that it is implementation-defined whether the step executes if no thread calls `wait`. +> 4. When the completion step finishes, the _expected count_ is reset to what was specified by the `expected` argument to the constructor, possibly adjusted by calls to `arrive_and_drop`, **the _transaction count_ is reset to zero,** and the next phase starts. +> +> Concurrent invocations of the member functions of barrier **and the non-member barrier APIs in `cuda::device`**, other than its destructor, do not introduce data races. The member functions `arrive` and `arrive_and_drop`, **and the non-member function `cuda::device::barrier_arrive_tx`**, execute atomically. 
+
 ## Implementation-Defined Behavior
 
 For each [`cuda::thread_scope`] `S` and `CompletionFunction` `F`, the value of
@@ -98,6 +134,7 @@ __global__ void example_kernel() {
 
 [`cuda::barrier::init`]: ./barrier/init.md
 [`cuda::device::barrier_native_handle`]: ./barrier/barrier_native_handle.md
+[`cuda::device::barrier_arrive_tx`]: ./barrier/barrier_arrive_tx.md
 [`cuda::std::barrier`]: https://en.cppreference.com/w/cpp/thread/barrier
 
diff --git a/libcudacxx/docs/extended_api/synchronization_primitives/barrier/barrier_arrive_tx.md b/libcudacxx/docs/extended_api/synchronization_primitives/barrier/barrier_arrive_tx.md
new file mode 100644
index 0000000000..cc311ec46f
--- /dev/null
+++ b/libcudacxx/docs/extended_api/synchronization_primitives/barrier/barrier_arrive_tx.md
@@ -0,0 +1,80 @@
+---
+grand_parent: Extended API
+parent: Barriers
+---
+
+# `cuda::device::barrier_arrive_tx`
+
+Defined in header `<cuda/barrier>`:
+
+```cuda
+__device__
+cuda::barrier<cuda::thread_scope_block>::arrival_token
+cuda::device::barrier_arrive_tx(
+  cuda::barrier<cuda::thread_scope_block>& bar,
+  ptrdiff_t arrive_count_update,
+  ptrdiff_t transaction_count_update);
+```
+
+Arrives at a barrier in shared memory, updating both the arrival count and
+transaction count.
+
+## Preconditions
+
+* `__isShared(&bar) == true`
+* `1 <= arrive_count_update && arrive_count_update <= (1 << 20) - 1`
+* `0 <= transaction_count_update && transaction_count_update <= (1 << 20) - 1`
+
+
+## Effects
+
+* This function constructs an arrival_token object associated with the phase
+  synchronization point for the current phase. Then, decrements the expected
+  arrival count by `arrive_count_update` and increments the expected transaction
+  count by `transaction_count_update`.
+* This function executes atomically. The call to this function strongly
+  happens-before the start of the phase completion step for the current phase.
+
+## Notes
+
+This function can only be used under CUDA Compute Capability 9.0 (Hopper) or
+higher.
+ +To check if `cuda::device::barrier_arrive_tx` is available, use the +`__cccl_lib_local_barrier_arrive_tx` feature flag, as shown in the example code +below. + +## Return Value + +`cuda::device::barrier_arrive_tx` returns the constructed `arrival_token` object. + +## Example + +```cuda +#include +#include // cuda::std::move + +#ifndef __cccl_lib_local_barrier_arrive_tx +static_assert(false, "Insufficient libcu++ version: cuda::device::arrive_tx is not yet available."); +#endif // __cccl_lib_local_barrier_arrive_tx + +__global__ void example_kernel() { + __shared__ cuda::barrier bar; + if (threadIdx.x == 0) { + init(&bar, blockDim.x); + } + __syncthreads(); + + auto token = cuda::device::barrier_arrive_tx(bar, 1, 0); + + bar.wait(cuda::std::move(token)); +} +``` + +[See it on Godbolt](https://godbolt.org/z/1vxcGrT8j){: .btn } + + +[`cuda::thread_scope`]: ./memory_model.md +[Tracking asynchronous operations by the mbarrier object]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tracking-asynchronous-operations-by-the-mbarrier-object +[thread.barrier.class paragraph 12]: https://eel.is/c++draft/thread.barrier.class#12 + diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h index 3dac578901..c4354ec020 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h @@ -67,18 +67,13 @@ class barrier : public _CUDA_VSTD::__barrier_base<_CompletionF, _Sco> { _LIBCUDACXX_INLINE_VISIBILITY friend void init(barrier * __b, _CUDA_VSTD::ptrdiff_t __expected) { -#if (_LIBCUDACXX_DEBUG_LEVEL >= 2) - _LIBCUDACXX_DEBUG_ASSERT(__expected >= 0); -#endif - + _LIBCUDACXX_DEBUG_ASSERT(__expected >= 0, "Cannot initialize barrier with negative arrival count"); new (__b) barrier(__expected); } _LIBCUDACXX_INLINE_VISIBILITY friend void init(barrier * __b, _CUDA_VSTD::ptrdiff_t __expected, 
_CompletionF __completion) { -#if (_LIBCUDACXX_DEBUG_LEVEL >= 2) - _LIBCUDACXX_DEBUG_ASSERT(__expected >= 0); -#endif + _LIBCUDACXX_DEBUG_ASSERT(__expected >= 0, "Cannot initialize barrier with negative arrival count"); new (__b) barrier(__expected, __completion); } }; @@ -182,10 +177,7 @@ friend class _CUDA_VSTD::__barrier_poll_tester_parity; _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_INLINE_VISIBILITY arrival_token arrive(_CUDA_VSTD::ptrdiff_t __update = 1) { -#if (_LIBCUDACXX_DEBUG_LEVEL >= 2) - _LIBCUDACXX_DEBUG_ASSERT(__update >= 0); - _LIBCUDACXX_DEBUG_ASSERT(__expected_unit >=0); -#endif + _LIBCUDACXX_DEBUG_ASSERT(__update >= 0, "Arrival count update must be non-negative."); arrival_token __token = {}; NV_DISPATCH_TARGET( NV_PROVIDES_SM_90, ( @@ -561,6 +553,65 @@ inline _CUDA_VSTD::uint64_t * barrier_native_handle(barrier return reinterpret_cast<_CUDA_VSTD::uint64_t *>(&b.__barrier); } + +// Hide arrive_tx when CUDA architecture is insufficient. Note the +// (!defined(__CUDA_MINIMUM_ARCH__)). This is required to make sure the function +// does not get removed by cudafe, which does not define __CUDA_MINIMUM_ARCH__. 
+#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))
+
+_LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_DEVICE inline
+barrier<thread_scope_block>::arrival_token barrier_arrive_tx(
+    barrier<thread_scope_block> & __b,
+    _CUDA_VSTD::ptrdiff_t __arrive_count_update,
+    _CUDA_VSTD::ptrdiff_t __transaction_count_update) {
+
+    _LIBCUDACXX_DEBUG_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory.");
+    _LIBCUDACXX_DEBUG_ASSERT(1 <= __arrive_count_update, "Arrival count update must be at least one.");
+    _LIBCUDACXX_DEBUG_ASSERT(__arrive_count_update <= (1 << 20) - 1, "Arrival count update cannot exceed 2^20 - 1.");
+    _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative.");
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object
+    _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1.");
+
+    barrier<thread_scope_block>::arrival_token __token = {};
+    NV_IF_TARGET(
+        // On architectures pre-sm90, arrive_tx is not supported.
+        NV_PROVIDES_SM_90, (
+            // We do not check for the statespace of the barrier here. This is
+            // on purpose. This allows debugging tools like memcheck/racecheck
+            // to detect that we are passing a pointer with the wrong state
+            // space to mbarrier.arrive. If we checked for the state space here,
+            // and __trap() if wrong, then those tools would not be able to help
+            // us in release builds. In debug builds, the error would be caught
+            // by the asserts at the top of this function.
+ + auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b)); + if (__arrive_count_update == 1) { + asm ( + "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;" + : "=l"(__token) + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) + : "memory"); + } else { + asm ( + "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) + : "memory"); + asm ( + "mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1], %2;" + : "=l"(__token) + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__arrive_count_update)) + : "memory"); + } + ) + ); + return __token; +} +#endif // __CUDA_MINIMUM_ARCH__ + _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE _LIBCUDACXX_BEGIN_NAMESPACE_CUDA diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/barrier b/libcudacxx/include/cuda/std/detail/libcxx/include/barrier index b235b3db65..2f0afb3d2b 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/barrier +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/barrier @@ -318,9 +318,7 @@ public: auto const __result = __arrived.fetch_sub(__update, memory_order_acq_rel) - __update; auto const __new_expected = __expected.load(memory_order_relaxed); -#if (_LIBCUDACXX_DEBUG_LEVEL >= 2) - _LIBCUDACXX_DEBUG_ASSERT(__result >= 0); -#endif + _LIBCUDACXX_DEBUG_ASSERT(__result >= 0, ""); if(0 == __result) { __completion(); @@ -384,10 +382,11 @@ private: static _LIBCUDACXX_INLINE_VISIBILITY constexpr uint64_t __init(ptrdiff_t __count) noexcept { -#if (_LIBCUDACXX_DEBUG_LEVEL >= 2) - _LIBCUDACXX_DEBUG_ASSERT(__count >= 0); -#endif - +#if _LIBCUDACXX_STD_VER > 11 + // This debug assert is not supported in C++11 due to resulting in a + // multi-statement constexpr function. 
+ _LIBCUDACXX_DEBUG_ASSERT(__count >= 0, "Count must be non-negative."); +#endif // _LIBCUDACXX_STD_VER > 11 return (((1u << 31) - __count) << 32) | ((1u << 31) - __count); } @@ -414,9 +413,7 @@ public: _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR_AFTER_CXX11 __barrier_base(ptrdiff_t __count, __empty_completion = __empty_completion()) : __phase_arrived_expected(__init(__count)) { -#if (_LIBCUDACXX_DEBUG_LEVEL >= 2) - _LIBCUDACXX_DEBUG_ASSERT(__count >= 0); -#endif + _LIBCUDACXX_DEBUG_ASSERT(__count >= 0, ""); } ~__barrier_base() = default; diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/version b/libcudacxx/include/cuda/std/detail/libcxx/include/version index 5697bfd25e..f52f38ccdc 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/version +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/version @@ -219,6 +219,15 @@ __cpp_lib_void_t 201411L #pragma GCC system_header #endif +// We define certain feature test macros depending on availability. When +// __CUDA_MINIMUM_ARCH__ is not available, we define the following features +// unconditionally. This permits checking for the feature in host code. When +// __CUDA_MINIMUM_ARCH__ is available, we only enable the feature when the +// hardware supports it. +#if (!defined(__CUDA_MINIMUM_ARCH__)) || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) +# define __cccl_lib_local_barrier_arrive_tx +#endif + // We unconditionally define `__cccl_lib_meow` so that there is only one place to set the value #if _LIBCUDACXX_STD_VER > 11 # define __cccl_lib_chrono_udls 201304L