diff --git a/libcudacxx/.upstream-tests/test/cuda/memcpy_async/memcpy_async_tx.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/memcpy_async/memcpy_async_tx.pass.cpp
new file mode 100644
index 0000000000..6614bfb51f
--- /dev/null
+++ b/libcudacxx/.upstream-tests/test/cuda/memcpy_async/memcpy_async_tx.pass.cpp
@@ -0,0 +1,69 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: libcpp-has-no-threads
+// UNSUPPORTED: pre-sm-90

+// <cuda/barrier>
+
+#include <cuda/barrier>
+#include <cuda/std/utility>      // cuda::std::move
+#include "test_macros.h"         // TEST_NV_DIAG_SUPPRESS
+#include "cuda_space_selector.h" // shared_memory_selector
+
+// Suppress warning about barrier in shared memory
+TEST_NV_DIAG_SUPPRESS(static_var_with_dynamic_init)
+
+#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 900
+static_assert(false, "Insufficient CUDA Compute Capability: cuda::device::memcpy_async_tx is not available.");
+#endif // __CUDA_MINIMUM_ARCH__
+
+__device__ alignas(16) int gmem_x[2048];
+
+int main(int, char**)
+{
+    NV_IF_TARGET(NV_IS_HOST,(
+        // Required by concurrent_agents_launch to know how many we're launching
+        cuda_thread_count = 512;
+    ));
+
+    NV_DISPATCH_TARGET(
+        NV_IS_DEVICE, (
+            using barrier_t = cuda::barrier<cuda::thread_scope_block>;
+            __shared__ alignas(16) int smem_x[1024];
+
+            shared_memory_selector<barrier_t, constructor_initializer> sel;
+            barrier_t* b = sel.construct(blockDim.x);
+
+            // Initialize gmem_x
+            for (int i = threadIdx.x; i < 2048; i += blockDim.x) {
+                gmem_x[i] = i;
+            }
+            __syncthreads();
+
+            barrier_t::arrival_token token;
+            if (threadIdx.x == 0) {
+                auto fulfillment = cuda::device::memcpy_async_tx(smem_x, gmem_x, cuda::aligned_size_t<16>(sizeof(smem_x)), *b);
+                assert(fulfillment == cuda::async_contract_fulfillment::async);
+                token = cuda::device::barrier_arrive_tx(*b, 1, sizeof(smem_x));
+            } else {
+                token = b->arrive(1);
+            }
+            b->wait(cuda::std::move(token));
+
+            // Assert that smem_x contains the contents of gmem_x[0], ..., gmem_x[1023]
+            for (int i = threadIdx.x; i < 1024; i += blockDim.x) {
+                assert(smem_x[i] == i);
+            }
+        )
+    );
+    return 0;
+}
diff --git a/libcudacxx/docs/extended_api/asynchronous_operations.md b/libcudacxx/docs/extended_api/asynchronous_operations.md
index 03c85c2584..18343cae59 100644
--- a/libcudacxx/docs/extended_api/asynchronous_operations.md
+++ b/libcudacxx/docs/extended_api/asynchronous_operations.md
@@ -1,6 +1,7 @@
 ## Asynchronous Operations
 
-| [`cuda::memcpy_async`] | Asynchronously copies one range to another. `(function template)` <br/><br/> 1.1.0 / CUDA 11.0 <br/> 1.2.0 / CUDA 11.1 (group & aligned overloads) |
+| [`cuda::memcpy_async`] | Asynchronously copies one range to another. `(function template)` <br/><br/> 1.1.0 / CUDA 11.0 <br/> 1.2.0 / CUDA 11.1 (group & aligned overloads) |
+| [`cuda::device::memcpy_async_tx`] | Asynchronously copies one range to another with manual transaction accounting. `(function)` |
 
 [`cuda::memcpy_async`]: {{ "extended_api/asynchronous_operations/memcpy_async.html" | relative_url }}
diff --git a/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md b/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md
new file mode 100644
index 0000000000..a0033dd8df
--- /dev/null
+++ b/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md
@@ -0,0 +1,89 @@
+---
+grand_parent: Extended API
+parent: Asynchronous Operations
+---
+
+# `cuda::device::memcpy_async_tx`
+
+Defined in header `<cuda/barrier>`:
+
+```cuda
+template <typename T, size_t Alignment>
+inline __device__
+async_contract_fulfillment cuda::device::memcpy_async_tx(
+  T* dest,
+  const T* src,
+  cuda::aligned_size_t<Alignment> size,
+  cuda::barrier<cuda::thread_scope_block>& bar);
+```
+
+Copies `size` bytes from global memory `src` to shared memory `dest` and
+decrements the transaction count of `bar` by `size` bytes.
+
+## Preconditions
+
+* `src` and `dest` are 16-byte aligned and `size` is a multiple of 16, i.e.,
+  `Alignment >= 16`.
+* `dest` points to a shared memory allocation that is at least `size` bytes wide.
+* `src` points to a global memory allocation that is at least `size` bytes wide.
+* `bar` is located in shared memory.
+* If either `dest` or `src` is an invalid or null pointer, the behavior is
+  undefined (even if `size` is zero).
+
+## Requires
+
+* `is_trivially_copyable_v<T>` is true.
+
+## Notes
+
+This function can only be used under CUDA Compute Capability 9.0 (Hopper) or
+higher.
+
+There is no feature flag to check if `cuda::device::memcpy_async_tx` is
+available.
+
+**Comparison to `cuda::memcpy_async`**: `memcpy_async_tx` supports a subset of
+the operations of `memcpy_async` and gives more control over the
+synchronization with a barrier. Currently, `memcpy_async_tx` has no synchronous
+fallback mechanism, i.e., it does not work on older hardware (pre-CUDA Compute
+Capability 9.0, i.e., pre-Hopper).
+
+## Example
+
+```cuda
+#include <cuda/barrier>
+#include <cuda/std/utility> // cuda::std::move
+
+#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 900
+static_assert(false, "Insufficient CUDA Compute Capability: cuda::device::memcpy_async_tx is not available.");
+#endif // __CUDA_MINIMUM_ARCH__
+
+__device__ alignas(16) int gmem_x[2048];
+
+__global__ void example_kernel() {
+  __shared__ alignas(16) int smem_x[1024];
+  __shared__ cuda::barrier<cuda::thread_scope_block> bar;
+  if (threadIdx.x == 0) {
+    init(&bar, blockDim.x);
+  }
+  __syncthreads();
+
+  cuda::barrier<cuda::thread_scope_block>::arrival_token token;
+  if (threadIdx.x == 0) {
+    cuda::device::memcpy_async_tx(smem_x, gmem_x, cuda::aligned_size_t<16>(sizeof(smem_x)), bar);
+    token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem_x));
+  } else {
+    token = bar.arrive(1);
+  }
+  bar.wait(cuda::std::move(token));
+
+  // smem_x contains the contents of gmem_x[0], ..., gmem_x[1023]
+  smem_x[threadIdx.x] += 1;
+}
+```
+
+[See it on Godbolt](https://godbolt.org/z/oK7Tazszx){: .btn }
+
+[`cuda::thread_scope`]: ./memory_model.md
+[Tracking asynchronous operations by the mbarrier object]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tracking-asynchronous-operations-by-the-mbarrier-object
+[`cp.async.bulk` PTX instruction]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
+[thread.barrier.class paragraph 12]: https://eel.is/c++draft/thread.barrier.class#12
diff --git a/libcudacxx/docs/extended_api/synchronization_primitives/barrier.md b/libcudacxx/docs/extended_api/synchronization_primitives/barrier.md
index 6a8912a55f..7acfb95d91 100644
--- a/libcudacxx/docs/extended_api/synchronization_primitives/barrier.md
+++ b/libcudacxx/docs/extended_api/synchronization_primitives/barrier.md
@@ -92,7 +92,8 @@ Capability 9).
 The tx-count of `cuda::barrier` must be set to the total amount of asynchronous
 memory operations, in units as specified by the asynchronous operations, to be
 tracked by the current phase. This can be achieved with the
-`cuda::device::barrier_arrive_tx` function call. Upon completion of each of the
+[`cuda::device::barrier_arrive_tx`](./barrier/barrier_arrive_tx.md) function call.
+Upon completion of each of the
 asynchronous operations, the tx-count of the `cuda::barrier` will be updated
 and thus progress the `cuda::barrier` towards the completion of the current
 phase. This may complete the current phase.
diff --git a/libcudacxx/docs/extended_api/synchronization_primitives/barrier/barrier_arrive_tx.md b/libcudacxx/docs/extended_api/synchronization_primitives/barrier/barrier_arrive_tx.md
index cc311ec46f..f29bc3e7e3 100644
--- a/libcudacxx/docs/extended_api/synchronization_primitives/barrier/barrier_arrive_tx.md
+++ b/libcudacxx/docs/extended_api/synchronization_primitives/barrier/barrier_arrive_tx.md
@@ -50,6 +50,11 @@ below.
 
 ## Example
 
+The example below shows only `cuda::device::barrier_arrive_tx`. A more
+extensive example can be found in the
+[`cuda::device::memcpy_async_tx`](../../../asynchronous_operations/memcpy_async_tx.md)
+documentation.
+
 ```cuda
 #include <cuda/barrier>
 #include <cuda/std/utility> // cuda::std::move
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
index 7727ba7521..da6b09b3e3 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
@@ -23,6 +23,8 @@
 #pragma GCC system_header
 #endif
 
+#include "../cstdlib" // _LIBCUDACXX_UNREACHABLE
+
 #if defined(_LIBCUDACXX_COMPILER_NVRTC)
 #define _LIBCUDACXX_OFFSET_IS_ZERO(type, member) !(&(((type *)0)->member))
 #else
@@ -610,6 +612,46 @@ barrier<thread_scope_block>::arrival_token barrier_arrive_tx(
     );
     return __token;
 }
+
+template <typename _Tp, _CUDA_VSTD::size_t _Alignment>
+_LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx(
+    _Tp* __dest,
+    const _Tp* __src,
+    ::cuda::aligned_size_t<_Alignment> __size,
+    ::cuda::barrier<::cuda::thread_scope_block> & __b) {
+    // When compiling with NVCC and GCC 4.8, certain user defined types that _are_ trivially copyable are
+    // incorrectly classified as not trivially copyable. Remove this assertion to allow for their usage with
+    // memcpy_async when compiling with GCC 4.8.
+    // FIXME: remove the #if once GCC 4.8 is no longer supported.
+#if !defined(_LIBCUDACXX_COMPILER_GCC) || _GNUC_VER > 408
+    static_assert(_CUDA_VSTD::is_trivially_copyable<_Tp>::value, "memcpy_async_tx requires a trivially copyable type");
+#endif
+    static_assert(16 <= _Alignment, "memcpy_async_tx expects arguments to be at least 16 byte aligned.");
+
+    _LIBCUDACXX_DEBUG_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory.");
+    _LIBCUDACXX_DEBUG_ASSERT(__isShared(__dest), "dest must point to shared memory.");
+    _LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "src must point to global memory.");
+
+    auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b));
+    if (__isShared(__dest) && __isGlobal(__src)) {
+        asm volatile(
+            "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n"
+            :
+            : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))),
+              "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))),
+              "r"(static_cast<_CUDA_VSTD::uint32_t>(__size)),
+              "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh))
+            : "memory");
+    } else {
+        // memcpy_async_tx only supports copying from global to shared
+        // or from shared to remote cluster dsmem. To copy to remote
+        // dsmem, we need to arrive on a cluster-scoped barrier, which
+        // is not yet implemented. So we trap in this case as well.
+        _LIBCUDACXX_UNREACHABLE();
+    }
+
+    return async_contract_fulfillment::async;
+}
 #endif // __CUDA_MINIMUM_ARCH__
 
 _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE