Add cuda::device::memcpy_async_tx #405

Merged · 19 commits · Oct 13, 2023
Changes from 17 commits
@@ -0,0 +1,69 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: pre-sm-90

// <cuda/barrier>

#include <cuda/barrier>
#include <cuda/std/utility> // cuda::std::move
#include "test_macros.h" // TEST_NV_DIAG_SUPPRESS
#include "cuda_space_selector.h" // shared_memory_selector

// Suppress warning about barrier in shared memory
TEST_NV_DIAG_SUPPRESS(static_var_with_dynamic_init)

#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 900
static_assert(false, "Insufficient CUDA Compute Capability: cuda::device::memcpy_async_tx is not available.");
#endif // __CUDA_MINIMUM_ARCH__

__device__ alignas(16) int gmem_x[2048];


int main(int, char**)
{
  NV_IF_TARGET(NV_IS_HOST,(
    // Required by concurrent_agents_launch to know how many we're launching
    cuda_thread_count = 512;
  ));

  NV_DISPATCH_TARGET(
    NV_IS_DEVICE, (
      using barrier_t = cuda::barrier<cuda::thread_scope_block>;
      __shared__ alignas(16) int smem_x[1024];

      shared_memory_selector<barrier_t, constructor_initializer> sel;
      barrier_t* b = sel.construct(blockDim.x);

      // Initialize gmem_x
      for (int i = threadIdx.x; i < 2048; i += blockDim.x) {
        gmem_x[i] = i;
      }
      __syncthreads();

      barrier_t::arrival_token token;
      if (threadIdx.x == 0) {
        auto fulfillment = cuda::device::memcpy_async_tx(smem_x, gmem_x, cuda::aligned_size_t<16>(sizeof(smem_x)), *b);
        assert(fulfillment == cuda::async_contract_fulfillment::async);
        token = cuda::device::barrier_arrive_tx(*b, 1, sizeof(smem_x));
      } else {
        token = b->arrive(1);
      }
      b->wait(cuda::std::move(token));

      // Assert that smem_x contains the contents of gmem_x[0], ..., gmem_x[1023]
      for (int i = threadIdx.x; i < 1024; i += blockDim.x) {
        assert(smem_x[i] == i);
      }
    )
  );
  return 0;
}
libcudacxx/docs/extended_api/asynchronous_operations.md (3 changes: 2 additions & 1 deletion)
@@ -1,6 +1,7 @@
## Asynchronous Operations

| [`cuda::memcpy_async`] | Asynchronously copies one range to another. `(function template)` <br/><br/> 1.1.0 / CUDA 11.0 <br/> 1.2.0 / CUDA 11.1 (group & aligned overloads) |
| [`cuda::device::memcpy_async_tx`] | Asynchronously copies one range to another with manual transaction accounting. `(function)` |


[`cuda::memcpy_async`]: {{ "extended_api/asynchronous_operations/memcpy_async.html" | relative_url }}
@@ -0,0 +1,100 @@
---
grand_parent: Extended API
parent: Asynchronous Operations
---

# `cuda::device::memcpy_async_tx`

Defined in header `<cuda/barrier>`:

```cuda
template <typename T, size_t Alignment>
inline __device__
async_contract_fulfillment
cuda::device::memcpy_async_tx(
T* dest,
const T* src,
cuda::aligned_size_t<Alignment> size,
cuda::barrier<cuda::thread_scope_block>& bar);
```

Copies `size` bytes from global memory `src` to shared memory `dest` and decrements the transaction count of `bar` by `size` bytes.

## Preconditions

* `src` and `dest` are 16-byte aligned and `size` is a multiple of 16, i.e.,
  `Alignment >= 16`.
* `dest` points to a shared memory allocation that is at least `size` bytes wide.
* `src` points to a global memory allocation that is at least `size` bytes wide.
* `bar` is located in shared memory.
* If either `dest` or `src` is an invalid or null pointer, the behavior is
  undefined (even if `size` is zero).
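
For illustration, a minimal sketch of declarations that satisfy these
preconditions (the names `gmem_buf` and `smem_buf` and the sizes are
hypothetical, not part of the API):

```cuda
#include <cuda/barrier>

// 16-byte aligned global memory source; 4096 bytes, a multiple of 16.
__device__ alignas(16) int gmem_buf[1024];

__global__ void declaration_sketch() {
  // 16-byte aligned shared memory destination, at least as wide as the copy.
  __shared__ alignas(16) int smem_buf[1024];
  // The barrier must itself be located in shared memory.
  __shared__ cuda::barrier<cuda::thread_scope_block> bar;
}
```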

## Requires

* `is_trivially_copyable_v<T>` is true.

## Notes

This function is only available on devices with CUDA Compute Capability 9.0
(Hopper) or higher.

There is no feature flag to check if `cuda::device::memcpy_async_tx` is
available.

**Comparison to `cuda::memcpy_async`**: `memcpy_async_tx` supports a subset of
the operations of `memcpy_async`, but gives more control over the
synchronization with a barrier. Currently, `memcpy_async_tx` has no synchronous
fallback mechanism, i.e., it does not work on older hardware (before CUDA
Compute Capability 9.0, Hopper).
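
To make the contrast concrete, here is a minimal sketch (not part of the
original documentation; the buffer names are illustrative and the barrier
setup mirrors the example below):

```cuda
#include <cuda/barrier>
#include <cuda/std/utility> // cuda::std::move

__device__ alignas(16) int gmem_x[1024];

__global__ void contrast_sketch() {
  __shared__ alignas(16) int smem_x[1024];
  __shared__ cuda::barrier<cuda::thread_scope_block> bar;
  if (threadIdx.x == 0) {
    init(&bar, blockDim.x);
  }
  __syncthreads();

  cuda::barrier<cuda::thread_scope_block>::arrival_token token;
  if (threadIdx.x == 0) {
    // cuda::memcpy_async binds completion to the barrier automatically,
    // so a plain arrive suffices:
    cuda::memcpy_async(smem_x, gmem_x, cuda::aligned_size_t<16>(sizeof(smem_x)), bar);
    // cuda::device::memcpy_async_tx instead requires the caller to account
    // for the transaction explicitly via barrier_arrive_tx, e.g.:
    //   cuda::device::memcpy_async_tx(smem_x, gmem_x,
    //       cuda::aligned_size_t<16>(sizeof(smem_x)), bar);
    //   token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem_x));
    token = bar.arrive(1);
  } else {
    token = bar.arrive(1);
  }
  bar.wait(cuda::std::move(token));
}
```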

## Return Value

Returns `async_contract_fulfillment::async`.

## Example

```cuda
#include <cuda/barrier>
#include <cuda/std/utility> // cuda::std::move

#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 900
static_assert(false, "Insufficient CUDA Compute Capability: cuda::device::memcpy_async_tx is not available.");
#endif // __CUDA_MINIMUM_ARCH__

__device__ alignas(16) int gmem_x[2048];

__global__ void example_kernel() {
  __shared__ alignas(16) int smem_x[1024];
  __shared__ cuda::barrier<cuda::thread_scope_block> bar;
  if (threadIdx.x == 0) {
    init(&bar, blockDim.x);
  }
  __syncthreads();

  cuda::barrier<cuda::thread_scope_block>::arrival_token token;
  if (threadIdx.x == 0) {
    cuda::device::memcpy_async_tx(smem_x, gmem_x, cuda::aligned_size_t<16>(sizeof(smem_x)), bar);
    token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem_x));
  } else {
    token = bar.arrive(1);
  }
  bar.wait(cuda::std::move(token));

  // smem_x contains the contents of gmem_x[0], ..., gmem_x[1023]
  smem_x[threadIdx.x] += 1;
}
```

[See it on Godbolt](https://godbolt.org/z/oK7Tazszx){: .btn }


[`cuda::thread_scope`]: ./memory_model.md
[Tracking asynchronous operations by the mbarrier object]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tracking-asynchronous-operations-by-the-mbarrier-object

[`cp.async.bulk` PTX instruction]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk

[thread.barrier.class paragraph 12]: https://eel.is/c++draft/thread.barrier.class#12



@@ -92,7 +92,8 @@ Capability 9).
The tx-count of `cuda::barrier` must be set to the total amount of asynchronous
memory operations, in units as specified by the asynchronous operations, to be
tracked by the current phase. This can be achieved with the
[`cuda::device::barrier_arrive_tx`](./barrier/barrier_arrive_tx.md) function
call. Upon completion of each of the
asynchronous operations, the tx-count of the `cuda::barrier` will be updated and
thus progress the `cuda::barrier` towards the completion of the current phase.
This may complete the current phase.
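
As a rough sketch of this accounting (assuming the same buffer and barrier
setup as in the other examples; the names are illustrative):

```cuda
#include <cuda/barrier>
#include <cuda/std/utility> // cuda::std::move

__device__ alignas(16) int gmem_buf[1024];

__global__ void tx_count_sketch() {
  __shared__ alignas(16) int smem_buf[1024];
  __shared__ cuda::barrier<cuda::thread_scope_block> bar;
  if (threadIdx.x == 0) {
    init(&bar, blockDim.x);
  }
  __syncthreads();

  cuda::barrier<cuda::thread_scope_block>::arrival_token token;
  if (threadIdx.x == 0) {
    // Issue the bulk copy; upon completion it updates the tx-count of `bar`.
    cuda::device::memcpy_async_tx(smem_buf, gmem_buf,
        cuda::aligned_size_t<16>(sizeof(smem_buf)), bar);
    // Arrive and add sizeof(smem_buf) bytes to the expected tx-count.
    token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem_buf));
  } else {
    token = bar.arrive(1);
  }
  // The phase completes once all threads have arrived and the transaction
  // bytes have been accounted for.
  bar.wait(cuda::std::move(token));
}
```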
@@ -50,6 +50,11 @@ below.

## Example

The example below shows only `cuda::device::barrier_arrive_tx`. A more
extensive example can be found in the
[`cuda::device::memcpy_async_tx`](../../../asynchronous_operations/memcpy_async_tx.md)
documentation.

```cuda
#include <cuda/barrier>
#include <cuda/std/utility> // cuda::std::move
@@ -23,6 +23,8 @@
#pragma GCC system_header
#endif

#include "../cstdlib" // _LIBCUDACXX_UNREACHABLE

#if defined(_LIBCUDACXX_COMPILER_NVRTC)
#define _LIBCUDACXX_OFFSET_IS_ZERO(type, member) !(&(((type *)0)->member))
#else
@@ -610,6 +612,46 @@ barrier<thread_scope_block>::arrival_token barrier_arrive_tx(
);
return __token;
}

template <typename _Tp, _CUDA_VSTD::size_t _Alignment>
_LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx(
    _Tp* __dest,
    const _Tp* __src,
    ::cuda::aligned_size_t<_Alignment> __size,
    ::cuda::barrier<::cuda::thread_scope_block> & __b) {
    // When compiling with NVCC and GCC 4.8, certain user defined types that _are_ trivially copyable are
    // incorrectly classified as not trivially copyable. Remove this assertion to allow for their usage with
    // memcpy_async when compiling with GCC 4.8.
    // FIXME: remove the #if once GCC 4.8 is no longer supported.
#if !defined(_LIBCUDACXX_COMPILER_GCC) || _GNUC_VER > 408
    static_assert(_CUDA_VSTD::is_trivially_copyable<_Tp>::value, "memcpy_async_tx requires a trivially copyable type");
#endif
    static_assert(16 <= _Alignment, "memcpy_async_tx expects arguments to be at least 16 byte aligned.");

    _LIBCUDACXX_DEBUG_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory.");
    _LIBCUDACXX_DEBUG_ASSERT(__isShared(__dest), "dest must point to shared memory.");
    _LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "src must point to global memory.");

    auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b));
    if (__isShared(__dest) && __isGlobal(__src)) {
        asm volatile(
            "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n"
            :
            : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))),
              "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))),
              "r"(static_cast<_CUDA_VSTD::uint32_t>(__size)),
              "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh))
            : "memory");
    } else {
        // memcpy_async_tx only supports copying from global to shared
        // or from shared to remote cluster dsmem. To copy to remote
        // dsmem, we need to arrive on a cluster-scoped barrier, which
        // is not yet implemented. So we trap in this case as well.
        _LIBCUDACXX_UNREACHABLE();
    }

    return async_contract_fulfillment::async;
}
#endif // __CUDA_MINIMUM_ARCH__

_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE