From 71a6b2957ae2446b715a67999e9125ebebbf5295 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Tue, 5 Sep 2023 20:40:00 +0200
Subject: [PATCH 01/18] Implement cuda::device::memcpy_async_tx

---
 .../detail/libcxx/include/__cuda/barrier.h    | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
index c4354ec020..39c64dc1fd 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
@@ -610,6 +610,42 @@ barrier<thread_scope_block>::arrival_token barrier_arrive_tx(
     );
     return __token;
 }
+
+template <typename T, _CUDA_VSTD::size_t Alignment>
+_LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx(
+    T* __dest,
+    const T* __src,
+    ::cuda::aligned_size_t<Alignment> __size,
+    ::cuda::barrier<::cuda::thread_scope_block> & __b) {
+    static_assert(16 <= Alignment, "mempcy_async_tx expects arguments to be at least 16 byte aligned.");
+
+    NV_DISPATCH_TARGET(
+        NV_PROVIDES_SM_90, (
+            if (__isShared(__dest) && __isGlobal(__src) && __isShared(barrier_native_handle(__b))) {
+                asm volatile(
+                    "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n"
+                    :
+                    : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))),
+                      "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))),
+                      "r"(static_cast<_CUDA_VSTD::uint32_t>(__size)),
+                      "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(::cuda::device::barrier_native_handle(__b))))
+                    : "memory");
+
+            } else {
+                // memcpy_async_tx only supports copying from global to shared
+                // or from shared to remote cluster dsmem. To copy to remote
+                // dsmem, we need to arrive on a cluster-scoped barrier, which
+                // is not yet implemented. So we trap in this case as well.
+                __trap();
+            }
+        ), NV_ANY_TARGET, (
+            // On architectures pre-SM90 (and in host code), arriving with a
+            // transaction count update is not supported and we trap.
+            __trap();
+        )
+    );
+    return async_contract_fulfillment::async;
+}
 #endif // __CUDA_MINIMUM_ARCH__
 
 _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE

From c9ec01992a48ac368e6f9900a0246eef360c3638 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Tue, 5 Sep 2023 21:01:32 +0200
Subject: [PATCH 02/18] Add documentation

---
 .../barrier/memcpy_async_tx.md                | 87 +++++++++++++++++++
 1 file changed, 87 insertions(+)
 create mode 100644 libcudacxx/docs/extended_api/synchronization_primitives/barrier/memcpy_async_tx.md

diff --git a/libcudacxx/docs/extended_api/synchronization_primitives/barrier/memcpy_async_tx.md b/libcudacxx/docs/extended_api/synchronization_primitives/barrier/memcpy_async_tx.md
new file mode 100644
index 0000000000..e86668f4c3
--- /dev/null
+++ b/libcudacxx/docs/extended_api/synchronization_primitives/barrier/memcpy_async_tx.md
@@ -0,0 +1,87 @@
+---
+grand_parent: Extended API
+parent: Barriers
+---
+
+# `cuda::device::memcpy_async_tx`
+
+Defined in header `<cuda/barrier>`:
+
+```cuda
+template <typename T, size_t Alignment>
+inline __device__
+async_contract_fulfillment
+cuda::device::memcpy_async_tx(
+  T* dest,
+  const T* src,
+  cuda::aligned_size_t<Alignment> size,
+  cuda::barrier<cuda::thread_scope_block>& bar);
+```
+
+Copies `size` bytes from global memory `src` to shared memory `dest` and arrives
+on a shared memory barrier `bar`, updating its transaction count by `size`
+bytes.
+ 
+## Notes
+
+The behavior is undefined if any of the following conditions hold: 
+- `bar` is not in `__shared__` memory;
+- `dest` is not 16-byte aligned
+- `src` is not 16-byte aligned
+- `size` is not a multiple of 16.
+
+This function can only be used under CUDA Compute Capability 9.0 (Hopper) or
+higher.
+
+There is no feature flag to check if `cuda::device::memcpy_async_tx` is
+available.
+
+## Return Value
+
+Returns `async_contract_fulfillment::async`.
+
+## Example
+
+```cuda
+#include <cuda/barrier>
+#include <cuda/std/utility> // cuda::std::move
+
+#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 900
+static_assert(false, "Insufficient CUDA Compute Capability: cuda::device::memcpy_async_tx is not available.");
+#endif // __CUDA_MINIMUM_ARCH__
+
+__device__ alignas(16) int gmem_x[2048];
+
+__global__ void example_kernel() {
+  __shared__ alignas(16) int smem_x[1024];
+  __shared__ cuda::barrier<cuda::thread_scope_block> bar;
+  if (threadIdx.x == 0) {
+    init(&bar, blockDim.x);
+  }
+
+  barrier::arrival_token token;
+  if (threadIdx.x == 0) {
+    cuda::device::memcpy_async_tx(smem_x, gmem_x, cuda::aligned_size_t<16>(sizeof(smem_x)), bar);
+    token = cuda::device::arrive_tx(bar, 1, sizeof(smem_x));
+  } else {
+    auto token = bar.arrive(1);
+  } 
+  bar.wait(cuda::std::move(token));
+
+  // smem_x contains the contents of gmem_x[0], ..., gmem_x[1023]
+  smem_x[threadIdx.x] += 1;
+}
+```
+
+[See it on Godbolt](https://godbolt.org/z/nTv558sK7){: .btn }
+
+
+[`cuda::thread_scope`]: ./memory_model.md
+[Tracking asynchronous operations by the mbarrier object]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tracking-asynchronous-operations-by-the-mbarrier-object
+
+[`cp.async.bulk` PTX instruction]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
+
+[thread.barrier.class paragraph 12]: https://eel.is/c++draft/thread.barrier.class#12
+
+
+

From 4b7d827151e36715f986377a26027f3e20516760 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Tue, 19 Sep 2023 16:40:42 +0200
Subject: [PATCH 03/18] Move docs

---
 libcudacxx/docs/extended_api/asynchronous_operations.md        | 3 ++-
 .../barrier => asynchronous_operations}/memcpy_async_tx.md     | 0
 2 files changed, 2 insertions(+), 1 deletion(-)
 rename libcudacxx/docs/extended_api/{synchronization_primitives/barrier => asynchronous_operations}/memcpy_async_tx.md (100%)

diff --git a/libcudacxx/docs/extended_api/asynchronous_operations.md b/libcudacxx/docs/extended_api/asynchronous_operations.md
index 03c85c2584..18343cae59 100644
--- a/libcudacxx/docs/extended_api/asynchronous_operations.md
+++ b/libcudacxx/docs/extended_api/asynchronous_operations.md
@@ -1,6 +1,7 @@
 ## Asynchronous Operations
 
-| [`cuda::memcpy_async`] | Asynchronously copies one range to another. `(function template)` <br/><br/> 1.1.0 / CUDA 11.0 <br/> 1.2.0 / CUDA 11.1 (group & aligned overloads) |
+| [`cuda::memcpy_async`]           | Asynchronously copies one range to another. `(function template)` <br/><br/> 1.1.0 / CUDA 11.0 <br/> 1.2.0 / CUDA 11.1 (group & aligned overloads) |
+| [`cuda::device_memcpy_async_tx`] | Asynchronously copies one range to another with manual transaction accounting. `(function)`                                                        |
 
 
 [`cuda::memcpy_async`]: {{ "extended_api/asynchronous_operations/memcpy_async.html" | relative_url }}
diff --git a/libcudacxx/docs/extended_api/synchronization_primitives/barrier/memcpy_async_tx.md b/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md
similarity index 100%
rename from libcudacxx/docs/extended_api/synchronization_primitives/barrier/memcpy_async_tx.md
rename to libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md

From 2cc403b821a4eb37da1b4d6b10179a866393331a Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Tue, 19 Sep 2023 16:41:54 +0200
Subject: [PATCH 04/18] Update docs

---
 .../memcpy_async_tx.md                        | 29 ++++++++++++++-----
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md b/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md
index e86668f4c3..b9b85ef66d 100644
--- a/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md
+++ b/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md
@@ -1,6 +1,6 @@
 ---
 grand_parent: Extended API
-parent: Barriers
+parent: Asynchronous Operations
 ---
 
 # `cuda::device::memcpy_async_tx`
@@ -21,21 +21,36 @@ cuda::device::memcpy_async_tx(
 Copies `size` bytes from global memory `src` to shared memory `dest` and arrives
 on a shared memory barrier `bar`, updating its transaction count by `size`
 bytes.
+
+## Preconditions
+
+* `src`, `dest` are 16-byte aligned and `size` is a multiple of 16, i.e.,
+  `Alignment >= 16`.
+* `dest` points to shared memory
+* `src` points to global memory
+* `bar` is located in shared memory
+* If either `destination` or `source` is an invalid or null pointer, the
+    behavior is undefined (even if `count` is zero).
+* If the objects are [potentially-overlapping] the behavior is undefined.
+* If the objects are not of [_TriviallyCopyable_] type the program is
+    ill-formed, no diagnostic required.
+
  
 ## Notes
 
-The behavior is undefined if any of the following conditions hold: 
-- `bar` is not in `__shared__` memory;
-- `dest` is not 16-byte aligned
-- `src` is not 16-byte aligned
-- `size` is not a multiple of 16.
-
 This function can only be used under CUDA Compute Capability 9.0 (Hopper) or
 higher.
 
 There is no feature flag to check if `cuda::device::memcpy_async_tx` is
 available.
 
+**Comparison to `cuda::memcpy_async`**: `memcpy_async_tx` supports a subset of
+the operations of `memcpy_async`. It gives more control over the synchronization
+with a barrier than `memcpy_async`. `memcpy_async_tx` has no synchronous
+fallback mechanism, so it can be used to ensure that the newest hardware
+features are used. The drawback is that it does not work on older hardware
+(pre-CUDA Compute Capability 9.0, i.e., Hopper).
+
 ## Return Value
 
 Returns `async_contract_fulfillment::async`.

From 014f83d07c971a494a3f23e1deff481660a71855 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Tue, 19 Sep 2023 16:42:10 +0200
Subject: [PATCH 05/18] Add test

---
 .../memcpy_async/memcpy_async_tx.pass.cpp     | 60 +++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 libcudacxx/.upstream-tests/test/cuda/memcpy_async/memcpy_async_tx.pass.cpp

diff --git a/libcudacxx/.upstream-tests/test/cuda/memcpy_async/memcpy_async_tx.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/memcpy_async/memcpy_async_tx.pass.cpp
new file mode 100644
index 0000000000..35d1a20142
--- /dev/null
+++ b/libcudacxx/.upstream-tests/test/cuda/memcpy_async/memcpy_async_tx.pass.cpp
@@ -0,0 +1,60 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: libcpp-has-no-threads
+// UNSUPPORTED: pre-sm-90
+
+// <cuda/barrier>
+
+#include <cuda/barrier>
+#include <cuda/std/utility> // cuda::std::move
+#include "test_macros.h" // TEST_NV_DIAG_SUPPRESS
+
+// Suppress warning about barrier in shared memory
+TEST_NV_DIAG_SUPPRESS(static_var_with_dynamic_init)
+
+#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 900
+static_assert(false, "Insufficient CUDA Compute Capability: cuda::device::memcpy_async_tx is not available.");
+#endif // __CUDA_MINIMUM_ARCH__
+
+__device__ alignas(16) int gmem_x[2048];
+
+
+int main(int, char**)
+{
+    NV_IF_TARGET(NV_IS_HOST,(
+        //Required by concurrent_agents_launch to know how many we're launching
+        cuda_thread_count = 512;
+    ));
+
+    NV_DISPATCH_TARGET(
+        NV_IS_DEVICE, (
+            using barrier_t = cuda::barrier<cuda::thread_scope_block>;
+            __shared__ alignas(16) int smem_x[1024];
+            __shared__ barrier_t bar;
+            if (threadIdx.x == 0) {
+                init(&bar, blockDim.x);
+            }
+
+            barrier_t::arrival_token token;
+            if (threadIdx.x == 0) {
+                cuda::device::memcpy_async_tx(smem_x, gmem_x, cuda::aligned_size_t<16>(sizeof(smem_x)), bar);
+                token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem_x));
+            } else {
+                auto token = bar.arrive(1);
+            }
+            bar.wait(cuda::std::move(token));
+
+            // smem_x contains the contents of gmem_x[0], ..., gmem_x[1023]
+            smem_x[threadIdx.x] += 1;
+        )
+    );
+    return 0;
+}

From e0ca3f13ded9b17f1a277ef1f98ccced57b8ccbe Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Tue, 19 Sep 2023 16:42:19 +0200
Subject: [PATCH 06/18] Uglify parameters

---
 .../std/detail/libcxx/include/__cuda/barrier.h    | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
index 39c64dc1fd..6a39103479 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
@@ -611,24 +611,25 @@ barrier<thread_scope_block>::arrival_token barrier_arrive_tx(
     return __token;
 }
 
-template <typename T, _CUDA_VSTD::size_t Alignment>
+template <typename _Tp, _CUDA_VSTD::size_t _Alignment>
 _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx(
-    T* __dest,
-    const T* __src,
-    ::cuda::aligned_size_t<Alignment> __size,
+    _Tp* __dest,
+    const _Tp* __src,
+    ::cuda::aligned_size_t<_Alignment> __size,
     ::cuda::barrier<::cuda::thread_scope_block> & __b) {
-    static_assert(16 <= Alignment, "mempcy_async_tx expects arguments to be at least 16 byte aligned.");
+    static_assert(16 <= _Alignment, "mempcy_async_tx expects arguments to be at least 16 byte aligned.");
 
     NV_DISPATCH_TARGET(
         NV_PROVIDES_SM_90, (
-            if (__isShared(__dest) && __isGlobal(__src) && __isShared(barrier_native_handle(__b))) {
+            auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b));
+            if (__isShared(__dest) && __isGlobal(__src)) {
                 asm volatile(
                     "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n"
                     :
                     : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))),
                       "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))),
                       "r"(static_cast<_CUDA_VSTD::uint32_t>(__size)),
-                      "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(::cuda::device::barrier_native_handle(__b))))
+                      "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh))
                     : "memory");
 
             } else {

From a75fa1455987a8a3a3a61b33195bd907c1d3cced Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Tue, 19 Sep 2023 16:46:12 +0200
Subject: [PATCH 07/18] Fix code example

---
 .../extended_api/asynchronous_operations/memcpy_async_tx.md  | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md b/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md
index b9b85ef66d..96d260792c 100644
--- a/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md
+++ b/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md
@@ -73,11 +73,12 @@ __global__ void example_kernel() {
   if (threadIdx.x == 0) {
     init(&bar, blockDim.x);
   }
+  __syncthreads();
 
   barrier::arrival_token token;
   if (threadIdx.x == 0) {
     cuda::device::memcpy_async_tx(smem_x, gmem_x, cuda::aligned_size_t<16>(sizeof(smem_x)), bar);
-    token = cuda::device::arrive_tx(bar, 1, sizeof(smem_x));
+    token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem_x));
   } else {
     auto token = bar.arrive(1);
   } 
@@ -88,7 +89,7 @@ __global__ void example_kernel() {
 }
 ```
 
-[See it on Godbolt](https://godbolt.org/z/nTv558sK7){: .btn }
+[See it on Godbolt](https://godbolt.org/z/oK7Tazszx){: .btn }
 
 
 [`cuda::thread_scope`]: ./memory_model.md

From 13e3c167d847ce14b40824dde84d8612f0cb16f3 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Tue, 19 Sep 2023 17:06:21 +0200
Subject: [PATCH 08/18] Assert preconditions

---
 .../include/cuda/std/detail/libcxx/include/__cuda/barrier.h   | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
index 6a39103479..85e300e6d6 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
@@ -619,6 +619,10 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx(
     ::cuda::barrier<::cuda::thread_scope_block> & __b) {
     static_assert(16 <= _Alignment, "mempcy_async_tx expects arguments to be at least 16 byte aligned.");
 
+    _LIBCUDACXX_DEBUG_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory.");
+    _LIBCUDACXX_DEBUG_ASSERT(__isShared(__dest), "dest must point to shared memory.");
+    _LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "src must point to global memory.");
+
     NV_DISPATCH_TARGET(
         NV_PROVIDES_SM_90, (
             auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b));

From c42c28c4f09fb3a8693dfb1d27deae68e43a2eb2 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Fri, 29 Sep 2023 12:07:22 +0200
Subject: [PATCH 09/18] Fix memcpy_async_tx docs

---
 .../asynchronous_operations/memcpy_async_tx.md            | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md b/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md
index 96d260792c..e40c6ebf36 100644
--- a/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md
+++ b/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md
@@ -31,10 +31,10 @@ bytes.
 * `bar` is located in shared memory
 * If either `destination` or `source` is an invalid or null pointer, the
     behavior is undefined (even if `count` is zero).
-* If the objects are [potentially-overlapping] the behavior is undefined.
-* If the objects are not of [_TriviallyCopyable_] type the program is
-    ill-formed, no diagnostic required.
 
+## Requires
+
+* `is_trivially_copyable_v<T>` is true.
  
 ## Notes
 
@@ -80,7 +80,7 @@ __global__ void example_kernel() {
     cuda::device::memcpy_async_tx(smem_x, gmem_x, cuda::aligned_size_t<16>(sizeof(smem_x)), bar);
     token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem_x));
   } else {
-    auto token = bar.arrive(1);
+    token = bar.arrive(1);
   } 
   bar.wait(cuda::std::move(token));
 

From 759132e973d6380d9d3b9f2eb733e8c971f6111a Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Fri, 29 Sep 2023 12:07:44 +0200
Subject: [PATCH 10/18] Add static_assert to memcpy_async_tx

---
 .../cuda/std/detail/libcxx/include/__cuda/barrier.h       | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
index 85e300e6d6..dc2caad805 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
@@ -617,6 +617,13 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx(
     const _Tp* __src,
     ::cuda::aligned_size_t<_Alignment> __size,
     ::cuda::barrier<::cuda::thread_scope_block> & __b) {
+    // When compiling with NVCC and GCC 4.8, certain user defined types that _are_ trivially copyable are
+    // incorrectly classified as not trivially copyable. Remove this assertion to allow for their usage with
+    // memcpy_async when compiling with GCC 4.8.
+    // FIXME: remove the #if once GCC 4.8 is no longer supported.
+#if !defined(_LIBCUDACXX_COMPILER_GCC) || _GNUC_VER > 408
+    static_assert(_CUDA_VSTD::is_trivially_copyable<_Tp>::value, "memcpy_async_tx requires a trivially copyable type");
+#endif
     static_assert(16 <= _Alignment, "mempcy_async_tx expects arguments to be at least 16 byte aligned.");
 
     _LIBCUDACXX_DEBUG_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory.");
@@ -635,7 +642,6 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx(
                       "r"(static_cast<_CUDA_VSTD::uint32_t>(__size)),
                       "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh))
                     : "memory");
-
             } else {
                 // memcpy_async_tx only supports copying from global to shared
                 // or from shared to remote cluster dsmem. To copy to remote

From fc20864ef259432ff04a7e1dcbbd8958fe09efe7 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Fri, 29 Sep 2023 12:08:12 +0200
Subject: [PATCH 11/18] Add example link in barrier_arrive_tx docs

---
 .../synchronization_primitives/barrier/barrier_arrive_tx.md  | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/libcudacxx/docs/extended_api/synchronization_primitives/barrier/barrier_arrive_tx.md b/libcudacxx/docs/extended_api/synchronization_primitives/barrier/barrier_arrive_tx.md
index cc311ec46f..f29bc3e7e3 100644
--- a/libcudacxx/docs/extended_api/synchronization_primitives/barrier/barrier_arrive_tx.md
+++ b/libcudacxx/docs/extended_api/synchronization_primitives/barrier/barrier_arrive_tx.md
@@ -50,6 +50,11 @@ below.
 
 ## Example
 
+Below example shows only `cuda::device::barrier_arrive_tx`. A more extensive
+example can be found in the
+[`cuda::device::memcpy_async_tx`](../../../asynchronous_operations/memcpy_async_tx.md)
+documentation.
+
 ```cuda
 #include <cuda/barrier>
 #include <cuda/std/utility> // cuda::std::move

From f81c97e253d802fc418f94c745b4f6e27e7347e7 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Fri, 29 Sep 2023 12:08:57 +0200
Subject: [PATCH 12/18] Fix memcpy_async_tx test

---
 .../cuda/memcpy_async/memcpy_async_tx.pass.cpp  | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/libcudacxx/.upstream-tests/test/cuda/memcpy_async/memcpy_async_tx.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/memcpy_async/memcpy_async_tx.pass.cpp
index 35d1a20142..fd332d744d 100644
--- a/libcudacxx/.upstream-tests/test/cuda/memcpy_async/memcpy_async_tx.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/cuda/memcpy_async/memcpy_async_tx.pass.cpp
@@ -43,17 +43,26 @@ int main(int, char**)
                 init(&bar, blockDim.x);
             }
 
+            // Initialize gmem_x
+            for (int i = threadIdx.x; i < 2048; i += blockDim.x) {
+                gmem_x[i] = i;
+            }
+            __syncthreads();
+
             barrier_t::arrival_token token;
             if (threadIdx.x == 0) {
-                cuda::device::memcpy_async_tx(smem_x, gmem_x, cuda::aligned_size_t<16>(sizeof(smem_x)), bar);
+                auto fulfillment = cuda::device::memcpy_async_tx(smem_x, gmem_x, cuda::aligned_size_t<16>(sizeof(smem_x)), bar);
+                assert(fulfillment == cuda::async_contract_fulfillment::async);
                 token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem_x));
             } else {
-                auto token = bar.arrive(1);
+                token = bar.arrive(1);
             }
             bar.wait(cuda::std::move(token));
 
-            // smem_x contains the contents of gmem_x[0], ..., gmem_x[1023]
-            smem_x[threadIdx.x] += 1;
+            // assert that smem_x contains the contents of gmem_x[0], ..., gmem_x[1023]
+            for (int i = threadIdx.x; i < 1024; i += blockDim.x) {
+                assert(smem_x[i] == i);
+            }
         )
     );
     return 0;

From 043e11700a6981e8a8f9e4c5809177e1f5685968 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <allard@allardhendriksen.nl>
Date: Fri, 29 Sep 2023 12:09:14 +0200
Subject: [PATCH 13/18] Apply suggestions from code review

Co-authored-by: gonzalobg <65027571+gonzalobg@users.noreply.github.com>
---
 .../asynchronous_operations/memcpy_async_tx.md      | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md b/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md
index e40c6ebf36..edf42bec0b 100644
--- a/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md
+++ b/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md
@@ -18,16 +18,14 @@ cuda::device::memcpy_async_tx(
   cuda::barrier<cuda::thread_scope_block>& bar);
 ```
 
-Copies `size` bytes from global memory `src` to shared memory `dest` and arrives
-on a shared memory barrier `bar`, updating its transaction count by `size`
-bytes.
+Copies `size` bytes from global memory `src` to shared memory `dest` and decrements the transaction count of `bar` by `size` bytes.
 
 ## Preconditions
 
 * `src`, `dest` are 16-byte aligned and `size` is a multiple of 16, i.e.,
   `Alignment >= 16`.
-* `dest` points to shared memory
-* `src` points to global memory
+* `dest` points to a shared memory allocation that is at least `size` bytes wide.
+* `src` points to a global memory allocation that is at least `size` bytes wide.
 * `bar` is located in shared memory
 * If either `destination` or `source` is an invalid or null pointer, the
     behavior is undefined (even if `count` is zero).
@@ -46,9 +44,8 @@ available.
 
 **Comparison to `cuda::memcpy_async`**: `memcpy_async_tx` supports a subset of
 the operations of `memcpy_async`. It gives more control over the synchronization
-with a barrier than `memcpy_async`. `memcpy_async_tx` has no synchronous
-fallback mechanism, so it can be used to ensure that the newest hardware
-features are used. The drawback is that it does not work on older hardware
+with a barrier than `memcpy_async`. Currently, `memcpy_async_tx` has no synchronous
+fallback mechanism., i.e., it currently does not work on older hardware
 (pre-CUDA Compute Capability 9.0, i.e., Hopper).
 
 ## Return Value

From 760c5925f1e2a35bf9d6da8c3b193a669c658d09 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Mon, 2 Oct 2023 20:25:34 +0200
Subject: [PATCH 14/18] Link to barrier_arrive_tx

---
 .../docs/extended_api/synchronization_primitives/barrier.md    | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libcudacxx/docs/extended_api/synchronization_primitives/barrier.md b/libcudacxx/docs/extended_api/synchronization_primitives/barrier.md
index 6a8912a55f..7acfb95d91 100644
--- a/libcudacxx/docs/extended_api/synchronization_primitives/barrier.md
+++ b/libcudacxx/docs/extended_api/synchronization_primitives/barrier.md
@@ -92,7 +92,8 @@ Capability 9).
 The tx-count of `cuda::barrier` must be set to the total amount of asynchronous
 memory operations, in units as specified by the asynchronous operations, to be
 tracked by the current phase. This can be achieved with the
-`cuda::device::barrier_arrive_tx` function call. Upon completion of each of the
+[`cuda::device::barrier_arrive_tx`](./barrier/barrier_arrive_tx.md) function call.
+Upon completion of each of the
 asynchronous operations, the tx-count of the `cuda::barrier` will be updated and
 thus progress the `cuda::barrier` towards the completion of the current phase.
 This may complete the current phase.

From 1e52fdb02fccb83c1ce3412344b8bc05714b9510 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Mon, 2 Oct 2023 20:41:49 +0200
Subject: [PATCH 15/18] Add _LIBCUDACXX_UNREACHABLE

---
 .../detail/libcxx/include/__cuda/barrier.h    | 45 +++++++++----------
 1 file changed, 20 insertions(+), 25 deletions(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
index dc2caad805..4af49c9ef5 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
@@ -23,6 +23,8 @@
 #pragma GCC system_header
 #endif
 
+#include "../cstdlib"           // _LIBCUDACXX_UNREACHABLE
+
 #if defined(_LIBCUDACXX_COMPILER_NVRTC)
 #define _LIBCUDACXX_OFFSET_IS_ZERO(type, member) !(&(((type *)0)->member))
 #else
@@ -630,31 +632,24 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx(
     _LIBCUDACXX_DEBUG_ASSERT(__isShared(__dest), "dest must point to shared memory.");
     _LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "src must point to global memory.");
 
-    NV_DISPATCH_TARGET(
-        NV_PROVIDES_SM_90, (
-            auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b));
-            if (__isShared(__dest) && __isGlobal(__src)) {
-                asm volatile(
-                    "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n"
-                    :
-                    : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))),
-                      "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))),
-                      "r"(static_cast<_CUDA_VSTD::uint32_t>(__size)),
-                      "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh))
-                    : "memory");
-            } else {
-                // memcpy_async_tx only supports copying from global to shared
-                // or from shared to remote cluster dsmem. To copy to remote
-                // dsmem, we need to arrive on a cluster-scoped barrier, which
-                // is not yet implemented. So we trap in this case as well.
-                __trap();
-            }
-        ), NV_ANY_TARGET, (
-            // On architectures pre-SM90 (and in host code), arriving with a
-            // transaction count update is not supported and we trap.
-            __trap();
-        )
-    );
+    auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b));
+    if (__isShared(__dest) && __isGlobal(__src)) {
+        asm volatile(
+            "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n"
+            :
+            : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))),
+              "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))),
+              "r"(static_cast<_CUDA_VSTD::uint32_t>(__size)),
+              "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh))
+            : "memory");
+    } else {
+        // memcpy_async_tx only supports copying from global to shared
+        // or from shared to remote cluster dsmem. To copy to remote
+        // dsmem, we need to arrive on a cluster-scoped barrier, which
+        // is not yet implemented. So we trap in this case as well.
+        _LIBCUDACXX_UNREACHABLE();
+    }
+
     return async_contract_fulfillment::async;
 }
 #endif // __CUDA_MINIMUM_ARCH__

From b8866855dae9f11a9702649efa55a7b471494051 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Mon, 2 Oct 2023 20:42:11 +0200
Subject: [PATCH 16/18] Use shared_memory_selector

---
 .../memcpy_async/memcpy_async_tx.pass.cpp     | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/libcudacxx/.upstream-tests/test/cuda/memcpy_async/memcpy_async_tx.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/memcpy_async/memcpy_async_tx.pass.cpp
index fd332d744d..6614bfb51f 100644
--- a/libcudacxx/.upstream-tests/test/cuda/memcpy_async/memcpy_async_tx.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/cuda/memcpy_async/memcpy_async_tx.pass.cpp
@@ -14,8 +14,9 @@
 // <cuda/barrier>
 
 #include <cuda/barrier>
-#include <cuda/std/utility> // cuda::std::move
-#include "test_macros.h" // TEST_NV_DIAG_SUPPRESS
+#include <cuda/std/utility>      // cuda::std::move
+#include "test_macros.h"         // TEST_NV_DIAG_SUPPRESS
+#include "cuda_space_selector.h" // shared_memory_selector
 
 // Suppress warning about barrier in shared memory
 TEST_NV_DIAG_SUPPRESS(static_var_with_dynamic_init)
@@ -38,10 +39,9 @@ int main(int, char**)
         NV_IS_DEVICE, (
             using barrier_t = cuda::barrier<cuda::thread_scope_block>;
             __shared__ alignas(16) int smem_x[1024];
-            __shared__ barrier_t bar;
-            if (threadIdx.x == 0) {
-                init(&bar, blockDim.x);
-            }
+
+            shared_memory_selector<barrier_t, constructor_initializer> sel;
+            barrier_t* b = sel.construct(blockDim.x);
 
             // Initialize gmem_x
             for (int i = threadIdx.x; i < 2048; i += blockDim.x) {
@@ -51,13 +51,13 @@ int main(int, char**)
 
             barrier_t::arrival_token token;
             if (threadIdx.x == 0) {
-                auto fulfillment = cuda::device::memcpy_async_tx(smem_x, gmem_x, cuda::aligned_size_t<16>(sizeof(smem_x)), bar);
+                auto fulfillment = cuda::device::memcpy_async_tx(smem_x, gmem_x, cuda::aligned_size_t<16>(sizeof(smem_x)), *b);
                 assert(fulfillment == cuda::async_contract_fulfillment::async);
-                token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem_x));
+                token = cuda::device::barrier_arrive_tx(*b, 1, sizeof(smem_x));
             } else {
-                token = bar.arrive(1);
+                token = b->arrive(1);
             }
-            bar.wait(cuda::std::move(token));
+            b->wait(cuda::std::move(token));
 
             // assert that smem_x contains the contents of gmem_x[0], ..., gmem_x[1023]
             for (int i = threadIdx.x; i < 1024; i += blockDim.x) {

From 0de28d61a2a0c98d2caa0db18f6f1390aa0b873e Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <allard@allardhendriksen.nl>
Date: Thu, 12 Oct 2023 12:11:43 +0200
Subject: [PATCH 17/18] Update
 libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md

Co-authored-by: gonzalobg <65027571+gonzalobg@users.noreply.github.com>
---
 .../extended_api/asynchronous_operations/memcpy_async_tx.md | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md b/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md
index edf42bec0b..5963e0db44 100644
--- a/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md
+++ b/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md
@@ -88,13 +88,7 @@ __global__ void example_kernel() {
 
 [See it on Godbolt](https://godbolt.org/z/oK7Tazszx){: .btn }
 
-
 [`cuda::thread_scope`]: ./memory_model.md
 [Tracking asynchronous operations by the mbarrier object]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tracking-asynchronous-operations-by-the-mbarrier-object
-
 [`cp.async.bulk` PTX instruction]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
-
 [thread.barrier.class paragraph 12]: https://eel.is/c++draft/thread.barrier.class#12
-
-
-

From c99ab075cf64b02ce067d78c37fb9504dfac559a Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Thu, 12 Oct 2023 21:17:29 +0200
Subject: [PATCH 18/18] Document void return type for consistency

---
 .../asynchronous_operations/memcpy_async_tx.md             | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md b/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md
index 5963e0db44..a0033dd8df 100644
--- a/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md
+++ b/libcudacxx/docs/extended_api/asynchronous_operations/memcpy_async_tx.md
@@ -10,8 +10,7 @@ Defined in header `<cuda/barrier>`:
 ```cuda
 template <typename T, size_t Alignment>
 inline __device__
-async_contract_fulfillment
-cuda::device::memcpy_async_tx(
+void cuda::device::memcpy_async_tx(
   T* dest,
   const T* src,
   cuda::aligned_size_t<Alignment> size,
@@ -48,10 +47,6 @@ with a barrier than `memcpy_async`. Currently, `memcpy_async_tx` has no synchron
 fallback mechanism., i.e., it currently does not work on older hardware
 (pre-CUDA Compute Capability 9.0, i.e., Hopper).
 
-## Return Value
-
-Returns `async_contract_fulfillment::async`.
-
 ## Example
 
 ```cuda