diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx.h b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx.h
new file mode 100644
index 0000000000..9fcc2130dd
--- /dev/null
+++ b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx.h
@@ -0,0 +1,83 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+#ifndef TEST_ARRIVE_TX_H_
+#define TEST_ARRIVE_TX_H_
+
+#include <cuda/barrier>
+
+#include <cuda/std/utility> // cuda::std::move
+
+#include "concurrent_agents.h"
+#include "cuda_space_selector.h"
+#include "test_macros.h"
+
+// Suppress warning about barrier in shared memory
+TEST_NV_DIAG_SUPPRESS(static_var_with_dynamic_init)
+
+template <typename Barrier>
+inline __device__
+void mbarrier_complete_tx(
+    Barrier &b, int transaction_count)
+{
+    NV_DISPATCH_TARGET(
+        NV_PROVIDES_SM_90, (
+            if (__isShared(cuda::device::barrier_native_handle(b))) {
+                asm volatile(
+                    "mbarrier.complete_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
+                    :
+                    : "r"((unsigned int) __cvta_generic_to_shared(cuda::device::barrier_native_handle(b))),
+                      "r"(transaction_count)
+                    : "memory");
+            } else {
+                __trap();
+            }
+        ), NV_ANY_TARGET, (
+            // On architectures pre-SM90 (and on host), we drop the transaction count
+            // update. The barriers do not keep track of transaction counts.
+            __trap();
+        )
+    );
+}
+
+template <typename Barrier>
+__device__
+void thread(Barrier& b, int arrives_per_thread)
+{
+    constexpr int tx_count = 1;
+    auto tok = cuda::device::barrier_arrive_tx(b, arrives_per_thread, tx_count);
+    // Manually increase the transaction count of the barrier.
+    mbarrier_complete_tx(b, tx_count);
+
+    b.wait(cuda::std::move(tok));
+}
+
+__device__
+void test()
+{
+    NV_DISPATCH_TARGET(
+        NV_IS_DEVICE, (
+            // Run all threads, each arriving with arrival count 1
+            constexpr auto block = cuda::thread_scope_block;
+
+            __shared__ cuda::barrier<block> bar_1;
+            init(&bar_1, (int) blockDim.x);
+            __syncthreads();
+            thread(bar_1, 1);
+
+            // Run all threads, each arriving with arrival count 2
+            __shared__ cuda::barrier<block> bar_2;
+            init(&bar_2, (int) 2 * blockDim.x);
+            __syncthreads();
+            thread(bar_2, 2);
+        )
+    );
+}
+
+#endif // TEST_ARRIVE_TX_H_
diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_cluster.runfail.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_cluster.runfail.cpp
new file mode 100644
index 0000000000..8f82188e41
--- /dev/null
+++ b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_cluster.runfail.cpp
@@ -0,0 +1,50 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: libcpp-has-no-threads +// UNSUPPORTED: pre-sm-90 + +// + +#include +#include +#include "test_macros.h" + +// Suppress warning about barrier in shared memory +TEST_NV_DIAG_SUPPRESS(static_var_with_dynamic_init) + +int main(int, char**){ + NV_DISPATCH_TARGET( + NV_IS_HOST, ( + // When PR #416 is merged, uncomment this line: + // cuda_cluster_size = 2; + ), + NV_IS_DEVICE, ( + __shared__ cuda::barrier bar; + + if (threadIdx.x == 0) { + init(&bar, blockDim.x); + } + namespace cg = cooperative_groups; + auto cluster = cg::this_cluster(); + + cluster.sync(); + + // This test currently fails at this point because support for + // clusters has not yet been added. + cuda::barrier *remote_bar; + remote_bar = cluster.map_shared_rank(&bar, cluster.block_rank() ^ 1); + + // When PR #416 is merged, this should fail here because the barrier + // is in device memory. + auto token = cuda::device::barrier_arrive_tx(*remote_bar, 1, 0); + )); + return 0; +} diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_cta.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_cta.pass.cpp new file mode 100644 index 0000000000..9c67e23c6f --- /dev/null +++ b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_cta.pass.cpp @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: libcpp-has-no-threads +// UNSUPPORTED: pre-sm-90 + +// + +#include "arrive_tx.h" + +int main(int, char**) +{ + NV_DISPATCH_TARGET( + NV_IS_HOST, ( + // Required by concurrent_agents_launch to know how many we're + // launching. This can only be an int, because the nvrtc tests use grep + // to figure out how many threads to launch. + cuda_thread_count = 256; + ), + NV_IS_DEVICE, ( + test(); + ) + ); + + return 0; +} diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_device.runfail.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_device.runfail.cpp new file mode 100644 index 0000000000..bcf6dcf622 --- /dev/null +++ b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_device.runfail.cpp @@ -0,0 +1,39 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: libcpp-has-no-threads +// UNSUPPORTED: pre-sm-90 + +// + +#include +#include "test_macros.h" + +// Suppress warning about barrier in shared memory +TEST_NV_DIAG_SUPPRESS(static_var_with_dynamic_init) + +__device__ uint64_t bar_storage; + +int main(int, char**){ + NV_IF_TARGET( + NV_IS_DEVICE, ( + cuda::barrier *bar_ptr; + bar_ptr = reinterpret_cast *>(bar_storage); + + if (threadIdx.x == 0) { + init(bar_ptr, blockDim.x); + } + __syncthreads(); + + // Should fail because the barrier is in device memory. 
+ auto token = cuda::device::barrier_arrive_tx(*bar_ptr, 1, 0); + )); + return 0; +} diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_feature_test.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_feature_test.pass.cpp new file mode 100644 index 0000000000..1a40fcb78a --- /dev/null +++ b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_feature_test.pass.cpp @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: libcpp-has-no-threads +// UNSUPPORTED: pre-sm-90 + +// + +#include + +#ifndef __cccl_lib_local_barrier_arrive_tx +static_assert(false, "should define __cccl_lib_local_barrier_arrive_tx"); +#endif // __cccl_lib_local_barrier_arrive_tx + +int main(int, char**){ + return 0; +} diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp new file mode 100644 index 0000000000..d0102acec6 --- /dev/null +++ b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp @@ -0,0 +1,36 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: libcpp-has-no-threads +// UNSUPPORTED: nvrtc +// UNSUPPORTED: pre-sm-70 + +// + +#include + +int main(int, char**){ + NV_IF_TARGET( + NV_IS_DEVICE, ( + __shared__ cuda::barrier bar; + if (threadIdx.x == 0) { + init(&bar, blockDim.x); + } + __syncthreads(); + + // barrier_arrive_tx should fail on SM70 and SM80, because it is hidden. + auto token = cuda::device::barrier_arrive_tx(bar, 1, 0); + +#if defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__ + static_assert(false, "Fail manually for SM90 and up."); +#endif // __CUDA_MINIMUM_ARCH__ + )); + return 0; +} diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_thread.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_thread.pass.cpp new file mode 100644 index 0000000000..d1ac1d66de --- /dev/null +++ b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_thread.pass.cpp @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: libcpp-has-no-threads +// UNSUPPORTED: pre-sm-90 + +// + +#include "arrive_tx.h" + +int main(int, char**) +{ + NV_DISPATCH_TARGET( + NV_IS_HOST, ( + // Required by concurrent_agents_launch to know how many we're + // launching. This can only be an int, because the nvrtc tests use grep + // to figure out how many threads to launch. 
+ cuda_thread_count = 2; + ), + NV_IS_DEVICE, ( + test(); + ) + ); + + return 0; +} diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_warp.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_warp.pass.cpp new file mode 100644 index 0000000000..0c75474e17 --- /dev/null +++ b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_warp.pass.cpp @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: libcpp-has-no-threads +// UNSUPPORTED: pre-sm-90 + +// + +#include "arrive_tx.h" + +int main(int, char**) +{ + NV_DISPATCH_TARGET( + NV_IS_HOST, ( + // Required by concurrent_agents_launch to know how many we're + // launching. This can only be an int, because the nvrtc tests use grep + // to figure out how many threads to launch. 
+ cuda_thread_count = 32; + ), + NV_IS_DEVICE, ( + test(); + ) + ); + + return 0; +} diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier_init.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/init.pass.cpp similarity index 100% rename from libcudacxx/.upstream-tests/test/cuda/barrier_init.pass.cpp rename to libcudacxx/.upstream-tests/test/cuda/barrier/init.pass.cpp diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier_native_handle.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/native_handle.pass.cpp similarity index 100% rename from libcudacxx/.upstream-tests/test/cuda/barrier_native_handle.pass.cpp rename to libcudacxx/.upstream-tests/test/cuda/barrier/native_handle.pass.cpp diff --git a/libcudacxx/docs/extended_api/synchronization_primitives/barrier.md b/libcudacxx/docs/extended_api/synchronization_primitives/barrier.md index c20977c99b..3b057bf6c7 100644 --- a/libcudacxx/docs/extended_api/synchronization_primitives/barrier.md +++ b/libcudacxx/docs/extended_api/synchronization_primitives/barrier.md @@ -21,8 +21,9 @@ It has the same interface and semantics as [`cuda::std::barrier`], with the ## Barrier Operations -| [`cuda::barrier::init`] | Initialize a `cuda::barrier`. `(friend function)` | -| [`cuda::device::barrier_native_handle`] | Get the native handle to a `cuda::barrier`. `(function template)` | +| [`cuda::barrier::init`] | Initialize a `cuda::barrier`. `(friend function)` | +| [`cuda::device::barrier_native_handle`] | Get the native handle to a `cuda::barrier`. `(function)` | +| [`cuda::device::barrier_arrive_tx`] | Arrive on a `cuda::barrier` with transaction count update. `(function)` | ## NVCC `__shared__` Initialization Warnings @@ -60,6 +61,41 @@ Programs shall ensure that this transformation would not introduce errors, for Under CUDA Compute Capability 6 (Pascal) or prior, an object of type `cuda::barrier` or `cuda::std::barrier` may not be used. 
+## Shared memory barriers with transaction count + +In addition to the arrival count, a `cuda::barrier` object +located in shared memory supports a +[tx-count](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tracking-asynchronous-operations-by-the-mbarrier-object), +which is used for tracking the completion of some asynchronous memory operations or +transactions. +The tx-count tracks the number of asynchronous transactions, in +units specified by the asynchronous memory operation (typically bytes), that are +outstanding and yet to be complete. +This capability is exposed, starting with the Hopper architecture (CUDA Compute +Capability 9). + +The tx-count of `cuda::barrier` must be set to the total amount of asynchronous +memory operations, in units as specified by the asynchronous operations, to be +tracked by the current phase. This can be achieved with the +`cuda::device::barrier_arrive_tx` function call. Upon completion of each of the +asynchronous operations, the tx-count of the `cuda::barrier` will be updated and +thus progress the `cuda::barrier` towards the completion of the current phase. +This may complete the current phase. + +### Phase Completion of a `cuda::barrier` with tx-count support + +Modify [[thread.barrier.class]](http://eel.is/c++draft/thread.barrier.class) as follows: + +> A barrier is a thread coordination mechanism whose lifetime consists of a sequence of barrier phases, where each phase allows at most an expected number of threads to block until the expected number of threads **and the expected number of transaction-based asynchronous operations** arrive at the barrier. +> Each _barrier phase_ consists of the following steps: +> +> 1. The _expected count_ is decremented by each call to `arrive`,`arrive_and_drop`**, or `cuda::device::barrier_arrive_tx`**. +> 2. 
**The _transaction count_ is incremented by each call to `cuda::device::barrier_arrive_tx` and decremented by the completion of transaction-based asynchronous operations such as `cuda::memcpy_async_tx`.** +> 3. Exactly once after **both** the _expected count_ **and the _transaction count_** reach zero, a thread executes the _completion step_ during its call to `arrive`, `arrive_and_drop`, or `wait`, except that it is implementation-defined whether the step executes if no thread calls `wait`. +> 4. When the completion step finishes, the _expected count_ is reset to what was specified by the `expected` argument to the constructor, possibly adjusted by calls to `arrive_and_drop`, **the _transaction count_ is reset to zero,** and the next phase starts. +> +> Concurrent invocations of the member functions of barrier **and the non-member barrier APIs in `cuda::device`**, other than its destructor, do not introduce data races. The member functions `arrive` and `arrive_and_drop`, **and the non-member function `cuda::device::barrier_arrive_tx`**, execute atomically. 
+
 ## Implementation-Defined Behavior
 
 For each [`cuda::thread_scope`] `S` and `CompletionFunction` `F`, the value of
@@ -98,6 +134,7 @@ __global__ void example_kernel() {
 
 [`cuda::barrier::init`]: ./barrier/init.md
 [`cuda::device::barrier_native_handle`]: ./barrier/barrier_native_handle.md
+[`cuda::device::barrier_arrive_tx`]: ./barrier/barrier_arrive_tx.md
 [`cuda::std::barrier`]: https://en.cppreference.com/w/cpp/thread/barrier
 
diff --git a/libcudacxx/docs/extended_api/synchronization_primitives/barrier/barrier_arrive_tx.md b/libcudacxx/docs/extended_api/synchronization_primitives/barrier/barrier_arrive_tx.md
new file mode 100644
index 0000000000..cc311ec46f
--- /dev/null
+++ b/libcudacxx/docs/extended_api/synchronization_primitives/barrier/barrier_arrive_tx.md
@@ -0,0 +1,80 @@
+---
+grand_parent: Extended API
+parent: Barriers
+---
+
+# `cuda::device::barrier_arrive_tx`
+
+Defined in header `<cuda/barrier>`:
+
+```cuda
+__device__
+cuda::barrier<cuda::thread_scope_block>::arrival_token
+cuda::device::barrier_arrive_tx(
+  cuda::barrier<cuda::thread_scope_block>& bar,
+  ptrdiff_t arrive_count_update,
+  ptrdiff_t transaction_count_update);
+```
+
+Arrives at a barrier in shared memory, updating both the arrival count and
+transaction count.
+
+## Preconditions
+
+* `__isShared(&bar) == true`
+* `1 <= arrive_count_update && arrive_count_update <= (1 << 20) - 1`
+* `0 <= transaction_count_update && transaction_count_update <= (1 << 20) - 1`
+
+
+## Effects
+
+* This function constructs an arrival_token object associated with the phase
+  synchronization point for the current phase. Then, decrements the expected
+  arrival count by `arrive_count_update` and increments the expected transaction
+  count by `transaction_count_update`.
+* This function executes atomically. The call to this function strongly
+  happens-before the start of the phase completion step for the current phase.
+
+## Notes
+
+This function can only be used under CUDA Compute Capability 9.0 (Hopper) or
+higher.
+ +To check if `cuda::device::barrier_arrive_tx` is available, use the +`__cccl_lib_local_barrier_arrive_tx` feature flag, as shown in the example code +below. + +## Return Value + +`cuda::device::barrier_arrive_tx` returns the constructed `arrival_token` object. + +## Example + +```cuda +#include +#include // cuda::std::move + +#ifndef __cccl_lib_local_barrier_arrive_tx +static_assert(false, "Insufficient libcu++ version: cuda::device::arrive_tx is not yet available."); +#endif // __cccl_lib_local_barrier_arrive_tx + +__global__ void example_kernel() { + __shared__ cuda::barrier bar; + if (threadIdx.x == 0) { + init(&bar, blockDim.x); + } + __syncthreads(); + + auto token = cuda::device::barrier_arrive_tx(bar, 1, 0); + + bar.wait(cuda::std::move(token)); +} +``` + +[See it on Godbolt](https://godbolt.org/z/1vxcGrT8j){: .btn } + + +[`cuda::thread_scope`]: ./memory_model.md +[Tracking asynchronous operations by the mbarrier object]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tracking-asynchronous-operations-by-the-mbarrier-object +[thread.barrier.class paragraph 12]: https://eel.is/c++draft/thread.barrier.class#12 + diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h index 3dac578901..c4354ec020 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h @@ -67,18 +67,13 @@ class barrier : public _CUDA_VSTD::__barrier_base<_CompletionF, _Sco> { _LIBCUDACXX_INLINE_VISIBILITY friend void init(barrier * __b, _CUDA_VSTD::ptrdiff_t __expected) { -#if (_LIBCUDACXX_DEBUG_LEVEL >= 2) - _LIBCUDACXX_DEBUG_ASSERT(__expected >= 0); -#endif - + _LIBCUDACXX_DEBUG_ASSERT(__expected >= 0, "Cannot initialize barrier with negative arrival count"); new (__b) barrier(__expected); } _LIBCUDACXX_INLINE_VISIBILITY friend void init(barrier * __b, _CUDA_VSTD::ptrdiff_t __expected, 
_CompletionF __completion) { -#if (_LIBCUDACXX_DEBUG_LEVEL >= 2) - _LIBCUDACXX_DEBUG_ASSERT(__expected >= 0); -#endif + _LIBCUDACXX_DEBUG_ASSERT(__expected >= 0, "Cannot initialize barrier with negative arrival count"); new (__b) barrier(__expected, __completion); } }; @@ -182,10 +177,7 @@ friend class _CUDA_VSTD::__barrier_poll_tester_parity; _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_INLINE_VISIBILITY arrival_token arrive(_CUDA_VSTD::ptrdiff_t __update = 1) { -#if (_LIBCUDACXX_DEBUG_LEVEL >= 2) - _LIBCUDACXX_DEBUG_ASSERT(__update >= 0); - _LIBCUDACXX_DEBUG_ASSERT(__expected_unit >=0); -#endif + _LIBCUDACXX_DEBUG_ASSERT(__update >= 0, "Arrival count update must be non-negative."); arrival_token __token = {}; NV_DISPATCH_TARGET( NV_PROVIDES_SM_90, ( @@ -561,6 +553,65 @@ inline _CUDA_VSTD::uint64_t * barrier_native_handle(barrier return reinterpret_cast<_CUDA_VSTD::uint64_t *>(&b.__barrier); } + +// Hide arrive_tx when CUDA architecture is insufficient. Note the +// (!defined(__CUDA_MINIMUM_ARCH__)). This is required to make sure the function +// does not get removed by cudafe, which does not define __CUDA_MINIMUM_ARCH__. 
+#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))
+
+_LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_DEVICE inline
+barrier<thread_scope_block>::arrival_token barrier_arrive_tx(
+    barrier<thread_scope_block> & __b,
+    _CUDA_VSTD::ptrdiff_t __arrive_count_update,
+    _CUDA_VSTD::ptrdiff_t __transaction_count_update) {
+
+    _LIBCUDACXX_DEBUG_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory.");
+    _LIBCUDACXX_DEBUG_ASSERT(1 <= __arrive_count_update, "Arrival count update must be at least one.");
+    _LIBCUDACXX_DEBUG_ASSERT(__arrive_count_update <= (1 << 20) - 1, "Arrival count update cannot exceed 2^20 - 1.");
+    _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative.");
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object
+    _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1.");
+
+    barrier<thread_scope_block>::arrival_token __token = {};
+    NV_IF_TARGET(
+        // On architectures pre-sm90, arrive_tx is not supported.
+        NV_PROVIDES_SM_90, (
+            // We do not check for the statespace of the barrier here. This is
+            // on purpose. This allows debugging tools like memcheck/racecheck
+            // to detect that we are passing a pointer with the wrong state
+            // space to mbarrier.arrive. If we checked for the state space here,
+            // and __trap() if wrong, then those tools would not be able to help
+            // us in release builds. In debug builds, the error would be caught
+            // by the asserts at the top of this function.
+ + auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b)); + if (__arrive_count_update == 1) { + asm ( + "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;" + : "=l"(__token) + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) + : "memory"); + } else { + asm ( + "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) + : "memory"); + asm ( + "mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1], %2;" + : "=l"(__token) + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__arrive_count_update)) + : "memory"); + } + ) + ); + return __token; +} +#endif // __CUDA_MINIMUM_ARCH__ + _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE _LIBCUDACXX_BEGIN_NAMESPACE_CUDA diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/barrier b/libcudacxx/include/cuda/std/detail/libcxx/include/barrier index b235b3db65..2f0afb3d2b 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/barrier +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/barrier @@ -318,9 +318,7 @@ public: auto const __result = __arrived.fetch_sub(__update, memory_order_acq_rel) - __update; auto const __new_expected = __expected.load(memory_order_relaxed); -#if (_LIBCUDACXX_DEBUG_LEVEL >= 2) - _LIBCUDACXX_DEBUG_ASSERT(__result >= 0); -#endif + _LIBCUDACXX_DEBUG_ASSERT(__result >= 0, ""); if(0 == __result) { __completion(); @@ -384,10 +382,11 @@ private: static _LIBCUDACXX_INLINE_VISIBILITY constexpr uint64_t __init(ptrdiff_t __count) noexcept { -#if (_LIBCUDACXX_DEBUG_LEVEL >= 2) - _LIBCUDACXX_DEBUG_ASSERT(__count >= 0); -#endif - +#if _LIBCUDACXX_STD_VER > 11 + // This debug assert is not supported in C++11 due to resulting in a + // multi-statement constexpr function. 
+ _LIBCUDACXX_DEBUG_ASSERT(__count >= 0, "Count must be non-negative."); +#endif // _LIBCUDACXX_STD_VER > 11 return (((1u << 31) - __count) << 32) | ((1u << 31) - __count); } @@ -414,9 +413,7 @@ public: _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR_AFTER_CXX11 __barrier_base(ptrdiff_t __count, __empty_completion = __empty_completion()) : __phase_arrived_expected(__init(__count)) { -#if (_LIBCUDACXX_DEBUG_LEVEL >= 2) - _LIBCUDACXX_DEBUG_ASSERT(__count >= 0); -#endif + _LIBCUDACXX_DEBUG_ASSERT(__count >= 0, ""); } ~__barrier_base() = default; diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/version b/libcudacxx/include/cuda/std/detail/libcxx/include/version index 5697bfd25e..f52f38ccdc 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/version +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/version @@ -219,6 +219,15 @@ __cpp_lib_void_t 201411L #pragma GCC system_header #endif +// We define certain feature test macros depending on availability. When +// __CUDA_MINIMUM_ARCH__ is not available, we define the following features +// unconditionally. This permits checking for the feature in host code. When +// __CUDA_MINIMUM_ARCH__ is available, we only enable the feature when the +// hardware supports it. +#if (!defined(__CUDA_MINIMUM_ARCH__)) || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) +# define __cccl_lib_local_barrier_arrive_tx +#endif + // We unconditionally define `__cccl_lib_meow` so that there is only one place to set the value #if _LIBCUDACXX_STD_VER > 11 # define __cccl_lib_chrono_udls 201304L