From 33590ac34165594baab8593dee2e7e42537b72c8 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 9 Feb 2024 11:47:25 +0100 Subject: [PATCH 1/8] Move `__cccl_ptx_isa` into `__cccl_config` We want this to be globally available --- .../ptx_isa.h} | 64 +++++++++++++------ .../std/detail/libcxx/include/__cccl_config | 1 + .../std/detail/libcxx/include/__cuda/ptx.h | 1 - ..._and_communication_instructions_mbarrier.h | 1 - .../cuda/std/detail/libcxx/include/version | 10 --- 5 files changed, 45 insertions(+), 32 deletions(-) rename libcudacxx/include/cuda/std/detail/libcxx/include/{__cuda/ptx/ptx_isa_target_macros.h => __cccl/ptx_isa.h} (60%) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h similarity index 60% rename from libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h rename to libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h index 5dcccc5d1f..28566f77d9 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h @@ -1,19 +1,18 @@ -// -*- C++ -*- //===----------------------------------------------------------------------===// // // Part of libcu++, the C++ Standard Library for your entire system, // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// +#ifndef __CCCL_PTX_ISA_H_ +#define __CCCL_PTX_ISA_H_ -#ifndef _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_ -#define _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_ - -#include // __CUDA_MINIMUM_ARCH__ and friends +#include "../__cccl/compiler.h" +#include "../__cccl/system_header.h" #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header @@ -23,6 +22,8 @@ # pragma system_header #endif // no system header +#include // __CUDA_MINIMUM_ARCH__ and friends + /* * Targeting macros * @@ -31,47 +32,70 @@ */ // PTX ISA 8.3 is available from CUDA 12.3, driver r545 -#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__)) +#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 830ULL // PTX ISA 8.2 is available from CUDA 12.2, driver r535 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 820ULL // PTX ISA 8.1 is available from CUDA 12.1, driver r530 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 810ULL // PTX ISA 8.0 is available from CUDA 12.0, driver r525 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && 
__CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 800ULL // PTX ISA 7.8 is available from CUDA 11.8, driver r520 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 780ULL // PTX ISA 7.7 is available from CUDA 11.7, driver r515 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 7)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 7)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 770ULL // PTX ISA 7.6 is available from CUDA 11.6, driver r510 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 6)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 6)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 760ULL // PTX ISA 7.5 is available from CUDA 11.5, driver r495 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 5)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 5)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 750ULL // PTX ISA 7.4 is available from CUDA 11.4, driver r470 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 4)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 4)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 740ULL // PTX ISA 7.3 is available from CUDA 11.3, driver r465 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 3)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 730ULL // PTX ISA 7.2 is available from CUDA 11.2, driver r460 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 2)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 720ULL // PTX ISA 7.1 is available from CUDA 11.1, driver r455 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 1)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 710ULL // PTX ISA 7.0 is available from CUDA 11.0, driver r445 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 0)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 700ULL // Fallback case. Define the ISA version to be zero. 
This ensures that the macro is always defined. #else # define __cccl_ptx_isa 0ULL #endif -#endif // _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_ +// We define certain feature test macros depending on availability. When +// __CUDA_MINIMUM_ARCH__ is not available, we define the following features +// unconditionally. This permits checking for the feature in host code. +// When __CUDA_MINIMUM_ARCH__ is available, we only enable the feature when the +// hardware supports it. +#if (!defined(__CUDA_MINIMUM_ARCH__)) || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) \ +# define __cccl_lib_local_barrier_arrive_tx +# define __cccl_lib_experimental_ctk12_cp_async_exposure +#endif + +#endif // __CCCL_PTX_ISA_H_ diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl_config b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl_config index bb3a5078dd..0e8612926f 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl_config +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl_config @@ -15,6 +15,7 @@ #include "__cccl/diagnostic.h" #include "__cccl/dialect.h" #include "__cccl/execution_space.h" +#include "__cccl/ptx_isa.h" #include "__cccl/system_header.h" #include "__cccl/version.h" #include "__cccl/visibility.h" diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h index c16965d23b..6936733858 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -26,7 +26,6 @@ #include // __CUDA_MINIMUM_ARCH__ and friends -#include "../__cuda/ptx/ptx_isa_target_macros.h" #include "../__cuda/ptx/ptx_dot_variants.h" #include "../__cuda/ptx/ptx_helper_functions.h" #include "../__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h" diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index c271d5588a..d8226d09f9 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -14,7 +14,6 @@ #include "ptx_dot_variants.h" #include "ptx_helper_functions.h" -#include "ptx_isa_target_macros.h" #include "../../cstdint" #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/version b/libcudacxx/include/cuda/std/detail/libcxx/include/version index ddfc85fc55..bb9f8cbbbf 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/version +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/version @@ -223,16 +223,6 @@ __cpp_lib_void_t 201411L # pragma system_header #endif // no system header -// We define certain feature test macros depending on availability. When -// __CUDA_MINIMUM_ARCH__ is not available, we define the following features -// unconditionally. This permits checking for the feature in host code. When -// __CUDA_MINIMUM_ARCH__ is available, we only enable the feature when the -// hardware supports it. 
-#if (!defined(__CUDA_MINIMUM_ARCH__)) || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) -# define __cccl_lib_local_barrier_arrive_tx -# define __cccl_lib_experimental_ctk12_cp_async_exposure -#endif - // We unconditionally define `__cccl_lib_meow` so that there is only one place to set the value #if _CCCL_STD_VER > 2011 # define __cccl_lib_chrono_udls 201304L From b6ab388a068a4947e53f131dbfe28052324961ad Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 9 Feb 2024 11:29:18 +0100 Subject: [PATCH 2/8] Guard ptx helper functions also on available PTX ISA Currently we only guard those instructions based on the available architecture. However, it is also valid to compile with an old toolkit for a new machine. Consequently we need to strengthen our checks against available PTX ISA --- libcudacxx/include/cuda/barrier | 4 +- .../detail/libcxx/include/__cccl/ptx_isa.h | 3 +- .../detail/libcxx/include/__cuda/barrier.h | 67 ++++++++++--------- ...arrive_tx_static_asserts_pre_sm90.fail.cpp | 4 +- 4 files changed, 40 insertions(+), 38 deletions(-) diff --git a/libcudacxx/include/cuda/barrier b/libcudacxx/include/cuda/barrier index 25b14a7f18..50bfe664e8 100644 --- a/libcudacxx/include/cuda/barrier +++ b/libcudacxx/include/cuda/barrier @@ -50,7 +50,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL // capability 9.0 and above. The check for (!defined(__CUDA_MINIMUM_ARCH__)) is // necessary to prevent cudafe from ripping out the device functions before // device compilation begins. -#if (!defined(__CUDA_MINIMUM_ARCH__)) || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) +#ifdef __cccl_lib_experimental_ctk12_cp_async_exposure // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk inline _LIBCUDACXX_DEVICE @@ -288,7 +288,7 @@ void cp_async_bulk_wait_group_read() : "memory"); } -#endif // __CUDA_MINIMUM_ARCH__ +#endif // __cccl_lib_experimental_ctk12_cp_async_exposure _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h index 28566f77d9..e1780381bf 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h @@ -90,10 +90,11 @@ // We define certain feature test macros depending on availability. When // __CUDA_MINIMUM_ARCH__ is not available, we define the following features -// unconditionally. This permits checking for the feature in host code. +// depending on PTX ISA. This permits checking for the feature in host code. // When __CUDA_MINIMUM_ARCH__ is available, we only enable the feature when the // hardware supports it. 
#if (!defined(__CUDA_MINIMUM_ARCH__)) || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) \ + && __cccl_isa_ptx >= 800 # define __cccl_lib_local_barrier_arrive_tx # define __cccl_lib_experimental_ctk12_cp_async_exposure #endif diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h index 3a0bcd10ea..0d1e2ea32b 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h @@ -572,10 +572,7 @@ inline _CUDA_VSTD::uint64_t * barrier_native_handle(barrier #if defined(_CCCL_CUDA_COMPILER) -// Hide arrive_tx when CUDA architecture is insufficient. Note the -// (!defined(__CUDA_MINIMUM_ARCH__)). This is required to make sure the function -// does not get removed by cudafe, which does not define __CUDA_MINIMUM_ARCH__. -#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) +#ifdef __cccl_lib_local_barrier_arrive_tx _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_DEVICE inline barrier::arrival_token barrier_arrive_tx( @@ -624,6 +621,34 @@ barrier::arrival_token barrier_arrive_tx( return __token; } +_LIBCUDACXX_DEVICE inline +void barrier_expect_tx( + barrier & __b, + _CUDA_VSTD::ptrdiff_t __transaction_count_update) { + + _LIBCUDACXX_DEBUG_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); + _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative."); + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object + _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1."); + + // We do not check for the statespace of the barrier here. This is + // on purpose. This allows debugging tools like memcheck/racecheck + // to detect that we are passing a pointer with the wrong state + // space to mbarrier.arrive. If we checked for the state space here, + // and __trap() if wrong, then those tools would not be able to help + // us in release builds. In debug builds, the error would be caught + // by the asserts at the top of this function. 
+ auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b)); + asm ( + "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) + : "memory"); +} +#endif // __cccl_lib_local_barrier_arrive_tx + +#ifdef __cccl_lib_experimental_ctk12_cp_async_exposure template _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx( _Tp* __dest, @@ -663,33 +688,7 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx( return async_contract_fulfillment::async; } - -_LIBCUDACXX_DEVICE inline -void barrier_expect_tx( - barrier & __b, - _CUDA_VSTD::ptrdiff_t __transaction_count_update) { - - _LIBCUDACXX_DEBUG_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); - _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative."); - // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object - _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1."); - - // We do not check for the statespace of the barrier here. This is - // on purpose. This allows debugging tools like memcheck/racecheck - // to detect that we are passing a pointer with the wrong state - // space to mbarrier.arrive. If we checked for the state space here, - // and __trap() if wrong, then those tools would not be able to help - // us in release builds. In debug builds, the error would be caught - // by the asserts at the top of this function. - auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b)); - asm ( - "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" - : - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) - : "memory"); -} -#endif // __CUDA_MINIMUM_ARCH__ +#endif // __cccl_lib_local_barrier_arrive_tx #endif // _CCCL_CUDA_COMPILER _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE @@ -929,7 +928,7 @@ struct __memcpy_completion_impl { * 5. 
normal synchronous copy (fallback) ***********************************************************************/ -#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) +#ifdef __cccl_lib_experimental_ctk12_cp_async_exposure template inline __device__ void __cp_async_bulk_shared_global(const _Group &__g, char * __dest, const char * __src, size_t __size, uint64_t *__bar_handle) { @@ -945,7 +944,7 @@ void __cp_async_bulk_shared_global(const _Group &__g, char * __dest, const char : "memory"); } } -#endif // __CUDA_MINIMUM_ARCH__ +#endif // __cccl_lib_experimental_ctk12_cp_async_exposure #if (defined(__CUDA_MINIMUM_ARCH__) && 800 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) template @@ -1083,6 +1082,7 @@ __completion_mechanism __dispatch_memcpy_async_any_to_any(_Group const & __group template<_CUDA_VSTD::size_t _Align, typename _Group> _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_DEVICE inline __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & __group, char * __dest_char, char const * __src_char, _CUDA_VSTD::size_t __size, uint32_t __allowed_completions, uint64_t* __bar_handle) { +#ifdef __cccl_lib_experimental_ctk12_cp_async_exposure NV_IF_TARGET(NV_PROVIDES_SM_90, ( const bool __can_use_complete_tx = __allowed_completions & uint32_t(__completion_mechanism::__mbarrier_complete_tx); _LIBCUDACXX_DEBUG_ASSERT(__can_use_complete_tx == (nullptr != __bar_handle), "Pass non-null bar_handle if and only if can_use_complete_tx."); @@ -1094,6 +1094,7 @@ __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & _ } // Fallthrough to SM 80.. )); +#endif // __cccl_lib_experimental_ctk12_cp_async_exposure NV_IF_TARGET(NV_PROVIDES_SM_80, ( if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (_Align >= 4) { diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp index b31278ead5..58b84d97de 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp @@ -28,9 +28,9 @@ int main(int, char**){ // barrier_arrive_tx should fail on SM70 and SM80, because it is hidden. 
auto token = cuda::device::barrier_arrive_tx(bar, 1, 0); -#if defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__ +#ifdef __cccl_lib_local_barrier_arrive_tx static_assert(false, "Fail manually for SM90 and up."); -#endif // __CUDA_MINIMUM_ARCH__ +#endif // __cccl_lib_local_barrier_arrive_tx )); return 0; } From 52e744db8fe01f6886ec1bf361558c9292d4fb36 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 9 Feb 2024 11:52:07 +0100 Subject: [PATCH 3/8] Future proof ptx isa detection --- .../include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h index e1780381bf..8b104a8290 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h @@ -32,7 +32,9 @@ */ // PTX ISA 8.3 is available from CUDA 12.3, driver r545 -#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) \ +#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ > 12)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 830ULL +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) \ || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 830ULL // PTX ISA 8.2 is available from CUDA 12.2, driver r535 From 41a2c34b523d0c7684339123e9af8f2256835f9c Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 9 Feb 2024 12:15:19 +0100 Subject: [PATCH 4/8] Add `__cccl_lib_cp_async_{bulk_}available` convenience macros --- .../std/detail/libcxx/include/__cccl/ptx_isa.h | 6 ++++++ .../std/detail/libcxx/include/__cuda/barrier.h | 18 ++++++++++-------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h index 8b104a8290..694ace1ef7 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h @@ -95,10 +95,16 @@ // depending on PTX ISA. This permits checking for the feature in host code. // When __CUDA_MINIMUM_ARCH__ is available, we only enable the feature when the // hardware supports it. 
+#if (!defined(__CUDA_MINIMUM_ARCH__)) || (defined(__CUDA_MINIMUM_ARCH__) && 800 <= __CUDA_MINIMUM_ARCH__) \ + && __cccl_isa_ptx >= 700 +# define __cccl_lib_cp_async_available +#endif + #if (!defined(__CUDA_MINIMUM_ARCH__)) || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) \ && __cccl_isa_ptx >= 800 # define __cccl_lib_local_barrier_arrive_tx # define __cccl_lib_experimental_ctk12_cp_async_exposure +# define __cccl_lib_cp_async_bulk_available #endif #endif // __CCCL_PTX_ISA_H_ diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h index 0d1e2ea32b..fd9b7e5773 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h @@ -648,7 +648,7 @@ void barrier_expect_tx( } #endif // __cccl_lib_local_barrier_arrive_tx -#ifdef __cccl_lib_experimental_ctk12_cp_async_exposure +#ifdef __cccl_lib_cp_async_bulk_available template _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx( _Tp* __dest, @@ -688,7 +688,7 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx( return async_contract_fulfillment::async; } -#endif // __cccl_lib_local_barrier_arrive_tx +#endif // __cccl_lib_cp_async_bulk_available #endif // _CCCL_CUDA_COMPILER _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE @@ -928,7 +928,7 @@ struct __memcpy_completion_impl { * 5. normal synchronous copy (fallback) ***********************************************************************/ -#ifdef __cccl_lib_experimental_ctk12_cp_async_exposure +#ifdef __cccl_lib_cp_async_bulk_available template inline __device__ void __cp_async_bulk_shared_global(const _Group &__g, char * __dest, const char * __src, size_t __size, uint64_t *__bar_handle) { @@ -944,9 +944,9 @@ void __cp_async_bulk_shared_global(const _Group &__g, char * __dest, const char : "memory"); } } -#endif // __cccl_lib_experimental_ctk12_cp_async_exposure +#endif // __cccl_lib_cp_async_bulk_available -#if (defined(__CUDA_MINIMUM_ARCH__) && 800 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) +#ifdef __cccl_lib_cp_async_available template inline __device__ void __cp_async_shared_global(char * __dest, const char * __src) { @@ -1001,7 +1001,7 @@ void __cp_async_shared_global_mechanism(_Group __g, char * __dest, const char * __cp_async_shared_global<__copy_size>(__dest + __offset, __src + __offset); } } -#endif // __CUDA_MINIMUM_ARCH__ +#endif // __cccl_lib_cp_async_available template struct __copy_chunk { @@ -1082,7 +1082,7 @@ __completion_mechanism __dispatch_memcpy_async_any_to_any(_Group const & __group template<_CUDA_VSTD::size_t _Align, typename _Group> _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_DEVICE inline __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & __group, char * __dest_char, char const * __src_char, _CUDA_VSTD::size_t __size, uint32_t __allowed_completions, uint64_t* __bar_handle) { -#ifdef __cccl_lib_experimental_ctk12_cp_async_exposure +#ifdef __cccl_lib_cp_async_bulk_available NV_IF_TARGET(NV_PROVIDES_SM_90, ( const bool __can_use_complete_tx = __allowed_completions & uint32_t(__completion_mechanism::__mbarrier_complete_tx); _LIBCUDACXX_DEBUG_ASSERT(__can_use_complete_tx == (nullptr != __bar_handle), "Pass non-null bar_handle if and only if can_use_complete_tx."); @@ -1094,8 +1094,9 @@ __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & _ } // Fallthrough to SM 80.. 
)); -#endif // __cccl_lib_experimental_ctk12_cp_async_exposure +#endif // __cccl_lib_cp_async_bulk_available +#ifdef __cccl_lib_cp_async_available NV_IF_TARGET(NV_PROVIDES_SM_80, ( if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (_Align >= 4) { const bool __can_use_async_group = __allowed_completions & uint32_t(__completion_mechanism::__async_group); @@ -1106,6 +1107,7 @@ __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & _ } // Fallthrough.. )); +#endif // __cccl_lib_cp_async_available __cp_async_fallback_mechanism<_Align>(__group, __dest_char, __src_char, __size); return __completion_mechanism::__sync; From 820d670962a7911bf13ab981ddb2827a6950b4c5 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 9 Feb 2024 13:35:16 +0100 Subject: [PATCH 5/8] Ensure that we can build barrier on old CTK and Hopper --- .../include/cuda/std/detail/libcxx/include/__cuda/barrier.h | 2 ++ .../test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp | 1 + libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cta.pass.cpp | 1 + .../test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp | 1 + .../libcudacxx/cuda/barrier/arrive_tx_feature_test.pass.cpp | 1 + .../cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp | 1 + .../test/libcudacxx/cuda/barrier/arrive_tx_thread.pass.cpp | 1 + libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_warp.pass.cpp | 1 + 8 files changed, 9 insertions(+) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h index fd9b7e5773..8398d98e86 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h @@ -795,6 +795,7 @@ struct __memcpy_completion_impl { // bulk group to be used with shared memory barriers. _LIBCUDACXX_UNREACHABLE(); case __completion_mechanism::__mbarrier_complete_tx: +#ifdef __cccl_lib_local_barrier_arrive_tx // Pre-sm90, the mbarrier_complete_tx completion mechanism is not available. NV_IF_TARGET(NV_PROVIDES_SM_90, ( // Only perform the expect_tx operation with the leader thread @@ -802,6 +803,7 @@ struct __memcpy_completion_impl { ::cuda::device::barrier_expect_tx(__barrier, __size); } )); +#endif // __cccl_lib_local_barrier_arrive_tx return async_contract_fulfillment::async; case __completion_mechanism::__sync: // sync: In this case, we do not need to do anything. 
The user will have diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp index 8f82188e41..12271d1ddb 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11.1 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cta.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cta.pass.cpp index 0d2f5ca841..187a6768c1 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cta.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cta.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11.1 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp index bcf6dcf622..c5ef8b6556 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11.1 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_feature_test.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_feature_test.pass.cpp index 1a40fcb78a..91ae5d36c3 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_feature_test.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_feature_test.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11.1 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp index 58b84d97de..d158d55d57 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-70 +// UNSUPPORTED: nvcc-11.1 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_thread.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_thread.pass.cpp index a6ac3803bd..235f681588 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_thread.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_thread.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11.1 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_warp.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_warp.pass.cpp index 4cc310d958..cc6b68daf4 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_warp.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_warp.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11.1 // From 00401b916b24bf1864ea0f0ab1252e202567fb68 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 9 Feb 2024 13:39:06 +0100 Subject: [PATCH 6/8] Add comment on max ISA --- .../include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h index 694ace1ef7..6a624eeb61 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h @@ -32,6 +32,8 @@ */ // PTX ISA 8.3 is available from CUDA 12.3, driver r545 +// The first define is for future major versions of CUDACC. +// We make sure that these get the highest known PTX ISA version. #if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ > 12)) || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 830ULL #elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) \ From 47f524ae6fa6b182549b6dbd6fe7b6bd46c1d1af Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Thu, 15 Feb 2024 09:23:46 +0000 Subject: [PATCH 7/8] More test fixes --- .../detail/libcxx/include/__cccl/ptx_isa.h | 16 ++--- .../detail/libcxx/include/__cuda/barrier.h | 61 +++++++++++++------ .../barrier/arrive_tx_cluster.runfail.cpp | 2 +- .../cuda/barrier/arrive_tx_cta.pass.cpp | 2 +- .../cuda/barrier/arrive_tx_device.runfail.cpp | 2 +- .../barrier/arrive_tx_feature_test.pass.cpp | 2 +- ...arrive_tx_static_asserts_pre_sm90.fail.cpp | 1 - .../cuda/barrier/arrive_tx_thread.pass.cpp | 2 +- .../cuda/barrier/arrive_tx_warp.pass.cpp | 2 +- .../cuda/barrier/cp_async_bulk.pass.cpp | 1 + .../cp_async_bulk_feature_test.pass.cpp | 1 + .../cp_async_bulk_ptx_compiles.pass.cpp | 1 + .../barrier/cp_async_bulk_tensor.pass.cpp | 1 + .../barrier/cp_async_bulk_tensor_1d.pass.cpp | 1 + .../barrier/cp_async_bulk_tensor_2d.pass.cpp | 1 + .../barrier/cp_async_bulk_tensor_3d.pass.cpp | 1 + .../barrier/cp_async_bulk_tensor_4d.pass.cpp | 1 + .../barrier/cp_async_bulk_tensor_5d.pass.cpp | 1 + .../barrier/cp_async_bulk_tensor_generic.h | 3 - .../cuda/barrier/expect_tx_cta.pass.cpp | 1 + .../cuda/barrier/expect_tx_device.runfail.cpp | 1 + .../cuda/barrier/expect_tx_thread.pass.cpp | 1 + .../cuda/barrier/expect_tx_warp.pass.cpp | 1 + .../memcpy_async/memcpy_async_tx.pass.cpp | 1 + 24 files changed, 69 insertions(+), 38 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h index 6a624eeb61..54f833a814 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h @@ -34,7 +34,7 @@ // PTX ISA 8.3 is available from CUDA 12.3, driver r545 // The first define is for future major versions of CUDACC. // We make sure that these get the highest known PTX ISA version. -#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ > 12)) || (!defined(__CUDACC_VER_MAJOR__)) +#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ > 12)) || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 830ULL #elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) \ || (!defined(__CUDACC_VER_MAJOR__)) @@ -97,16 +97,10 @@ // depending on PTX ISA. This permits checking for the feature in host code. // When __CUDA_MINIMUM_ARCH__ is available, we only enable the feature when the // hardware supports it. 
-#if (!defined(__CUDA_MINIMUM_ARCH__)) || (defined(__CUDA_MINIMUM_ARCH__) && 800 <= __CUDA_MINIMUM_ARCH__) \ - && __cccl_isa_ptx >= 700 -# define __cccl_lib_cp_async_available -#endif - -#if (!defined(__CUDA_MINIMUM_ARCH__)) || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) \ - && __cccl_isa_ptx >= 800 -# define __cccl_lib_local_barrier_arrive_tx -# define __cccl_lib_experimental_ctk12_cp_async_exposure -# define __cccl_lib_cp_async_bulk_available +#if (!defined(__CUDA_MINIMUM_ARCH__)) \ + || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) && __cccl_isa_ptx >= 800 +# define __cccl_lib_local_barrier_arrive_tx +# define __cccl_lib_experimental_ctk12_cp_async_exposure #endif #endif // __CCCL_PTX_ISA_H_ diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h index 8398d98e86..58bd0473fb 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h @@ -572,8 +572,8 @@ inline _CUDA_VSTD::uint64_t * barrier_native_handle(barrier #if defined(_CCCL_CUDA_COMPILER) -#ifdef __cccl_lib_local_barrier_arrive_tx - +#if __cccl_ptx_isa >= 800 +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__(); _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_DEVICE inline barrier::arrival_token barrier_arrive_tx( barrier & __b, @@ -588,7 +588,7 @@ barrier::arrival_token barrier_arrive_tx( _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1."); barrier::arrival_token __token = {}; - NV_IF_TARGET( + NV_IF_ELSE_TARGET( // On architectures pre-sm90, arrive_tx is not supported. NV_PROVIDES_SM_90, ( // We do not check for the statespace of the barrier here. This is @@ -616,11 +616,14 @@ barrier::arrival_token barrier_arrive_tx( _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __native_handle, __arrive_count_update ); } + ),( + __cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__(); ) ); return __token; } +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_barrier_expect_tx_is_not_supported_before_SM_90__(); _LIBCUDACXX_DEVICE inline void barrier_expect_tx( barrier & __b, @@ -638,6 +641,9 @@ void barrier_expect_tx( // and __trap() if wrong, then those tools would not be able to help // us in release builds. In debug builds, the error would be caught // by the asserts at the top of this function. + NV_IF_ELSE_TARGET( + // On architectures pre-sm90, arrive_tx is not supported. 
+ NV_PROVIDES_SM_90, ( auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b)); asm ( "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" @@ -645,10 +651,12 @@ void barrier_expect_tx( : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) : "memory"); + ),( + __cuda_ptx_barrier_expect_tx_is_not_supported_before_SM_90__(); + )); } -#endif // __cccl_lib_local_barrier_arrive_tx -#ifdef __cccl_lib_cp_async_bulk_available +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_memcpy_async_tx_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx( _Tp* __dest, @@ -668,6 +676,7 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx( _LIBCUDACXX_DEBUG_ASSERT(__isShared(__dest), "dest must point to shared memory."); _LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "src must point to global memory."); + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b)); if (__isShared(__dest) && __isGlobal(__src)) { asm volatile( @@ -685,10 +694,13 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx( // is not yet implemented. So we trap in this case as well. _LIBCUDACXX_UNREACHABLE(); } + ),( + __cuda_ptx_memcpy_async_tx_is_not_supported_before_SM_90__(); + )); return async_contract_fulfillment::async; } -#endif // __cccl_lib_cp_async_bulk_available +#endif // __cccl_ptx_isa >= 800 #endif // _CCCL_CUDA_COMPILER _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE @@ -762,6 +774,7 @@ _CUDA_VSTD::uint64_t * __try_get_barrier_handle<::cuda::thread_scope_block, _CUD // The user is still responsible for arriving and waiting on (or otherwise // synchronizing with) the barrier or pipeline barrier to see the results of // copies from other threads participating in the synchronization object. +extern "C" _LIBCUDACXX_HOST_DEVICE void __cuda_ptx_mbarrier_complete_tx_is_not_supported_before_SM_90__(); struct __memcpy_completion_impl { template @@ -795,15 +808,17 @@ struct __memcpy_completion_impl { // bulk group to be used with shared memory barriers. _LIBCUDACXX_UNREACHABLE(); case __completion_mechanism::__mbarrier_complete_tx: -#ifdef __cccl_lib_local_barrier_arrive_tx +#if __cccl_isa_ptx >= 800 // Pre-sm90, the mbarrier_complete_tx completion mechanism is not available. NV_IF_TARGET(NV_PROVIDES_SM_90, ( // Only perform the expect_tx operation with the leader thread if (__group.thread_rank() == 0) { ::cuda::device::barrier_expect_tx(__barrier, __size); } + ),( + __cuda_ptx_mbarrier_complete_tx_is_not_supported_before_SM_90__(); )); -#endif // __cccl_lib_local_barrier_arrive_tx +#endif // __cccl_isa_ptx >= 800 return async_contract_fulfillment::async; case __completion_mechanism::__sync: // sync: In this case, we do not need to do anything. The user will have @@ -930,11 +945,13 @@ struct __memcpy_completion_impl { * 5. 
normal synchronous copy (fallback) ***********************************************************************/ -#ifdef __cccl_lib_cp_async_bulk_available +#if __cccl_ptx_isa >= 800 +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__(); template inline __device__ void __cp_async_bulk_shared_global(const _Group &__g, char * __dest, const char * __src, size_t __size, uint64_t *__bar_handle) { // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( if (__g.thread_rank() == 0) { asm volatile( "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n" @@ -945,10 +962,13 @@ void __cp_async_bulk_shared_global(const _Group &__g, char * __dest, const char "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__bar_handle))) : "memory"); } + ),( + __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__(); + )); } -#endif // __cccl_lib_cp_async_bulk_available +#endif // __cccl_ptx_isa >= 800 -#ifdef __cccl_lib_cp_async_available +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__(); template inline __device__ void __cp_async_shared_global(char * __dest, const char * __src) { @@ -960,6 +980,7 @@ void __cp_async_shared_global(char * __dest, const char * __src) { static_assert(_Copy_size == 4 || _Copy_size == 8 || _Copy_size == 16, "cp.async.shared.global requires a copy size of 4, 8, or 16."); #endif // _CCCL_STD_VER >= 2017 + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( asm volatile( "cp.async.ca.shared.global [%0], [%1], %2, %2;" : @@ -967,6 +988,9 @@ void __cp_async_shared_global(char * __dest, const char * __src) { "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))), "n"(_Copy_size) : "memory"); + ),( + __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__(); + )); } template <> @@ -974,6 +998,7 @@ inline __device__ void __cp_async_shared_global<16>(char * __dest, const char * __src) { // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async // When copying 16 bytes, it is possible to skip L1 cache (.cg). 
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( asm volatile( "cp.async.cg.shared.global [%0], [%1], %2, %2;" : @@ -981,6 +1006,9 @@ void __cp_async_shared_global<16>(char * __dest, const char * __src) { "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))), "n"(16) : "memory"); + ),( + __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__(); + )); } template @@ -1003,7 +1031,6 @@ void __cp_async_shared_global_mechanism(_Group __g, char * __dest, const char * __cp_async_shared_global<__copy_size>(__dest + __offset, __src + __offset); } } -#endif // __cccl_lib_cp_async_available template struct __copy_chunk { @@ -1084,7 +1111,7 @@ __completion_mechanism __dispatch_memcpy_async_any_to_any(_Group const & __group template<_CUDA_VSTD::size_t _Align, typename _Group> _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_DEVICE inline __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & __group, char * __dest_char, char const * __src_char, _CUDA_VSTD::size_t __size, uint32_t __allowed_completions, uint64_t* __bar_handle) { -#ifdef __cccl_lib_cp_async_bulk_available +#if __cccl_ptx_isa >= 800 NV_IF_TARGET(NV_PROVIDES_SM_90, ( const bool __can_use_complete_tx = __allowed_completions & uint32_t(__completion_mechanism::__mbarrier_complete_tx); _LIBCUDACXX_DEBUG_ASSERT(__can_use_complete_tx == (nullptr != __bar_handle), "Pass non-null bar_handle if and only if can_use_complete_tx."); @@ -1096,10 +1123,9 @@ __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & _ } // Fallthrough to SM 80.. )); -#endif // __cccl_lib_cp_async_bulk_available +#endif // __cccl_ptx_isa >= 800 -#ifdef __cccl_lib_cp_async_available - NV_IF_TARGET(NV_PROVIDES_SM_80, ( + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, ( if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (_Align >= 4) { const bool __can_use_async_group = __allowed_completions & uint32_t(__completion_mechanism::__async_group); if (__can_use_async_group) { @@ -1108,8 +1134,9 @@ __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & _ } } // Fallthrough.. 
+ ),( + __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__(); )); -#endif // __cccl_lib_cp_async_available __cp_async_fallback_mechanism<_Align>(__group, __dest_char, __src_char, __size); return __completion_mechanism::__sync; diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp index 12271d1ddb..69ed6bb41c 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp @@ -10,7 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 -// UNSUPPORTED: nvcc-11.1 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cta.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cta.pass.cpp index 187a6768c1..55038a41f1 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cta.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cta.pass.cpp @@ -10,7 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 -// UNSUPPORTED: nvcc-11.1 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp index c5ef8b6556..eded8d4e2a 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp @@ -10,7 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 -// UNSUPPORTED: nvcc-11.1 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_feature_test.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_feature_test.pass.cpp index 91ae5d36c3..8729995fef 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_feature_test.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_feature_test.pass.cpp @@ -10,7 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 -// UNSUPPORTED: nvcc-11.1 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp index d158d55d57..58b84d97de 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp @@ -11,7 +11,6 @@ // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-70 -// UNSUPPORTED: nvcc-11.1 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_thread.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_thread.pass.cpp index 235f681588..8dcb6645a3 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_thread.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_thread.pass.cpp @@ -10,7 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 -// UNSUPPORTED: nvcc-11.1 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_warp.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_warp.pass.cpp index cc6b68daf4..14aa35116a 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_warp.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_warp.pass.cpp @@ -10,7 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 -// UNSUPPORTED: nvcc-11.1 
+// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp index d32aea8572..db061cb4f6 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_feature_test.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_feature_test.pass.cpp index e98ee5e6aa..72b0667fbf 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_feature_test.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_feature_test.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_ptx_compiles.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_ptx_compiles.pass.cpp index eb4e9b876d..8b77ee210b 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_ptx_compiles.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_ptx_compiles.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor.pass.cpp index fb58b58315..7d68e887de 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp index 6d890edc9b..a289c71c2d 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: c++11 // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp index e394515666..26e04f8d96 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: c++11 // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) 
diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp index b10c2bfc26..88f45093c4 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: c++11 // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp index 8e7886db6a..0ba064cf93 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: c++11 // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp index 8be14c21c1..af1f27454d 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: c++11 // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h index 8d29fa6df1..6ac6e40db1 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h @@ -7,9 +7,6 @@ // SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
// //===----------------------------------------------------------------------===// -// -// UNSUPPORTED: libcpp-has-no-threads -// UNSUPPORTED: pre-sm-90 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_cta.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_cta.pass.cpp index 771eb6fde7..57b871e424 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_cta.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_cta.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp index 6da34c14af..c2e36ce997 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_thread.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_thread.pass.cpp index 5ec98c26ee..9a347e0e7d 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_thread.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_thread.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_warp.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_warp.pass.cpp index bf626a902b..b917e9e036 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_warp.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_warp.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp index 6614bfb51f..bd82a594ac 100644 --- a/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // From c5be115ff57e7db1ad79cc59b346044565f4892f Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 16 Feb 2024 17:09:18 +0100 Subject: [PATCH 8/8] Do not introduce linker error when we there is a fallback --- .../include/cuda/std/detail/libcxx/include/__cuda/barrier.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h index 58bd0473fb..59b204c828 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h @@ -1125,7 +1125,7 @@ __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & _ )); #endif // __cccl_ptx_isa >= 800 - NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, ( + NV_IF_TARGET(NV_PROVIDES_SM_80, ( if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (_Align >= 4) { const bool __can_use_async_group = __allowed_completions & uint32_t(__completion_mechanism::__async_group); if (__can_use_async_group) { @@ -1134,8 +1134,6 @@ __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & _ } } // 
Fallthrough..
-  ),(
-    __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();
   ));
 
   __cp_async_fallback_mechanism<_Align>(__group, __dest_char, __src_char, __size);
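
Usage sketch (not part of the patches): after this series, a feature macro such as __cccl_lib_local_barrier_arrive_tx is defined only when both the compiled-for architecture (or the host pass, where __CUDA_MINIMUM_ARCH__ is undefined) and the toolkit's PTX ISA level (__cccl_ptx_isa) can support the instruction, which is what allows compiling with an old toolkit for a new machine without emitting PTX the assembler cannot handle. The kernel below is a hypothetical, minimal consumer of that macro; the kernel name and shape are assumptions, and only the macro and cuda::device::barrier_arrive_tx come from the series.

// Minimal consumption sketch, assuming <cuda/barrier> suffices to pull in the
// relocated __cccl/ptx_isa.h via __cccl_config. Hypothetical kernel.
#include <cuda/barrier>      // cuda::barrier, cuda::device::barrier_arrive_tx
#include <cuda/std/utility>  // cuda::std::move

__global__ void test_kernel()
{
  using barrier_t = cuda::barrier<cuda::thread_scope_block>;
  __shared__ barrier_t bar;
  if (threadIdx.x == 0) {
    init(&bar, blockDim.x);
  }
  __syncthreads();

#if defined(__cccl_lib_local_barrier_arrive_tx)
  // Architecture and PTX ISA both allow it: arrive and update the expected
  // transaction count in a single step.
  auto token = cuda::device::barrier_arrive_tx(bar, 1, 0);
#else
  // Fallback when either the target architecture or the toolkit's PTX ISA is
  // too old for the transaction-count extension.
  auto token = bar.arrive();
#endif
  bar.wait(cuda::std::move(token));
}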