diff --git a/libcudacxx/include/cuda/barrier b/libcudacxx/include/cuda/barrier index 25b14a7f18..50bfe664e8 100644 --- a/libcudacxx/include/cuda/barrier +++ b/libcudacxx/include/cuda/barrier @@ -50,7 +50,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL // capability 9.0 and above. The check for (!defined(__CUDA_MINIMUM_ARCH__)) is // necessary to prevent cudafe from ripping out the device functions before // device compilation begins. -#if (!defined(__CUDA_MINIMUM_ARCH__)) || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) +#ifdef __cccl_lib_experimental_ctk12_cp_async_exposure // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk inline _LIBCUDACXX_DEVICE @@ -288,7 +288,7 @@ void cp_async_bulk_wait_group_read() : "memory"); } -#endif // __CUDA_MINIMUM_ARCH__ +#endif // __cccl_lib_experimental_ctk12_cp_async_exposure _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h similarity index 57% rename from libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h rename to libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h index 5dcccc5d1f..54f833a814 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h @@ -1,19 +1,18 @@ -// -*- C++ -*- //===----------------------------------------------------------------------===// // // Part of libcu++, the C++ Standard Library for your entire system, // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// +#ifndef __CCCL_PTX_ISA_H_ +#define __CCCL_PTX_ISA_H_ -#ifndef _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_ -#define _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_ - -#include // __CUDA_MINIMUM_ARCH__ and friends +#include "../__cccl/compiler.h" +#include "../__cccl/system_header.h" #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header @@ -23,6 +22,8 @@ # pragma system_header #endif // no system header +#include // __CUDA_MINIMUM_ARCH__ and friends + /* * Targeting macros * @@ -31,47 +32,75 @@ */ // PTX ISA 8.3 is available from CUDA 12.3, driver r545 -#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__)) +// The first define is for future major versions of CUDACC. +// We make sure that these get the highest known PTX ISA version. 
+#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ > 12)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 830ULL +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 830ULL // PTX ISA 8.2 is available from CUDA 12.2, driver r535 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 820ULL // PTX ISA 8.1 is available from CUDA 12.1, driver r530 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 810ULL // PTX ISA 8.0 is available from CUDA 12.0, driver r525 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 800ULL // PTX ISA 7.8 is available from CUDA 11.8, driver r520 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 780ULL // PTX ISA 7.7 is available from CUDA 11.7, driver r515 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 7)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 7)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 770ULL // PTX ISA 7.6 is available from CUDA 11.6, driver r510 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 6)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 6)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 760ULL // PTX ISA 7.5 is available from CUDA 11.5, driver r495 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 5)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 5)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 750ULL // PTX ISA 7.4 is available from CUDA 11.4, driver r470 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 4)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 4)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 740ULL // PTX ISA 7.3 is available from CUDA 11.3, driver r465 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 3)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define 
__cccl_ptx_isa 730ULL // PTX ISA 7.2 is available from CUDA 11.2, driver r460 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 2)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 720ULL // PTX ISA 7.1 is available from CUDA 11.1, driver r455 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 1)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 710ULL // PTX ISA 7.0 is available from CUDA 11.0, driver r445 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 0)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 700ULL // Fallback case. Define the ISA version to be zero. This ensures that the macro is always defined. #else # define __cccl_ptx_isa 0ULL #endif -#endif // _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_ +// We define certain feature test macros depending on availability. When +// __CUDA_MINIMUM_ARCH__ is not available, we define the following features +// depending on PTX ISA. This permits checking for the feature in host code. +// When __CUDA_MINIMUM_ARCH__ is available, we only enable the feature when the +// hardware supports it. +#if ((!defined(__CUDA_MINIMUM_ARCH__)) \ + || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__)) && __cccl_ptx_isa >= 800 +# define __cccl_lib_local_barrier_arrive_tx +# define __cccl_lib_experimental_ctk12_cp_async_exposure +#endif + +#endif // __CCCL_PTX_ISA_H_ diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl_config b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl_config index bb3a5078dd..0e8612926f 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl_config +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl_config @@ -15,6 +15,7 @@ #include "__cccl/diagnostic.h" #include "__cccl/dialect.h" #include "__cccl/execution_space.h" +#include "__cccl/ptx_isa.h" #include "__cccl/system_header.h" #include "__cccl/version.h" #include "__cccl/visibility.h" diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h index 3a0bcd10ea..59b204c828 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h @@ -572,11 +572,8 @@ inline _CUDA_VSTD::uint64_t * barrier_native_handle(barrier #if defined(_CCCL_CUDA_COMPILER) -// Hide arrive_tx when CUDA architecture is insufficient. Note the -// (!defined(__CUDA_MINIMUM_ARCH__)). This is required to make sure the function -// does not get removed by cudafe, which does not define __CUDA_MINIMUM_ARCH__.
-#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) - +#if __cccl_ptx_isa >= 800 +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__(); _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_DEVICE inline barrier::arrival_token barrier_arrive_tx( barrier & __b, @@ -591,7 +588,7 @@ barrier::arrival_token barrier_arrive_tx( _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1."); barrier::arrival_token __token = {}; - NV_IF_TARGET( + NV_IF_ELSE_TARGET( // On architectures pre-sm90, arrive_tx is not supported. NV_PROVIDES_SM_90, ( // We do not check for the statespace of the barrier here. This is @@ -619,11 +616,47 @@ barrier::arrival_token barrier_arrive_tx( _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __native_handle, __arrive_count_update ); } + ),( + __cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__(); ) ); return __token; } +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_barrier_expect_tx_is_not_supported_before_SM_90__(); +_LIBCUDACXX_DEVICE inline +void barrier_expect_tx( + barrier & __b, + _CUDA_VSTD::ptrdiff_t __transaction_count_update) { + + _LIBCUDACXX_DEBUG_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); + _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative."); + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object + _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1."); + + // We do not check for the statespace of the barrier here. This is + // on purpose. This allows debugging tools like memcheck/racecheck + // to detect that we are passing a pointer with the wrong state + // space to mbarrier.arrive. If we checked for the state space here, + // and __trap() if wrong, then those tools would not be able to help + // us in release builds. In debug builds, the error would be caught + // by the asserts at the top of this function. + NV_IF_ELSE_TARGET( + // On architectures pre-sm90, expect_tx is not supported. + NV_PROVIDES_SM_90, ( + auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b)); + asm ( + "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) + : "memory"); + ),( + __cuda_ptx_barrier_expect_tx_is_not_supported_before_SM_90__(); + )); +} + +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_memcpy_async_tx_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx( _Tp* __dest, @@ -643,6 +676,7 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx( _LIBCUDACXX_DEBUG_ASSERT(__isShared(__dest), "dest must point to shared memory."); _LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "src must point to global memory."); + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b)); if (__isShared(__dest) && __isGlobal(__src)) { asm volatile( "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n" @@ -660,36 +694,13 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx( // is not yet implemented. So we trap in this case as well.
_LIBCUDACXX_UNREACHABLE(); } + ),( + __cuda_ptx_memcpy_async_tx_is_not_supported_before_SM_90__(); + )); return async_contract_fulfillment::async; } - -_LIBCUDACXX_DEVICE inline -void barrier_expect_tx( - barrier & __b, - _CUDA_VSTD::ptrdiff_t __transaction_count_update) { - - _LIBCUDACXX_DEBUG_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); - _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative."); - // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object - _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1."); - - // We do not check for the statespace of the barrier here. This is - // on purpose. This allows debugging tools like memcheck/racecheck - // to detect that we are passing a pointer with the wrong state - // space to mbarrier.arrive. If we checked for the state space here, - // and __trap() if wrong, then those tools would not be able to help - // us in release builds. In debug builds, the error would be caught - // by the asserts at the top of this function. - auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b)); - asm ( - "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" - : - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) - : "memory"); -} -#endif // __CUDA_MINIMUM_ARCH__ +#endif // __cccl_ptx_isa >= 800 #endif // _CCCL_CUDA_COMPILER _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE @@ -763,6 +774,7 @@ _CUDA_VSTD::uint64_t * __try_get_barrier_handle<::cuda::thread_scope_block, _CUD // The user is still responsible for arriving and waiting on (or otherwise // synchronizing with) the barrier or pipeline barrier to see the results of // copies from other threads participating in the synchronization object. +extern "C" _LIBCUDACXX_HOST_DEVICE void __cuda_ptx_mbarrier_complete_tx_is_not_supported_before_SM_90__(); struct __memcpy_completion_impl { template @@ -796,13 +808,17 @@ struct __memcpy_completion_impl { // bulk group to be used with shared memory barriers. _LIBCUDACXX_UNREACHABLE(); case __completion_mechanism::__mbarrier_complete_tx: +#if __cccl_ptx_isa >= 800 // Pre-sm90, the mbarrier_complete_tx completion mechanism is not available. NV_IF_TARGET(NV_PROVIDES_SM_90, ( // Only perform the expect_tx operation with the leader thread if (__group.thread_rank() == 0) { ::cuda::device::barrier_expect_tx(__barrier, __size); } + ),( + __cuda_ptx_mbarrier_complete_tx_is_not_supported_before_SM_90__(); )); +#endif // __cccl_ptx_isa >= 800 return async_contract_fulfillment::async; case __completion_mechanism::__sync: // sync: In this case, we do not need to do anything. The user will have @@ -929,11 +945,13 @@ struct __memcpy_completion_impl { * 5.
normal synchronous copy (fallback) ***********************************************************************/ -#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) +#if __cccl_ptx_isa >= 800 +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__(); template inline __device__ void __cp_async_bulk_shared_global(const _Group &__g, char * __dest, const char * __src, size_t __size, uint64_t *__bar_handle) { // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( if (__g.thread_rank() == 0) { asm volatile( "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n" @@ -944,10 +962,13 @@ void __cp_async_bulk_shared_global(const _Group &__g, char * __dest, const char "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__bar_handle))) : "memory"); } + ),( + __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__(); + )); } -#endif // __CUDA_MINIMUM_ARCH__ +#endif // __cccl_ptx_isa >= 800 -#if (defined(__CUDA_MINIMUM_ARCH__) && 800 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__(); template inline __device__ void __cp_async_shared_global(char * __dest, const char * __src) { @@ -959,6 +980,7 @@ void __cp_async_shared_global(char * __dest, const char * __src) { static_assert(_Copy_size == 4 || _Copy_size == 8 || _Copy_size == 16, "cp.async.shared.global requires a copy size of 4, 8, or 16."); #endif // _CCCL_STD_VER >= 2017 + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( asm volatile( "cp.async.ca.shared.global [%0], [%1], %2, %2;" : @@ -966,6 +988,9 @@ void __cp_async_shared_global(char * __dest, const char * __src) { "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))), "n"(_Copy_size) : "memory"); + ),( + __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__(); + )); } template <> @@ -973,6 +998,7 @@ inline __device__ void __cp_async_shared_global<16>(char * __dest, const char * __src) { // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async // When copying 16 bytes, it is possible to skip L1 cache (.cg). 
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( asm volatile( "cp.async.cg.shared.global [%0], [%1], %2, %2;" : @@ -980,6 +1006,9 @@ void __cp_async_shared_global<16>(char * __dest, const char * __src) { "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))), "n"(16) : "memory"); + ),( + __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__(); + )); } template @@ -1002,7 +1031,6 @@ void __cp_async_shared_global_mechanism(_Group __g, char * __dest, const char * __cp_async_shared_global<__copy_size>(__dest + __offset, __src + __offset); } } -#endif // __CUDA_MINIMUM_ARCH__ template struct __copy_chunk { @@ -1083,6 +1111,7 @@ __completion_mechanism __dispatch_memcpy_async_any_to_any(_Group const & __group template<_CUDA_VSTD::size_t _Align, typename _Group> _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_DEVICE inline __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & __group, char * __dest_char, char const * __src_char, _CUDA_VSTD::size_t __size, uint32_t __allowed_completions, uint64_t* __bar_handle) { +#if __cccl_ptx_isa >= 800 NV_IF_TARGET(NV_PROVIDES_SM_90, ( const bool __can_use_complete_tx = __allowed_completions & uint32_t(__completion_mechanism::__mbarrier_complete_tx); _LIBCUDACXX_DEBUG_ASSERT(__can_use_complete_tx == (nullptr != __bar_handle), "Pass non-null bar_handle if and only if can_use_complete_tx."); @@ -1094,6 +1123,7 @@ __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & _ } // Fallthrough to SM 80.. )); +#endif // __cccl_ptx_isa >= 800 NV_IF_TARGET(NV_PROVIDES_SM_80, ( if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (_Align >= 4) { diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h index c16965d23b..6936733858 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -26,7 +26,6 @@ #include // __CUDA_MINIMUM_ARCH__ and friends -#include "../__cuda/ptx/ptx_isa_target_macros.h" #include "../__cuda/ptx/ptx_dot_variants.h" #include "../__cuda/ptx/ptx_helper_functions.h" #include "../__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h" diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index dbd97e2b63..0666be92e6 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -26,7 +26,6 @@ #include "ptx_dot_variants.h" #include "ptx_helper_functions.h" -#include "ptx_isa_target_macros.h" #include "../../cstdint" _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/version b/libcudacxx/include/cuda/std/detail/libcxx/include/version index c6d15ae72f..61be53c671 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/version +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/version @@ -223,16 +223,6 @@ __cpp_lib_void_t 201411L #endif #endif // __cuda_std__ -// We define certain feature test macros depending on availability. When -// __CUDA_MINIMUM_ARCH__ is not available, we define the following features -// unconditionally. 
This permits checking for the feature in host code. When -// __CUDA_MINIMUM_ARCH__ is available, we only enable the feature when the -// hardware supports it. -#if (!defined(__CUDA_MINIMUM_ARCH__)) || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) -# define __cccl_lib_local_barrier_arrive_tx -# define __cccl_lib_experimental_ctk12_cp_async_exposure -#endif - // We unconditionally define `__cccl_lib_meow` so that there is only one place to set the value #if _CCCL_STD_VER > 2011 # define __cccl_lib_chrono_udls 201304L diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp index 8f82188e41..69ed6bb41c 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cta.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cta.pass.cpp index 0d2f5ca841..55038a41f1 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cta.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cta.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp index bcf6dcf622..eded8d4e2a 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_feature_test.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_feature_test.pass.cpp index 1a40fcb78a..8729995fef 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_feature_test.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_feature_test.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp index b31278ead5..58b84d97de 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp @@ -28,9 +28,9 @@ int main(int, char**){ // barrier_arrive_tx should fail on SM70 and SM80, because it is hidden. 
auto token = cuda::device::barrier_arrive_tx(bar, 1, 0); -#if defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__ +#ifdef __cccl_lib_local_barrier_arrive_tx static_assert(false, "Fail manually for SM90 and up."); -#endif // __CUDA_MINIMUM_ARCH__ +#endif // __cccl_lib_local_barrier_arrive_tx )); return 0; } diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_thread.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_thread.pass.cpp index a6ac3803bd..8dcb6645a3 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_thread.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_thread.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_warp.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_warp.pass.cpp index 4cc310d958..14aa35116a 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_warp.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_warp.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp index d32aea8572..db061cb4f6 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_feature_test.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_feature_test.pass.cpp index e98ee5e6aa..72b0667fbf 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_feature_test.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_feature_test.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_ptx_compiles.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_ptx_compiles.pass.cpp index eb4e9b876d..8b77ee210b 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_ptx_compiles.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_ptx_compiles.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor.pass.cpp index fb58b58315..7d68e887de 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) 
diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp index 6d890edc9b..a289c71c2d 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: c++11 // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp index e394515666..26e04f8d96 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: c++11 // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp index b10c2bfc26..88f45093c4 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: c++11 // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp index 8e7886db6a..0ba064cf93 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: c++11 // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp index 8be14c21c1..af1f27454d 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: c++11 // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h index 8d29fa6df1..6ac6e40db1 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h @@ -7,9 +7,6 @@ // SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
// //===----------------------------------------------------------------------===// -// -// UNSUPPORTED: libcpp-has-no-threads -// UNSUPPORTED: pre-sm-90 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_cta.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_cta.pass.cpp index 771eb6fde7..57b871e424 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_cta.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_cta.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp index 6da34c14af..c2e36ce997 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_thread.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_thread.pass.cpp index 5ec98c26ee..9a347e0e7d 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_thread.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_thread.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_warp.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_warp.pass.cpp index bf626a902b..b917e9e036 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_warp.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_warp.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp index 6614bfb51f..bd82a594ac 100644 --- a/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 //
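For reference, a minimal usage sketch of the feature-test macro introduced in __cccl/ptx_isa.h above: device code can guard the sm_90 transaction-count APIs on __cccl_lib_local_barrier_arrive_tx instead of inspecting __CUDA_MINIMUM_ARCH__ directly. This sketch is not part of the patch; the kernel name, buffer size, and the assumption of a 16-byte-aligned global source pointer are illustrative only.

#include <cuda/barrier>
#include <cuda/std/utility> // cuda::std::move

// Hypothetical demo kernel (not part of this patch).
__global__ void tx_copy_demo(const int* gmem) // assumed global and 16-byte aligned
{
#ifdef __cccl_lib_local_barrier_arrive_tx
  using barrier_t = cuda::barrier<cuda::thread_scope_block>;
  __shared__ alignas(16) int smem[128];
  __shared__ barrier_t bar;
  if (threadIdx.x == 0) {
    init(&bar, blockDim.x);
  }
  __syncthreads();

  if (threadIdx.x == 0) {
    // Kick off the bulk copy and tell the barrier how many bytes to expect.
    cuda::device::memcpy_async_tx(smem, gmem, cuda::aligned_size_t<16>(sizeof(smem)), bar);
    cuda::device::barrier_expect_tx(bar, sizeof(smem));
  }
  // Every thread arrives once; the phase completes when the copy has landed.
  auto token = bar.arrive(1);
  bar.wait(cuda::std::move(token));
#endif // __cccl_lib_local_barrier_arrive_tx
}

When the macro is not defined (host-only translation units on older toolkits, or device passes below sm_90), the guarded body compiles away. If pre-sm_90 device code does reach these functions, the NV_IF_ELSE_TARGET fallbacks added in this diff call the undefined __cuda_ptx_*_is_not_supported_before_SM_90__ stubs, turning the mistake into a device-link error rather than invalid PTX.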