From e11885c41d35e0a48d91abf07d200c400bdf0cfa Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Wed, 21 Feb 2024 00:50:43 -0800 Subject: [PATCH 1/6] Fix ptx usage to account for PTX ISA availability (#1359) Currently we guard these instructions based only on the available architecture. However, it is also valid to compile with an old toolkit for a new machine, in which case the architecture check passes even though the toolkit cannot emit the required PTX. Consequently, we need to strengthen our checks to also account for the available PTX ISA version; a sketch of the resulting guard pattern follows the test updates below. --- libcudacxx/include/cuda/barrier | 4 +- .../ptx_isa.h} | 68 ++++++++---- .../std/detail/libcxx/include/__cccl_config | 2 + .../detail/libcxx/include/__cuda/barrier.h | 104 +++++++++++------- .../std/detail/libcxx/include/__cuda/ptx.h | 1 - ..._and_communication_instructions_mbarrier.h | 9 +- .../cuda/std/detail/libcxx/include/version | 26 ++--- .../barrier/arrive_tx_cluster.runfail.cpp | 1 + .../cuda/barrier/arrive_tx_cta.pass.cpp | 1 + .../cuda/barrier/arrive_tx_device.runfail.cpp | 1 + .../barrier/arrive_tx_feature_test.pass.cpp | 1 + ...arrive_tx_static_asserts_pre_sm90.fail.cpp | 4 +- .../cuda/barrier/arrive_tx_thread.pass.cpp | 1 + .../cuda/barrier/arrive_tx_warp.pass.cpp | 1 + .../cuda/barrier/cp_async_bulk.pass.cpp | 1 + .../cp_async_bulk_feature_test.pass.cpp | 1 + .../cp_async_bulk_ptx_compiles.pass.cpp | 1 + .../barrier/cp_async_bulk_tensor.pass.cpp | 1 + .../barrier/cp_async_bulk_tensor_1d.pass.cpp | 1 + .../barrier/cp_async_bulk_tensor_2d.pass.cpp | 1 + .../barrier/cp_async_bulk_tensor_3d.pass.cpp | 1 + .../barrier/cp_async_bulk_tensor_4d.pass.cpp | 1 + .../barrier/cp_async_bulk_tensor_5d.pass.cpp | 1 + .../barrier/cp_async_bulk_tensor_generic.h | 3 - .../cuda/barrier/expect_tx_cta.pass.cpp | 1 + .../cuda/barrier/expect_tx_device.runfail.cpp | 1 + .../cuda/barrier/expect_tx_thread.pass.cpp | 1 + .../cuda/barrier/expect_tx_warp.pass.cpp | 1 + .../memcpy_async/memcpy_async_tx.pass.cpp | 1 + 29 files changed, 152 insertions(+), 89 deletions(-) rename libcudacxx/include/cuda/std/detail/libcxx/include/{__cuda/ptx/ptx_isa_target_macros.h => __cccl/ptx_isa.h} (58%) diff --git a/libcudacxx/include/cuda/barrier b/libcudacxx/include/cuda/barrier index 15182816c0..3c21d0c9b1 100644 --- a/libcudacxx/include/cuda/barrier +++ b/libcudacxx/include/cuda/barrier @@ -40,7 +40,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL // capability 9.0 and above. The check for (!defined(__CUDA_MINIMUM_ARCH__)) is // necessary to prevent cudafe from ripping out the device functions before // device compilation begins.
-#if (!defined(__CUDA_MINIMUM_ARCH__)) || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) +#ifdef __cccl_lib_experimental_ctk12_cp_async_exposure // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk inline _LIBCUDACXX_DEVICE @@ -278,7 +278,7 @@ void cp_async_bulk_wait_group_read() : "memory"); } -#endif // __CUDA_MINIMUM_ARCH__ +#endif // __cccl_lib_experimental_ctk12_cp_async_exposure _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h similarity index 58% rename from libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h rename to libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h index 5dcccc5d1f..d612c9e4d1 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h @@ -1,19 +1,15 @@ -// -*- C++ -*- //===----------------------------------------------------------------------===// // // Part of libcu++, the C++ Standard Library for your entire system, // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// - -#ifndef _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_ -#define _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_ - -#include // __CUDA_MINIMUM_ARCH__ and friends +#ifndef __CCCL_PTX_ISA_H_ +#define __CCCL_PTX_ISA_H_ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header @@ -23,6 +19,8 @@ # pragma system_header #endif // no system header +#include // __CUDA_MINIMUM_ARCH__ and friends + /* * Targeting macros * @@ -31,47 +29,75 @@ */ // PTX ISA 8.3 is available from CUDA 12.3, driver r545 -#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__)) +// The first define is for future major versions of CUDACC. +// We make sure that these get the highest known PTX ISA version. 
+#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ > 12)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 830ULL +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 830ULL // PTX ISA 8.2 is available from CUDA 12.2, driver r535 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 820ULL // PTX ISA 8.1 is available from CUDA 12.1, driver r530 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 810ULL // PTX ISA 8.0 is available from CUDA 12.0, driver r525 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 800ULL // PTX ISA 7.8 is available from CUDA 11.8, driver r520 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 780ULL // PTX ISA 7.7 is available from CUDA 11.7, driver r515 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 7)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 7)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 770ULL // PTX ISA 7.6 is available from CUDA 11.6, driver r510 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 6)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 6)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 760ULL // PTX ISA 7.5 is available from CUDA 11.5, driver r495 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 5)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 5)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 750ULL // PTX ISA 7.4 is available from CUDA 11.4, driver r470 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 4)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 4)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 740ULL // PTX ISA 7.3 is available from CUDA 11.3, driver r465 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 3)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define 
__cccl_ptx_isa 730ULL // PTX ISA 7.2 is available from CUDA 11.2, driver r460 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 2)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 720ULL // PTX ISA 7.1 is available from CUDA 11.1, driver r455 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 1)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 710ULL // PTX ISA 7.0 is available from CUDA 11.0, driver r445 -#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__)) +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 0)) \ + || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 700ULL // Fallback case. Define the ISA version to be zero. This ensures that the macro is always defined. #else # define __cccl_ptx_isa 0ULL #endif -#endif // _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_ +// We define certain feature test macros depending on availability. When +// __CUDA_MINIMUM_ARCH__ is not available, we define the following features +// depending on PTX ISA. This permits checking for the feature in host code. +// When __CUDA_MINIMUM_ARCH__ is available, we only enable the feature when the +// hardware supports it. +#if (!defined(__CUDA_MINIMUM_ARCH__)) \ + || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) && __cccl_ptx_isa >= 800 +# define __cccl_lib_local_barrier_arrive_tx +# define __cccl_lib_experimental_ctk12_cp_async_exposure +#endif + +#endif // __CCCL_PTX_ISA_H_ diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl_config b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl_config index afdabbbb11..c1d2a381a5 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl_config +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl_config @@ -187,6 +187,8 @@ # define _CCCL_NV_DIAG_DEFAULT(_WARNING) #endif // other compilers +#include "__cccl/ptx_isa.h" +#include "__cccl/version.h" #include "__cccl/visibility.h" #endif // __CCCL_CONFIG diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h index d73d1c2dbb..d5015594f6 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h @@ -572,11 +572,8 @@ inline _CUDA_VSTD::uint64_t * barrier_native_handle(barrier #if defined(_CCCL_CUDA_COMPILER) -// Hide arrive_tx when CUDA architecture is insufficient. Note the -// (!defined(__CUDA_MINIMUM_ARCH__)). This is required to make sure the function -// does not get removed by cudafe, which does not define __CUDA_MINIMUM_ARCH__. 
-#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) - +#if __cccl_ptx_isa >= 800 +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__(); _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_DEVICE inline barrier::arrival_token barrier_arrive_tx( barrier & __b, @@ -591,7 +588,7 @@ barrier::arrival_token barrier_arrive_tx( _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1."); barrier::arrival_token __token = {}; - NV_IF_TARGET( + NV_IF_ELSE_TARGET( // On architectures pre-sm90, arrive_tx is not supported. NV_PROVIDES_SM_90, ( // We do not check for the statespace of the barrier here. This is @@ -619,11 +616,47 @@ barrier::arrival_token barrier_arrive_tx( _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __native_handle, __arrive_count_update ); } + ),( + __cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__(); ) ); return __token; } +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_barrier_expect_tx_is_not_supported_before_SM_90__(); +_LIBCUDACXX_DEVICE inline +void barrier_expect_tx( + barrier & __b, + _CUDA_VSTD::ptrdiff_t __transaction_count_update) { + + _LIBCUDACXX_DEBUG_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); + _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative."); + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object + _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1."); + + // We do not check for the statespace of the barrier here. This is + // on purpose. This allows debugging tools like memcheck/racecheck + // to detect that we are passing a pointer with the wrong state + // space to mbarrier.arrive. If we checked for the state space here, + // and __trap() if wrong, then those tools would not be able to help + // us in release builds. In debug builds, the error would be caught + // by the asserts at the top of this function. + NV_IF_ELSE_TARGET( + // On architectures pre-sm90, expect_tx is not supported. + NV_PROVIDES_SM_90, ( + auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b)); + asm ( + "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) + : "memory"); + ),( + __cuda_ptx_barrier_expect_tx_is_not_supported_before_SM_90__(); + )); +} + +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_memcpy_async_tx_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx( _Tp* __dest, @@ -643,6 +676,7 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx( _LIBCUDACXX_DEBUG_ASSERT(__isShared(__dest), "dest must point to shared memory."); _LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "src must point to global memory."); + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b)); if (__isShared(__dest) && __isGlobal(__src)) { asm volatile( @@ -660,36 +694,13 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx( // is not yet implemented. So we trap in this case as well.
_LIBCUDACXX_UNREACHABLE(); } + ),( + __cuda_ptx_memcpy_async_tx_is_not_supported_before_SM_90__(); + )); return async_contract_fulfillment::async; } - -_LIBCUDACXX_DEVICE inline -void barrier_expect_tx( - barrier & __b, - _CUDA_VSTD::ptrdiff_t __transaction_count_update) { - - _LIBCUDACXX_DEBUG_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); - _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative."); - // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object - _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1."); - - // We do not check for the statespace of the barrier here. This is - // on purpose. This allows debugging tools like memcheck/racecheck - // to detect that we are passing a pointer with the wrong state - // space to mbarrier.arrive. If we checked for the state space here, - // and __trap() if wrong, then those tools would not be able to help - // us in release builds. In debug builds, the error would be caught - // by the asserts at the top of this function. - auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b)); - asm ( - "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" - : - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) - : "memory"); -} -#endif // __CUDA_MINIMUM_ARCH__ +#endif // __cccl_ptx_isa >= 800 #endif // _CCCL_CUDA_COMPILER _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE @@ -763,6 +774,7 @@ _CUDA_VSTD::uint64_t * __try_get_barrier_handle<::cuda::thread_scope_block, _CUD // The user is still responsible for arriving and waiting on (or otherwise // synchronizing with) the barrier or pipeline barrier to see the results of // copies from other threads participating in the synchronization object. +extern "C" _LIBCUDACXX_HOST_DEVICE void __cuda_ptx_mbarrier_complete_tx_is_not_supported_before_SM_90__(); struct __memcpy_completion_impl { template @@ -796,13 +808,17 @@ struct __memcpy_completion_impl { // bulk group to be used with shared memory barriers. _LIBCUDACXX_UNREACHABLE(); case __completion_mechanism::__mbarrier_complete_tx: +#if __cccl_ptx_isa >= 800 // Pre-sm90, the mbarrier_complete_tx completion mechanism is not available. NV_IF_TARGET(NV_PROVIDES_SM_90, ( // Only perform the expect_tx operation with the leader thread if (__group.thread_rank() == 0) { ::cuda::device::barrier_expect_tx(__barrier, __size); } + ),( + __cuda_ptx_mbarrier_complete_tx_is_not_supported_before_SM_90__(); )); +#endif // __cccl_ptx_isa >= 800 return async_contract_fulfillment::async; case __completion_mechanism::__sync: // sync: In this case, we do not need to do anything. The user will have @@ -929,11 +945,13 @@ struct __memcpy_completion_impl { * 5. 
normal synchronous copy (fallback) ***********************************************************************/ -#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) +#if __cccl_ptx_isa >= 800 +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__(); template inline __device__ void __cp_async_bulk_shared_global(const _Group &__g, char * __dest, const char * __src, size_t __size, uint64_t *__bar_handle) { // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( if (__g.thread_rank() == 0) { asm volatile( "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n" @@ -944,10 +962,13 @@ void __cp_async_bulk_shared_global(const _Group &__g, char * __dest, const char "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__bar_handle))) : "memory"); } + ),( + __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__(); + )); } -#endif // __CUDA_MINIMUM_ARCH__ +#endif // __cccl_ptx_isa >= 800 -#if (defined(__CUDA_MINIMUM_ARCH__) && 800 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__(); template inline __device__ void __cp_async_shared_global(char * __dest, const char * __src) { @@ -959,6 +980,7 @@ void __cp_async_shared_global(char * __dest, const char * __src) { static_assert(_Copy_size == 4 || _Copy_size == 8 || _Copy_size == 16, "cp.async.shared.global requires a copy size of 4, 8, or 16."); #endif // _LIBCUDACXX_STD_VER >= 17 + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( asm volatile( "cp.async.ca.shared.global [%0], [%1], %2, %2;" : @@ -966,6 +988,9 @@ void __cp_async_shared_global(char * __dest, const char * __src) { "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))), "n"(_Copy_size) : "memory"); + ),( + __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__(); + )); } template <> @@ -973,6 +998,7 @@ inline __device__ void __cp_async_shared_global<16>(char * __dest, const char * __src) { // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async // When copying 16 bytes, it is possible to skip L1 cache (.cg). 
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( asm volatile( "cp.async.cg.shared.global [%0], [%1], %2, %2;" : @@ -980,6 +1006,9 @@ void __cp_async_shared_global<16>(char * __dest, const char * __src) { "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))), "n"(16) : "memory"); + ),( + __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__(); + )); } template @@ -1002,7 +1031,6 @@ void __cp_async_shared_global_mechanism(_Group __g, char * __dest, const char * __cp_async_shared_global<__copy_size>(__dest + __offset, __src + __offset); } } -#endif // __CUDA_MINIMUM_ARCH__ template struct __copy_chunk { @@ -1083,6 +1111,7 @@ __completion_mechanism __dispatch_memcpy_async_any_to_any(_Group const & __group template<_CUDA_VSTD::size_t _Align, typename _Group> _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_DEVICE inline __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & __group, char * __dest_char, char const * __src_char, _CUDA_VSTD::size_t __size, uint32_t __allowed_completions, uint64_t* __bar_handle) { +#if __cccl_ptx_isa >= 800 NV_IF_TARGET(NV_PROVIDES_SM_90, ( const bool __can_use_complete_tx = __allowed_completions & uint32_t(__completion_mechanism::__mbarrier_complete_tx); _LIBCUDACXX_DEBUG_ASSERT(__can_use_complete_tx == (nullptr != __bar_handle), "Pass non-null bar_handle if and only if can_use_complete_tx."); @@ -1094,6 +1123,7 @@ __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & _ } // Fallthrough to SM 80.. )); +#endif // __cccl_ptx_isa >= 800 NV_IF_TARGET(NV_PROVIDES_SM_80, ( if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (_Align >= 4) { diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h index 61d904fb5f..20ffafde10 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -26,7 +26,6 @@ #include // __CUDA_MINIMUM_ARCH__ and friends -#include "../__cuda/ptx/ptx_isa_target_macros.h" #include "../__cuda/ptx/ptx_dot_variants.h" #include "../__cuda/ptx/ptx_helper_functions.h" #include "../__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h" diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index 6b42a15128..6f7f1d7358 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -12,11 +12,6 @@ #ifndef _CUDA_PTX_PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ #define _CUDA_PTX_PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ -#include "ptx_dot_variants.h" -#include "ptx_helper_functions.h" -#include "ptx_isa_target_macros.h" -#include "../../cstdint" - #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -25,6 +20,10 @@ # pragma system_header #endif // no system header +#include "ptx_dot_variants.h" +#include "ptx_helper_functions.h" +#include "../../cstdint" + _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX /* diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/version 
b/libcudacxx/include/cuda/std/detail/libcxx/include/version index 449993d7ef..721044b9f1 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/version +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/version @@ -203,6 +203,14 @@ __cpp_lib_void_t 201411L #include <__config> #endif // __cuda_std__ +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + // If we are replacing the stl we need to define our own feature test macros. // However, if we are in __cuda_std__ we need to define our own symbols to not // conflict with the host stl. @@ -215,24 +223,6 @@ __cpp_lib_void_t 201411L #endif #endif // __cuda_std__ -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -// We define certain feature test macros depending on availability. When -// __CUDA_MINIMUM_ARCH__ is not available, we define the following features -// unconditionally. This permits checking for the feature in host code. When -// __CUDA_MINIMUM_ARCH__ is available, we only enable the feature when the -// hardware supports it. -#if (!defined(__CUDA_MINIMUM_ARCH__)) || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) -# define __cccl_lib_local_barrier_arrive_tx -# define __cccl_lib_experimental_ctk12_cp_async_exposure -#endif - // We unconditionally define `__cccl_lib_meow` so that there is only one place to set the value #if _LIBCUDACXX_STD_VER > 11 # define __cccl_lib_chrono_udls 201304L diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp index 8f82188e41..69ed6bb41c 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cta.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cta.pass.cpp index 0d2f5ca841..55038a41f1 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cta.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cta.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp index bcf6dcf622..eded8d4e2a 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_feature_test.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_feature_test.pass.cpp index 1a40fcb78a..8729995fef 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_feature_test.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_feature_test.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: 
libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp index b31278ead5..58b84d97de 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_static_asserts_pre_sm90.fail.cpp @@ -28,9 +28,9 @@ int main(int, char**){ // barrier_arrive_tx should fail on SM70 and SM80, because it is hidden. auto token = cuda::device::barrier_arrive_tx(bar, 1, 0); -#if defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__ +#ifdef __cccl_lib_local_barrier_arrive_tx static_assert(false, "Fail manually for SM90 and up."); -#endif // __CUDA_MINIMUM_ARCH__ +#endif // __cccl_lib_local_barrier_arrive_tx )); return 0; } diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_thread.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_thread.pass.cpp index a6ac3803bd..8dcb6645a3 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_thread.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_thread.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_warp.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_warp.pass.cpp index 4cc310d958..14aa35116a 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_warp.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_warp.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp index d32aea8572..db061cb4f6 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_feature_test.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_feature_test.pass.cpp index e98ee5e6aa..72b0667fbf 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_feature_test.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_feature_test.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_ptx_compiles.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_ptx_compiles.pass.cpp index eb4e9b876d..8b77ee210b 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_ptx_compiles.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_ptx_compiles.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor.pass.cpp index fb58b58315..7d68e887de 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: 
libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp index 6d890edc9b..a289c71c2d 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: c++11 // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp index e394515666..26e04f8d96 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: c++11 // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp index b10c2bfc26..88f45093c4 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: c++11 // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp index 8e7886db6a..0ba064cf93 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: c++11 // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp index 8be14c21c1..af1f27454d 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: c++11 // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // UNSUPPORTED: nvrtc // NVRTC_SKIP_KERNEL_RUN // This will have effect once PR 433 is merged (line above should be removed.) 
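The pattern applied above to every guarded instruction has two independent gates: `#if __cccl_ptx_isa >= 800` asks whether the toolkit can assemble the PTX at all, and `NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, ...)` asks whether the target architecture may execute it; the `extern "C"` declaration of a deliberately undefined symbol turns any call that survives to an unsupported architecture into a link-time error instead of letting cudafe silently drop the function. A minimal sketch of the pattern, condensed from the barrier.h changes above — `example_op` and its inline PTX are hypothetical placeholders, not part of this patch:

#if __cccl_ptx_isa >= 800 // gate 1: the toolkit can emit the required PTX
extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_example_op_is_not_supported_before_SM_90__();

_LIBCUDACXX_DEVICE inline void example_op()
{
  NV_IF_ELSE_TARGET(
    // gate 2: the target architecture can execute the instruction
    NV_PROVIDES_SM_90, (
      asm volatile("// hypothetical SM-90 instruction" ::: "memory");
    ),(
      // Reaching this on pre-SM-90 fails at link time via the undefined symbol,
      // which is far easier to diagnose than silently removed device code.
      __cuda_ptx_example_op_is_not_supported_before_SM_90__();
    ));
}
#endif // __cccl_ptx_isa >= 800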
diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h index 8d29fa6df1..6ac6e40db1 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h @@ -7,9 +7,6 @@ // SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// -// -// UNSUPPORTED: libcpp-has-no-threads -// UNSUPPORTED: pre-sm-90 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_cta.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_cta.pass.cpp index 771eb6fde7..57b871e424 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_cta.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_cta.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp index 6da34c14af..c2e36ce997 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_thread.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_thread.pass.cpp index 5ec98c26ee..9a347e0e7d 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_thread.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_thread.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_warp.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_warp.pass.cpp index bf626a902b..b917e9e036 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_warp.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_warp.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // diff --git a/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp index 6614bfb51f..bd82a594ac 100644 --- a/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp @@ -10,6 +10,7 @@ // // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 +// UNSUPPORTED: nvcc-11 // From a4234b24c245c06e19e9dec0bff3268dbfe50bc7 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Wed, 21 Feb 2024 11:53:39 +0100 Subject: [PATCH 2/6] Do not use VLAs in `cp_async_bulk_tensor_*` tests VLAs are a compiler extension and are correctly errored out by some compilers. 
As we always know the exact size of the array anyway, just switch to a `cuda::std::array` Fixes nvbug4476664 --- .../barrier/cp_async_bulk_tensor_1d.pass.cpp | 19 ++-- .../barrier/cp_async_bulk_tensor_2d.pass.cpp | 19 ++-- .../barrier/cp_async_bulk_tensor_3d.pass.cpp | 19 ++-- .../barrier/cp_async_bulk_tensor_4d.pass.cpp | 19 ++-- .../barrier/cp_async_bulk_tensor_5d.pass.cpp | 19 ++-- .../barrier/cp_async_bulk_tensor_generic.h | 90 +++++++++---------- 6 files changed, 96 insertions(+), 89 deletions(-) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp index a289c71c2d..dad966ded7 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp @@ -17,6 +17,9 @@ // +#include +#include + #include "cp_async_bulk_tensor_generic.h" // Define the size of contiguous tensor in global and shared memory. @@ -26,21 +29,20 @@ // offset. // // We have a separate variable for host and device because a constexpr -// std::initializer_list cannot be shared between host and device as some of its +// cuda::std::array cannot be shared between host and device as some of its // member functions take a const reference, which is unsupported by nvcc. - constexpr std::initializer_list GMEM_DIMS {256}; -__device__ constexpr std::initializer_list GMEM_DIMS_DEV{256}; - constexpr std::initializer_list SMEM_DIMS {32}; -__device__ constexpr std::initializer_list SMEM_DIMS_DEV{32}; + constexpr cuda::std::array GMEM_DIMS {256}; +__device__ constexpr cuda::std::array GMEM_DIMS_DEV{256}; + constexpr cuda::std::array SMEM_DIMS {32}; +__device__ constexpr cuda::std::array SMEM_DIMS_DEV{32}; -__device__ constexpr std::initializer_list TEST_SMEM_COORDS[] = { +__device__ constexpr cuda::std::array TEST_SMEM_COORDS[] = { {0}, {4}, {8} }; constexpr size_t gmem_len = tensor_len(GMEM_DIMS); -constexpr size_t smem_len = tensor_len(SMEM_DIMS); __device__ int gmem_tensor[gmem_len]; @@ -54,9 +56,10 @@ int main(int, char**) ), NV_IS_DEVICE, ( for (auto smem_coord : TEST_SMEM_COORDS) { - test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); + test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); } ) ); + (void)SMEM_DIMS; return 0; } diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp index 26e04f8d96..4d55b4fbbc 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp @@ -17,6 +17,9 @@ // +#include +#include + #include "cp_async_bulk_tensor_generic.h" // Define the size of contiguous tensor in global and shared memory. @@ -26,14 +29,14 @@ // offset. // // We have a separate variable for host and device because a constexpr -// std::initializer_list cannot be shared between host and device as some of its +// cuda::std::array cannot be shared between host and device as some of its // member functions take a const reference, which is unsupported by nvcc.
- constexpr std::initializer_list GMEM_DIMS {8, 11}; -__device__ constexpr std::initializer_list GMEM_DIMS_DEV{8, 11}; - constexpr std::initializer_list SMEM_DIMS {4, 2}; -__device__ constexpr std::initializer_list SMEM_DIMS_DEV{4, 2}; + constexpr cuda::std::array GMEM_DIMS {8, 11}; +__device__ constexpr cuda::std::array GMEM_DIMS_DEV{8, 11}; + constexpr cuda::std::array SMEM_DIMS {4, 2}; +__device__ constexpr cuda::std::array SMEM_DIMS_DEV{4, 2}; -__device__ constexpr std::initializer_list TEST_SMEM_COORDS[] = { +__device__ constexpr cuda::std::array TEST_SMEM_COORDS[] = { {0, 0}, {4, 1}, {4, 5}, @@ -41,7 +44,6 @@ __device__ constexpr std::initializer_list TEST_SMEM_COORDS[] = { }; constexpr size_t gmem_len = tensor_len(GMEM_DIMS); -constexpr size_t smem_len = tensor_len(SMEM_DIMS); __device__ int gmem_tensor[gmem_len]; @@ -55,9 +57,10 @@ int main(int, char**) ), NV_IS_DEVICE, ( for (auto smem_coord : TEST_SMEM_COORDS) { - test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); + test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); } ) ); + (void)SMEM_DIMS; return 0; } diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp index 88f45093c4..051486be45 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp @@ -17,6 +17,9 @@ // +#include +#include + #include "cp_async_bulk_tensor_generic.h" // Define the size of contiguous tensor in global and shared memory. @@ -26,21 +29,20 @@ // offset. // // We have a separate variable for host and device because a constexpr -// std::initializer_list cannot be shared between host and device as some of its +// cuda::std::array cannot be shared between host and device as some of its // member functions take a const reference, which is unsupported by nvcc. - constexpr std::initializer_list GMEM_DIMS {8, 11, 13}; -__device__ constexpr std::initializer_list GMEM_DIMS_DEV{8, 11, 13}; - constexpr std::initializer_list SMEM_DIMS {4, 2, 4}; -__device__ constexpr std::initializer_list SMEM_DIMS_DEV{4, 2, 4}; + constexpr cuda::std::array GMEM_DIMS {8, 11, 13}; +__device__ constexpr cuda::std::array GMEM_DIMS_DEV{8, 11, 13}; + constexpr cuda::std::array SMEM_DIMS {4, 2, 4}; +__device__ constexpr cuda::std::array SMEM_DIMS_DEV{4, 2, 4}; -__device__ constexpr std::initializer_list TEST_SMEM_COORDS[] = { +__device__ constexpr cuda::std::array TEST_SMEM_COORDS[] = { {0, 0, 0}, {4, 1, 3}, {4, 5, 1} }; constexpr size_t gmem_len = tensor_len(GMEM_DIMS); -constexpr size_t smem_len = tensor_len(SMEM_DIMS); __device__ int gmem_tensor[gmem_len]; @@ -54,9 +56,10 @@ int main(int, char**) ), NV_IS_DEVICE, ( for (auto smem_coord : TEST_SMEM_COORDS) { - test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); + test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); } ) ); + (void)SMEM_DIMS; return 0; } diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp index 0ba064cf93..78f8da4a87 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp @@ -17,6 +17,9 @@ // +#include +#include + #include "cp_async_bulk_tensor_generic.h" // Define the size of contiguous tensor in global and shared memory. 
@@ -26,14 +29,14 @@ // offset. // // We have a separate variable for host and device because a constexpr -// std::initializer_list cannot be shared between host and device as some of its +// cuda::std::array cannot be shared between host and device as some of its // member functions take a const reference, which is unsupported by nvcc. - constexpr std::initializer_list GMEM_DIMS {8, 11, 13, 3}; -__device__ constexpr std::initializer_list GMEM_DIMS_DEV{8, 11, 13, 3}; - constexpr std::initializer_list SMEM_DIMS {4, 2, 4, 1}; -__device__ constexpr std::initializer_list SMEM_DIMS_DEV{4, 2, 4, 1}; + constexpr cuda::std::array GMEM_DIMS {8, 11, 13, 3}; +__device__ constexpr cuda::std::array GMEM_DIMS_DEV{8, 11, 13, 3}; + constexpr cuda::std::array SMEM_DIMS {4, 2, 4, 1}; +__device__ constexpr cuda::std::array SMEM_DIMS_DEV{4, 2, 4, 1}; -__device__ constexpr std::initializer_list TEST_SMEM_COORDS[] = { +__device__ constexpr cuda::std::array TEST_SMEM_COORDS[] = { {0, 0, 0, 0}, {4, 1, 3, 0}, {4, 8, 7, 2}, @@ -41,7 +44,6 @@ __device__ constexpr std::initializer_list TEST_SMEM_COORDS[] = { }; constexpr size_t gmem_len = tensor_len(GMEM_DIMS); -constexpr size_t smem_len = tensor_len(SMEM_DIMS); __device__ int gmem_tensor[gmem_len]; @@ -55,9 +57,10 @@ int main(int, char**) ), NV_IS_DEVICE, ( for (auto smem_coord : TEST_SMEM_COORDS) { - test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); + test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); } ) ); + (void)SMEM_DIMS; return 0; } diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp index af1f27454d..5069803d2a 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp @@ -17,6 +17,9 @@ // +#include +#include + #include "cp_async_bulk_tensor_generic.h" // Define the size of contiguous tensor in global and shared memory. @@ -26,21 +29,20 @@ // offset. // // We have a separate variable for host and device because a constexpr -// std::initializer_list cannot be shared between host and device as some of its +// cuda::std::array cannot be shared between host and device as some of its // member functions take a const reference, which is unsupported by nvcc. 
- constexpr std::initializer_list GMEM_DIMS {8, 11, 13, 3, 3}; -__device__ constexpr std::initializer_list GMEM_DIMS_DEV{8, 11, 13, 3, 3}; - constexpr std::initializer_list SMEM_DIMS {4, 2, 4, 1, 1}; -__device__ constexpr std::initializer_list SMEM_DIMS_DEV{4, 2, 4, 1, 1}; + constexpr cuda::std::array GMEM_DIMS {8, 11, 13, 3, 3}; +__device__ constexpr cuda::std::array GMEM_DIMS_DEV{8, 11, 13, 3, 3}; + constexpr cuda::std::array SMEM_DIMS {4, 2, 4, 1, 1}; +__device__ constexpr cuda::std::array SMEM_DIMS_DEV{4, 2, 4, 1, 1}; -__device__ constexpr std::initializer_list TEST_SMEM_COORDS[] = { +__device__ constexpr cuda::std::array TEST_SMEM_COORDS[] = { {0, 0, 0, 0, 0}, {4, 1, 3, 0, 1}, {4, 5, 1, 1, 2} }; constexpr size_t gmem_len = tensor_len(GMEM_DIMS); -constexpr size_t smem_len = tensor_len(SMEM_DIMS); __device__ int gmem_tensor[gmem_len]; @@ -54,9 +56,10 @@ int main(int, char**) ), NV_IS_DEVICE, ( for (auto smem_coord : TEST_SMEM_COORDS) { - test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); + test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); } ) ); + (void)SMEM_DIMS; return 0; } diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h index 6ac6e40db1..83be47f005 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h @@ -14,6 +14,7 @@ #define TEST_CP_ASYNC_BULK_TENSOR_GENERIC_H_ #include +#include #include // cuda::std::move #include "test_macros.h" // TEST_NV_DIAG_SUPPRESS @@ -37,24 +38,26 @@ namespace cde = cuda::device::experimental; */ // Compute the total number of elements in a tensor -constexpr __host__ __device__ int tensor_len(std::initializer_list dims) { - int len = 1; +template +constexpr __host__ __device__ int tensor_len(cuda::std::array dims) { + T len = 1; for (int d : dims) { len *= d; } - return len; + return static_cast(len); } // Function to convert: // a linear index into a shared memory tensor // into // a linear index into a global memory tensor. 
+template inline __device__ int smem_lin_idx_to_gmem_lin_idx( int smem_lin_idx, - std::initializer_list smem_coord, - std::initializer_list smem_dims, - std::initializer_list gmem_dims) { + cuda::std::array smem_coord, + cuda::std::array smem_dims, + cuda::std::array gmem_dims) { assert(smem_coord.size() == smem_dims.size()); assert(smem_coord.size() == gmem_dims.size()); @@ -70,38 +73,36 @@ int smem_lin_idx_to_gmem_lin_idx( return gmem_lin_idx; } +template __device__ inline void cp_tensor_global_to_shared( CUtensorMap* tensor_map, - std::initializer_list indices, + cuda::std::array indices, void *smem, barrier &bar) { - const int* idxs = indices.begin(); - switch (indices.size()) { - case 1: cde::cp_async_bulk_tensor_1d_global_to_shared(smem, tensor_map, idxs[0], bar); break; - case 2: cde::cp_async_bulk_tensor_2d_global_to_shared(smem, tensor_map, idxs[0], idxs[1], bar); break; - case 3: cde::cp_async_bulk_tensor_3d_global_to_shared(smem, tensor_map, idxs[0], idxs[1], idxs[2], bar); break; - case 4: cde::cp_async_bulk_tensor_4d_global_to_shared(smem, tensor_map, idxs[0], idxs[1], idxs[2], idxs[3], bar); break; - case 5: cde::cp_async_bulk_tensor_5d_global_to_shared(smem, tensor_map, idxs[0], idxs[1], idxs[2], idxs[3], idxs[4], bar); break; + case 1: cde::cp_async_bulk_tensor_1d_global_to_shared(smem, tensor_map, indices[0], bar); break; + case 2: cde::cp_async_bulk_tensor_2d_global_to_shared(smem, tensor_map, indices[0], indices[1], bar); break; + case 3: cde::cp_async_bulk_tensor_3d_global_to_shared(smem, tensor_map, indices[0], indices[1], indices[2], bar); break; + case 4: cde::cp_async_bulk_tensor_4d_global_to_shared(smem, tensor_map, indices[0], indices[1], indices[2], indices[3], bar); break; + case 5: cde::cp_async_bulk_tensor_5d_global_to_shared(smem, tensor_map, indices[0], indices[1], indices[2], indices[3], indices[4], bar); break; default: assert(false && "Wrong number of dimensions."); } } +template __device__ inline void cp_tensor_shared_to_global( CUtensorMap* tensor_map, - std::initializer_list indices, + cuda::std::array indices, void *smem) { - const int* idxs = indices.begin(); - switch (indices.size()) { - case 1: cde::cp_async_bulk_tensor_1d_shared_to_global(tensor_map, idxs[0], smem); break; - case 2: cde::cp_async_bulk_tensor_2d_shared_to_global(tensor_map, idxs[0], idxs[1], smem); break; - case 3: cde::cp_async_bulk_tensor_3d_shared_to_global(tensor_map, idxs[0], idxs[1], idxs[2], smem); break; - case 4: cde::cp_async_bulk_tensor_4d_shared_to_global(tensor_map, idxs[0], idxs[1], idxs[2], idxs[3], smem); break; - case 5: cde::cp_async_bulk_tensor_5d_shared_to_global(tensor_map, idxs[0], idxs[1], idxs[2], idxs[3], idxs[4], smem); break; + case 1: cde::cp_async_bulk_tensor_1d_shared_to_global(tensor_map, indices[0], smem); break; + case 2: cde::cp_async_bulk_tensor_2d_shared_to_global(tensor_map, indices[0], indices[1], smem); break; + case 3: cde::cp_async_bulk_tensor_3d_shared_to_global(tensor_map, indices[0], indices[1], indices[2], smem); break; + case 4: cde::cp_async_bulk_tensor_4d_shared_to_global(tensor_map, indices[0], indices[1], indices[2], indices[3], smem); break; + case 5: cde::cp_async_bulk_tensor_5d_shared_to_global(tensor_map, indices[0], indices[1], indices[2], indices[3], indices[4], smem); break; default: assert(false && "Wrong number of dimensions."); } @@ -129,10 +130,10 @@ __constant__ fake_cutensormap global_fake_tensor_map; * 5. It writes the tile back to global memory * 6. It checks that all the values in global are properly modified. 
*/ -template -__device__ void test(std::initializer_list smem_coord, - std::initializer_list smem_dims, - std::initializer_list gmem_dims, +template +__device__ void test(cuda::std::array smem_coord, + cuda::std::array smem_dims, + cuda::std::array gmem_dims, int* gmem_tensor, int gmem_len) { @@ -208,33 +209,24 @@ PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled() { #endif #ifndef TEST_COMPILER_NVRTC -template -CUtensorMap map_encode(T *tensor_ptr, std::initializer_list gmem_dims, std::initializer_list smem_dims) { +template +CUtensorMap map_encode(T *tensor_ptr, const cuda::std::array& gmem_dims, const cuda::std::array& smem_dims) { // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html CUtensorMap tensor_map{}; - assert(gmem_dims.size() == smem_dims.size()); - // rank is the number of dimensions of the array. - int rank = gmem_dims.size(); - uint64_t size[rank]; - for (int i = 0; i < rank; ++i) { - size[i] = gmem_dims.begin()[i]; - } + // The stride is the number of bytes to traverse from the first element of one row to the next. // It must be a multiple of 16. - uint64_t stride[rank - 1]; - int base_stride = sizeof(T); - for (int i = 0; i < rank - 1; ++i) { - base_stride *= gmem_dims.begin()[i]; + uint64_t stride[num_dims - 1]; + uint64_t base_stride = sizeof(T); + for (size_t i = 0; i < num_dims - 1; ++i) { + base_stride *= gmem_dims[i]; stride[i] = base_stride; } - // The box_size is the size of the shared memory buffer that is used as the - // destination of a TMA transfer. Casting from int -> uint32_t. - const uint32_t *box_size = reinterpret_cast(smem_dims.begin()); // The distance between elements in units of sizeof(element). A stride of 2 // can be used to load only the real component of a complex-valued tensor, for instance. - uint32_t elem_stride[rank]; // = {1, .., 1}; - for (int i = 0; i < rank; ++i) { + uint32_t elem_stride[num_dims]; // = {1, .., 1}; + for (size_t i = 0; i < num_dims; ++i) { elem_stride[i] = 1; } @@ -245,11 +237,11 @@ CUtensorMap map_encode(T *tensor_ptr, std::initializer_list gmem_dims, std: CUresult res = cuTensorMapEncodeTiled( &tensor_map, // CUtensorMap *tensorMap, CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_INT32, - rank, // cuuint32_t tensorRank, + num_dims, // cuuint32_t tensorRank, tensor_ptr, // void *globalAddress, - size, // const cuuint64_t *globalDim, + gmem_dims.data(), // const cuuint64_t *globalDim, stride, // const cuuint64_t *globalStrides, - box_size, // const cuuint32_t *boxDim, + smem_dims.data(), // const cuuint32_t *boxDim, elem_stride, // const cuuint32_t *elementStrides, CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE, CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_NONE, @@ -261,8 +253,8 @@ CUtensorMap map_encode(T *tensor_ptr, std::initializer_list gmem_dims, std: return tensor_map; } -template -void init_tensor_map(const T& gmem_tensor_symbol, std::initializer_list gmem_dims, std::initializer_list smem_dims) { +template +void init_tensor_map(const T& gmem_tensor_symbol, const cuda::std::array& gmem_dims, const cuda::std::array& smem_dims) { // Get pointer to gmem_tensor to create tensor map. 
int * tensor_ptr = nullptr; auto code = cudaGetSymbolAddress((void**)&tensor_ptr, gmem_tensor_symbol); From 868d9009891e27784b208f5fb4943cb5db7f69a9 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Wed, 21 Feb 2024 14:41:11 +0100 Subject: [PATCH 3/6] Use proper shared memory size Authored-by: Allard Hendriksen --- .../libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp | 4 ++-- .../libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp | 4 ++-- .../libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp | 4 ++-- .../libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp | 4 ++-- .../libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp | 4 ++-- .../libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp index dad966ded7..e6c3a13818 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp @@ -43,6 +43,7 @@ __device__ constexpr cuda::std::array TEST_SMEM_COORDS[] = { }; constexpr size_t gmem_len = tensor_len(GMEM_DIMS); +constexpr size_t smem_len = tensor_len(SMEM_DIMS); __device__ int gmem_tensor[gmem_len]; @@ -56,10 +57,9 @@ int main(int, char**) ), NV_IS_DEVICE, ( for (auto smem_coord : TEST_SMEM_COORDS) { - test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); + test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); } ) ); - (void)SMEM_DIMS; return 0; } diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp index 4d55b4fbbc..4f2e21b33e 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp @@ -44,6 +44,7 @@ __device__ constexpr cuda::std::array TEST_SMEM_COORDS[] = { }; constexpr size_t gmem_len = tensor_len(GMEM_DIMS); +constexpr size_t smem_len = tensor_len(SMEM_DIMS); __device__ int gmem_tensor[gmem_len]; @@ -57,10 +58,9 @@ int main(int, char**) ), NV_IS_DEVICE, ( for (auto smem_coord : TEST_SMEM_COORDS) { - test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); + test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); } ) ); - (void)SMEM_DIMS; return 0; } diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp index 051486be45..925f4084a5 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp @@ -43,6 +43,7 @@ __device__ constexpr cuda::std::array TEST_SMEM_COORDS[] = { }; constexpr size_t gmem_len = tensor_len(GMEM_DIMS); +constexpr size_t smem_len = tensor_len(SMEM_DIMS); __device__ int gmem_tensor[gmem_len]; @@ -56,10 +57,9 @@ int main(int, char**) ), NV_IS_DEVICE, ( for (auto smem_coord : TEST_SMEM_COORDS) { - test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); + test(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len); } ) ); - (void)SMEM_DIMS; return 0; } diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp 
diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp
index 78f8da4a87..60d820561f 100644
--- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp
@@ -44,6 +44,7 @@ __device__ constexpr cuda::std::array<uint32_t, 4> TEST_SMEM_COORDS[] = {
 };
 
 constexpr size_t gmem_len = tensor_len(GMEM_DIMS);
+constexpr size_t smem_len = tensor_len(SMEM_DIMS);
 
 __device__ int gmem_tensor[gmem_len];
 
@@ -57,10 +58,9 @@ int main(int, char**)
     ),
     NV_IS_DEVICE, (
       for (auto smem_coord : TEST_SMEM_COORDS) {
-        test<4>(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len);
+        test<smem_len>(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len);
       }
     )
   );
-  (void)SMEM_DIMS;
   return 0;
 }
diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp
index 5069803d2a..0a395156d4 100644
--- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp
@@ -43,6 +43,7 @@ __device__ constexpr cuda::std::array<uint32_t, 5> TEST_SMEM_COORDS[] = {
 };
 
 constexpr size_t gmem_len = tensor_len(GMEM_DIMS);
+constexpr size_t smem_len = tensor_len(SMEM_DIMS);
 
 __device__ int gmem_tensor[gmem_len];
 
@@ -56,10 +57,9 @@ int main(int, char**)
     ),
     NV_IS_DEVICE, (
       for (auto smem_coord : TEST_SMEM_COORDS) {
-        test<5>(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len);
+        test<smem_len>(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len);
       }
     )
   );
-  (void)SMEM_DIMS;
   return 0;
 }
diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h
index 83be47f005..97226b7bfd 100644
--- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h
+++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h
@@ -131,7 +131,7 @@ __constant__ fake_cutensormap global_fake_tensor_map;
  * 6. It checks that all the values in global are properly modified.
  */
 template <size_t smem_len, size_t num_dims>
-__device__ void test(cuda::std::array<uint32_t, smem_len> smem_coord,
+__device__ void test(cuda::std::array<uint32_t, num_dims> smem_coord,
                      cuda::std::array<uint32_t, num_dims> smem_dims,
                      cuda::std::array<uint32_t, num_dims> gmem_dims,
                      int* gmem_tensor,
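
The generic-header fix above and the new explicit `test<smem_len>(...)` calls go together: the first template parameter of `test` presumably sizes the shared-memory staging buffer, so it must carry the box's element count rather than the tensor rank or the global length. A sketch of the assumed shape; the buffer name, alignment, and body are illustrative and not taken from the patch:

    #include <cuda/std/array>
    #include <cstdint>

    template <size_t smem_len, size_t num_dims>
    __device__ void test_sketch(cuda::std::array<uint32_t, num_dims> smem_coord)
    {
      // Staging buffer for one TMA box, sized by the explicit template argument.
      __shared__ alignas(128) int smem_buffer[smem_len];
      if (threadIdx.x < smem_len) {
        smem_buffer[threadIdx.x] = static_cast<int>(smem_coord[0]);
      }
      __syncthreads();
    }
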
-extern "C" _LIBCUDACXX_HOST_DEVICE void __cuda_ptx_mbarrier_complete_tx_is_not_supported_before_SM_90__(); struct __memcpy_completion_impl { template @@ -815,8 +814,6 @@ struct __memcpy_completion_impl { if (__group.thread_rank() == 0) { ::cuda::device::barrier_expect_tx(__barrier, __size); } - ),( - __cuda_ptx_mbarrier_complete_tx_is_not_supported_before_SM_90__(); )); #endif // __cccl_ptx_isa >= 800 return async_contract_fulfillment::async; From c93c5bcc0f131497c8b32e6c311eaea9cd46c07c Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Thu, 22 Feb 2024 21:17:18 +0100 Subject: [PATCH 5/6] Ensure runfail tests do not fail without execution --- .../test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp | 1 + .../test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp | 1 + .../test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp | 1 + 3 files changed, 3 insertions(+) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp index 69ed6bb41c..d534da2bf5 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 // UNSUPPORTED: nvcc-11 +// UNSUPPORTED: no_execute // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp index eded8d4e2a..c7282517c3 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 // UNSUPPORTED: nvcc-11 +// UNSUPPORTED: no_execute // diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp index c2e36ce997..36aa9aa8ea 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp @@ -11,6 +11,7 @@ // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: pre-sm-90 // UNSUPPORTED: nvcc-11 +// UNSUPPORTED: no_execute // From 40ea920f0ef2420d5277788306eacf12b28a2711 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 23 Feb 2024 10:14:37 +0000 Subject: [PATCH 6/6] Ensure that __cccl_ptx_isa properly guards feature flags --- .../include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h index d612c9e4d1..68e218f29f 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h @@ -94,10 +94,12 @@ // depending on PTX ISA. This permits checking for the feature in host code. // When __CUDA_MINIMUM_ARCH__ is available, we only enable the feature when the // hardware supports it. 
From c93c5bcc0f131497c8b32e6c311eaea9cd46c07c Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa
Date: Thu, 22 Feb 2024 21:17:18 +0100
Subject: [PATCH 5/6] Ensure runfail tests do not fail without execution

---
 .../test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp | 1 +
 .../test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp  | 1 +
 .../test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp  | 1 +
 3 files changed, 3 insertions(+)

diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp
index 69ed6bb41c..d534da2bf5 100644
--- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_cluster.runfail.cpp
@@ -11,6 +11,7 @@
 // UNSUPPORTED: libcpp-has-no-threads
 // UNSUPPORTED: pre-sm-90
 // UNSUPPORTED: nvcc-11
+// UNSUPPORTED: no_execute
 
 // <cuda/barrier>
 
diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp
index eded8d4e2a..c7282517c3 100644
--- a/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/barrier/arrive_tx_device.runfail.cpp
@@ -11,6 +11,7 @@
 // UNSUPPORTED: libcpp-has-no-threads
 // UNSUPPORTED: pre-sm-90
 // UNSUPPORTED: nvcc-11
+// UNSUPPORTED: no_execute
 
 // <cuda/barrier>
 
diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp
index c2e36ce997..36aa9aa8ea 100644
--- a/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/barrier/expect_tx_device.runfail.cpp
@@ -11,6 +11,7 @@
 // UNSUPPORTED: libcpp-has-no-threads
 // UNSUPPORTED: pre-sm-90
 // UNSUPPORTED: nvcc-11
+// UNSUPPORTED: no_execute
 
 // <cuda/barrier>
 
From 40ea920f0ef2420d5277788306eacf12b28a2711 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa
Date: Fri, 23 Feb 2024 10:14:37 +0000
Subject: [PATCH 6/6] Ensure that __cccl_ptx_isa properly guards feature flags

---
 .../include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h
index d612c9e4d1..68e218f29f 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h
@@ -94,10 +94,12 @@
 // depending on PTX ISA. This permits checking for the feature in host code.
 // When __CUDA_MINIMUM_ARCH__ is available, we only enable the feature when the
 // hardware supports it.
+#if __cccl_ptx_isa >= 800
 #if (!defined(__CUDA_MINIMUM_ARCH__)) \
-    || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) && __cccl_ptx_isa >= 800
+    || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__)
 #  define __cccl_lib_local_barrier_arrive_tx
 #  define __cccl_lib_experimental_ctk12_cp_async_exposure
 #endif
+#endif // __cccl_ptx_isa >= 800
 
 #endif // __CCCL_PTX_ISA_H_
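
With the reordered guards, `__cccl_ptx_isa` now gates the feature macros before any architecture check, so compiling with an old toolkit for a new GPU no longer advertises instructions the compiler cannot encode. A hedged usage sketch; the kernel is illustrative, while the macro name is the one defined above:

    #include <cuda/barrier>

    __global__ void probe()
    {
    #ifdef __cccl_lib_experimental_ctk12_cp_async_exposure
      // Both PTX ISA >= 8.0 and, when the architecture is known, SM >= 90:
      // the experimental cp_async_bulk exposure may be used here.
    #endif
    }
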