NVIDIA · miscco · Jan 22, 2025 · Jan 22, 2025
@@ -26,6 +26,8 @@
 #include <cuda/std/cstddef>
 #include <cuda/std/cstdint>
 
+#if _CCCL_HAS_CUDA_COMPILER
+
 _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 
 template <int __n>
@@ -103,23 +105,25 @@ inline _CCCL_DEVICE _Tp* __from_ptr_gmem(_CUDA_VSTD::size_t __ptr)
 template <typename _Tp>
 inline _CCCL_DEVICE _CUDA_VSTD::uint32_t __as_b32(_Tp __val)
 {
-#if _CCCL_STD_VER >= 2017
+#  if _CCCL_STD_VER >= 2017
   static_assert(sizeof(_Tp) == 4, "");
-#endif // _CCCL_STD_VER >= 2017
+#  endif // _CCCL_STD_VER >= 2017
   // Consider using std::bit_cast
   return *reinterpret_cast<_CUDA_VSTD::uint32_t*>(&__val);
 }
 
 template <typename _Tp>
 inline _CCCL_DEVICE _CUDA_VSTD::uint64_t __as_b64(_Tp __val)
 {
-#if _CCCL_STD_VER >= 2017
+#  if _CCCL_STD_VER >= 2017
   static_assert(sizeof(_Tp) == 8, "");
-#endif // _CCCL_STD_VER >= 2017
+#  endif // _CCCL_STD_VER >= 2017
   // Consider using std::bit_cast
   return *reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__val);
 }
 
 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX
 
+#endif // _CCCL_HAS_CUDA_COMPILER
+
 #endif // _CUDA_PTX_HELPER_FUNCTIONS_H_
@@ -21,11 +21,12 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/std/cstddef>
 #include <cuda/std/cstdint>
 
 _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
 
-inline _CCCL_HOST_DEVICE void discard_memory(volatile void* __ptr, size_t __nbytes) noexcept
+inline _CCCL_HOST_DEVICE void discard_memory(volatile void* __ptr, _CUDA_VSTD::size_t __nbytes) noexcept
 {
   // The discard PTX instruction is only available with PTX ISA 7.4 and later
 #if __cccl_ptx_isa < 740ULL

@@ -141,6 +141,8 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/__memcpy_async/completion_mechanism.h>
+#include <cuda/__memcpy_async/memcpy_async_barrier.h>
 #include <cuda/atomic>
 #include <cuda/barrier>
 #include <cuda/std/chrono>

@@ -22,10 +22,6 @@
 #  pragma system_header
 #endif // no system header
 
-#if _CCCL_CUDA_COMPILER(CLANG)
-#  include <cuda_runtime_api.h>
-#endif // _CCCL_CUDA_COMPILER(CLANG)
-
 #include <cuda/std/__exception/terminate.h>
 
 #if !_CCCL_COMPILER(NVRTC)
@@ -40,8 +36,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
 /**
  * @brief Exception thrown when a CUDA error is encountered.
  */
-#if _CCCL_HAS_CUDA_COMPILER
-#  ifndef _CCCL_NO_EXCEPTIONS
+#ifndef _CCCL_NO_EXCEPTIONS
 class cuda_error : public ::std::runtime_error
 {
 private:
@@ -50,37 +45,36 @@ class cuda_error : public ::std::runtime_error
     char __buffer[256];
   };
 
-  static char* __format_cuda_error(::cudaError_t __status, const char* __msg, char* __msg_buffer) noexcept
+  static char* __format_cuda_error(const int __status, const char* __msg, char* __msg_buffer) noexcept
   {
     ::snprintf(__msg_buffer, 256, "cudaError %d: %s", __status, __msg);
     return __msg_buffer;
   }
 
 public:
-  cuda_error(::cudaError_t __status, const char* __msg, __msg_storage __msg_buffer = {0}) noexcept
+  cuda_error(const int __status, const char* __msg, __msg_storage __msg_buffer = {0}) noexcept
       : ::std::runtime_error(__format_cuda_error(__status, __msg, __msg_buffer.__buffer))
   {}
 };
 
-_CCCL_NORETURN _LIBCUDACXX_HIDE_FROM_ABI void __throw_cuda_error(::cudaError_t __status, const char* __msg)
+_CCCL_NORETURN _LIBCUDACXX_HIDE_FROM_ABI void __throw_cuda_error(const int __status, const char* __msg)
 {
   NV_IF_ELSE_TARGET(NV_IS_HOST,
                     (throw ::cuda::cuda_error(__status, __msg);),
                     ((void) __status; (void) __msg; _CUDA_VSTD_NOVERSION::terminate();))
 }
-#  else // ^^^ !_CCCL_NO_EXCEPTIONS ^^^ / vvv _CCCL_NO_EXCEPTIONS vvv
+#else // ^^^ !_CCCL_NO_EXCEPTIONS ^^^ / vvv _CCCL_NO_EXCEPTIONS vvv
 class cuda_error
 {
 public:
-  _LIBCUDACXX_HIDE_FROM_ABI cuda_error(::cudaError_t, const char*) noexcept {}
+  _LIBCUDACXX_HIDE_FROM_ABI cuda_error(const int, const char*) noexcept {}
 };
 
-_CCCL_NORETURN _LIBCUDACXX_HIDE_FROM_ABI void __throw_cuda_error(::cudaError_t, const char*)
+_CCCL_NORETURN _LIBCUDACXX_HIDE_FROM_ABI void __throw_cuda_error(const int, const char*)
 {
   _CUDA_VSTD_NOVERSION::terminate();
 }
-#  endif // _CCCL_NO_EXCEPTIONS
-#endif // _CCCL_CUDA_COMPILER
+#endif // _CCCL_NO_EXCEPTIONS
 
 _LIBCUDACXX_END_NAMESPACE_CUDA
 

@@ -129,6 +129,8 @@
  * (v. August 20, 2021)
  */
 
+#include <cuda_runtime_api.h>
+
 _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
 
 namespace __detail_ap

@@ -137,15 +137,16 @@ namespace __detail_ap
 template <typename _Property>
 _CCCL_DEVICE void* __associate_address_space(void* __ptr, _Property __prop)
 {
+#if _CCCL_HAS_CUDA_COMPILER
   if (std::is_same<_Property, access_property::shared>::value == true)
   {
     bool __b = __isShared(__ptr);
     _CCCL_ASSERT(__b, "");
-#if defined(_CCCL_BUILTIN_ASSUME)
+#  if defined(_CCCL_BUILTIN_ASSUME)
     _CCCL_BUILTIN_ASSUME(__b);
-#else // ^^^ _CCCL_BUILTIN_ASSUME ^^^ / vvv !_CCCL_BUILTIN_ASSUME vvv
+#  else // ^^^ _CCCL_BUILTIN_ASSUME ^^^ / vvv !_CCCL_BUILTIN_ASSUME vvv
     (void) __b;
-#endif // !_CCCL_BUILTIN_ASSUME
+#  endif // !_CCCL_BUILTIN_ASSUME
   }
   else if (std::is_same<_Property, access_property::global>::value == true
            || std::is_same<_Property, access_property::normal>::value == true
@@ -155,12 +156,13 @@ _CCCL_DEVICE void* __associate_address_space(void* __ptr, _Property __prop)
   {
     bool __b = __isGlobal(__ptr);
     _CCCL_ASSERT(__b, "");
-#if defined(_CCCL_BUILTIN_ASSUME)
+#  if defined(_CCCL_BUILTIN_ASSUME)
     _CCCL_BUILTIN_ASSUME(__b);
-#else // ^^^ !_CCCL_BUILTIN_ASSUME ^^^ / vvv _CCCL_BUILTIN_ASSUME vvv
+#  else // ^^^ _CCCL_BUILTIN_ASSUME ^^^ / vvv !_CCCL_BUILTIN_ASSUME vvv
     (void) __b;
-#endif // !_CCCL_BUILTIN_ASSUME
+#  endif // !_CCCL_BUILTIN_ASSUME
   }
+#endif // _CCCL_HAS_CUDA_COMPILER
 
   return __ptr;
 }

@@ -38,9 +38,6 @@ private:
 }  // cuda
 */
 
-#include <cuda_runtime_api.h>
-// cuda_runtime_api needs to come first
-
 #include <cuda/std/detail/__config>
 
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
@@ -51,6 +48,8 @@ private:
 #  pragma system_header
 #endif // no system header
 
+#include <cuda_runtime_api.h>
+
 #include <cuda/std/__cuda/api_wrapper.h>
 #include <cuda/std/__exception/cuda_error.h>
 #include <cuda/std/cstddef>

@@ -2,12 +2,18 @@
 # without anything else but also pretends to be a std header
 add_custom_target(libcudacxx.test.public_headers_host_only)
 
+if ("NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
+  find_package(NVHPC)
+else()
+  find_package(CUDAToolkit)
+endif()
+
 # Grep all public headers
 file(GLOB public_headers_host_only
   LIST_DIRECTORIES false
   RELATIVE "${libcudacxx_SOURCE_DIR}/include/"
   CONFIGURE_DEPENDS
-  "${libcudacxx_SOURCE_DIR}/include/cuda/std/*"
+  "${libcudacxx_SOURCE_DIR}/include/cuda/*"
 )
 
 # mdspan is currently not supported on msvc outside of C++20
@@ -34,6 +40,13 @@ function(libcudacxx_add_std_header_test header)
   endif()
   target_compile_definitions(headertest_std_${header_name} PRIVATE CCCL_IGNORE_DEPRECATED_CPP_DIALECT)
 
+  # Headers within <cuda/> must build with a host-only compiler, but they still need cuda_runtime_api.h, so link the CUDA runtime
+  if ("NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
+    target_link_libraries(headertest_std_${header_name} NVHPC::CUDART)
+  else()
+    target_link_libraries(headertest_std_${header_name} CUDA::cudart)
+  endif()
+
   add_dependencies(libcudacxx.test.public_headers_host_only headertest_std_${header_name})
 endfunction()