Skip to content

Commit

Permalink
Unify assert handling in cccl (NVIDIA#2382)
Browse files Browse the repository at this point in the history
We currently do not have proper assertions within CCCL.

There are different approaches in CUB, Thrust, and libcu++, some of which are completely broken.

This tries to rework the assertion handlers so that they work uniformly everywhere and can be selectively enabled.
  • Loading branch information
miscco authored Oct 3, 2024
1 parent 5e139af commit 3eee9b2
Show file tree
Hide file tree
Showing 105 changed files with 727 additions and 858 deletions.
12 changes: 6 additions & 6 deletions cub/cub/device/dispatch/dispatch_transform.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ template <typename Integral>
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr auto round_up_to_po2_multiple(Integral x, Integral mult) -> Integral
{
#if _CCCL_STD_VER > 2011
_LIBCUDACXX_ASSERT(::cuda::std::has_single_bit(static_cast<::cuda::std::__make_unsigned_t<Integral>>(mult)), "");
_CCCL_ASSERT(::cuda::std::has_single_bit(static_cast<::cuda::std::__make_unsigned_t<Integral>>(mult)), "");
#endif // _CCCL_STD_VER > 2011
return (x + mult - 1) & ~(mult - 1);
}
Expand All @@ -177,7 +177,7 @@ template <typename T>
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE const char* round_down_ptr(const T* ptr, unsigned alignment)
{
#if _CCCL_STD_VER > 2011
_LIBCUDACXX_ASSERT(::cuda::std::has_single_bit(alignment), "");
_CCCL_ASSERT(::cuda::std::has_single_bit(alignment), "");
#endif // _CCCL_STD_VER > 2011
return reinterpret_cast<const char*>(
reinterpret_cast<::cuda::std::uintptr_t>(ptr) & ~::cuda::std::uintptr_t{alignment - 1});
Expand Down Expand Up @@ -278,8 +278,8 @@ _CCCL_DEVICE void bulk_copy_tile(

const char* src = aligned_ptr.ptr + global_offset * sizeof(T);
char* dst = smem + smem_offset;
_LIBCUDACXX_ASSERT(reinterpret_cast<uintptr_t>(src) % bulk_copy_alignment == 0, "");
_LIBCUDACXX_ASSERT(reinterpret_cast<uintptr_t>(dst) % bulk_copy_alignment == 0, "");
_CCCL_ASSERT(reinterpret_cast<uintptr_t>(src) % bulk_copy_alignment == 0, "");
_CCCL_ASSERT(reinterpret_cast<uintptr_t>(dst) % bulk_copy_alignment == 0, "");

// TODO(bgruber): we could precompute bytes_to_copy on the host
const int bytes_to_copy = round_up_to_po2_multiple(
Expand All @@ -303,8 +303,8 @@ _CCCL_DEVICE void bulk_copy_tile_fallback(
{
const T* src = aligned_ptr.ptr_to_elements() + global_offset;
T* dst = reinterpret_cast<T*>(smem + smem_offset + aligned_ptr.head_padding);
_LIBCUDACXX_ASSERT(reinterpret_cast<uintptr_t>(src) % alignof(T) == 0, "");
_LIBCUDACXX_ASSERT(reinterpret_cast<uintptr_t>(dst) % alignof(T) == 0, "");
_CCCL_ASSERT(reinterpret_cast<uintptr_t>(src) % alignof(T) == 0, "");
_CCCL_ASSERT(reinterpret_cast<uintptr_t>(dst) % alignof(T) == 0, "");

const int bytes_to_copy = static_cast<int>(sizeof(T)) * tile_size;
cooperative_groups::memcpy_async(cooperative_groups::this_thread_block(), dst, src, bytes_to_copy);
Expand Down
2 changes: 1 addition & 1 deletion cudax/include/cuda/experimental/__async/config.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ namespace cuda::experimental::__async
// Debuggers do not step into functions marked with __attribute__((__artificial__)).
// This is useful for small wrapper functions that just dispatch to other functions and
// that are inlined into the caller.
#if __has_attribute(__artificial__) && !defined(__CUDACC__)
#if _CCCL_HAS_ATTRIBUTE(__artificial__) && !defined(__CUDACC__)
# define _CUDAX_ARTIFICIAL __attribute__((__artificial__))
#else
# define _CUDAX_ARTIFICIAL
Expand Down
4 changes: 2 additions & 2 deletions cudax/include/cuda/experimental/__async/stop_token.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -303,8 +303,8 @@ _CCCL_HOST_DEVICE inline void __inplace_stop_callback_base::__register_callback(

_CCCL_HOST_DEVICE inline inplace_stop_source::~inplace_stop_source()
{
_LIBCUDACXX_ASSERT((__state_.load(_CUDA_VSTD::memory_order_relaxed) & __locked_flag) == 0, "");
_LIBCUDACXX_ASSERT(__callbacks_ == nullptr, "");
_CCCL_ASSERT((__state_.load(_CUDA_VSTD::memory_order_relaxed) & __locked_flag) == 0, "");
_CCCL_ASSERT(__callbacks_ == nullptr, "");
}

_CCCL_HOST_DEVICE inline auto inplace_stop_source::request_stop() noexcept -> bool
Expand Down
8 changes: 4 additions & 4 deletions cudax/include/cuda/experimental/__async/variant.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ public:
{
// make this local in case destroying the sub-object destroys *this
const auto index = __self.__index_;
_LIBCUDACXX_ASSERT(index != __npos, "");
_CCCL_ASSERT(index != __npos, "");
((_Idx == index
? static_cast<_Fn&&>(__fn)(static_cast<_As&&>(__as)..., static_cast<_Self&&>(__self).template __get<_Idx>())
: void()),
Expand All @@ -149,21 +149,21 @@ public:
template <size_t _Ny>
_CCCL_HOST_DEVICE __at<_Ny>&& __get() && noexcept
{
_LIBCUDACXX_ASSERT(_Ny == __index_, "");
_CCCL_ASSERT(_Ny == __index_, "");
return static_cast<__at<_Ny>&&>(*static_cast<__at<_Ny>*>(__ptr()));
}

template <size_t _Ny>
_CCCL_HOST_DEVICE __at<_Ny>& __get() & noexcept
{
_LIBCUDACXX_ASSERT(_Ny == __index_, "");
_CCCL_ASSERT(_Ny == __index_, "");
return *static_cast<__at<_Ny>*>(__ptr());
}

template <size_t _Ny>
_CCCL_HOST_DEVICE const __at<_Ny>& __get() const& noexcept
{
_LIBCUDACXX_ASSERT(_Ny == __index_, "");
_CCCL_ASSERT(_Ny == __index_, "");
return *static_cast<const __at<_Ny>*>(__ptr());
}
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ public:
nullptr, _CUDA_VSTD::exchange(__other.__static_vtable, nullptr))
, __vtable(__other)
{
_LIBCUDACXX_ASSERT(this->__static_vtable != nullptr, "copying from a moved-from object");
_CCCL_ASSERT(this->__static_vtable != nullptr, "copying from a moved-from object");
this->__static_vtable->__move_fn(&this->__object, &__other.__object);
}

Expand All @@ -165,7 +165,7 @@ public:
nullptr, _CUDA_VSTD::exchange(__other.__static_vtable, nullptr))
, __vtable(__other)
{
_LIBCUDACXX_ASSERT(this->__static_vtable != nullptr, "copying from a moved-from object");
_CCCL_ASSERT(this->__static_vtable != nullptr, "copying from a moved-from object");
this->__static_vtable->__move_fn(&this->__object, &__other.__object);
}

Expand Down Expand Up @@ -194,7 +194,7 @@ public:
: _CUDA_VMR::_Resource_base<_Alloc_type, _CUDA_VMR::_WrapperType::_Owning>(nullptr, __other.__static_vtable)
, __vtable(__other)
{
_LIBCUDACXX_ASSERT(this->__static_vtable != nullptr, "copying from a moved-from object");
_CCCL_ASSERT(this->__static_vtable != nullptr, "copying from a moved-from object");
this->__static_vtable->__copy_fn(&this->__object, &__other.__object);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,7 @@ public:
//! properly synchronize all relevant streams before calling `deallocate`.
void deallocate(void* __ptr, const size_t, const size_t __alignment = _CUDA_VMR::default_cuda_malloc_alignment)
{
_LIBCUDACXX_ASSERT(__is_valid_alignment(__alignment),
"Invalid alignment passed to async_memory_resource::deallocate.");
_CCCL_ASSERT(__is_valid_alignment(__alignment), "Invalid alignment passed to async_memory_resource::deallocate.");
_CCCL_ASSERT_CUDA_API(
::cudaFreeAsync, "async_memory_resource::deallocate failed", __ptr, __async_memory_resource_sync_stream().get());
__async_memory_resource_sync_stream().wait();
Expand Down Expand Up @@ -221,8 +220,7 @@ public:
void deallocate_async(void* __ptr, const size_t __bytes, const size_t __alignment, const ::cuda::stream_ref __stream)
{
// We need to ensure that the provided alignment matches the minimal provided alignment
_LIBCUDACXX_ASSERT(__is_valid_alignment(__alignment),
"Invalid alignment passed to async_memory_resource::deallocate.");
_CCCL_ASSERT(__is_valid_alignment(__alignment), "Invalid alignment passed to async_memory_resource::deallocate.");
deallocate_async(__ptr, __bytes, __stream);
(void) __alignment;
}
Expand Down
1 change: 1 addition & 0 deletions libcudacxx/include/cuda/__cccl_config
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#ifndef _CUDA__CCCL_CONFIG
#define _CUDA__CCCL_CONFIG

#include <cuda/std/__cccl/assert.h> // IWYU pragma: export
#include <cuda/std/__cccl/attributes.h> // IWYU pragma: export
#include <cuda/std/__cccl/builtin.h> // IWYU pragma: export
#include <cuda/std/__cccl/compiler.h> // IWYU pragma: export
Expand Down
7 changes: 3 additions & 4 deletions libcudacxx/include/cuda/__cmath/ceil_div.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
#include <cuda/std/__type_traits/is_unsigned.h>
#include <cuda/std/__type_traits/make_unsigned.h>
#include <cuda/std/__type_traits/underlying_type.h>
#include <cuda/std/detail/libcxx/include/__debug>

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

Expand All @@ -44,7 +43,7 @@ template <class _Tp,
_CUDA_VSTD::__enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up), int> = 0>
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp ceil_div(const _Tp __a, const _Up __b) noexcept
{
_LIBCUDACXX_DEBUG_ASSERT(__b > _Tp(0), "cuda::ceil_div: b must be positive");
_CCCL_ASSERT(__b > _Up(0), "cuda::ceil_div: b must be positive");
using _UCommon = _CUDA_VSTD::__make_unsigned_t<_CUDA_VSTD::__common_type_t<_Tp, _Up>>;
const auto __res = static_cast<_UCommon>(__a) / static_cast<_UCommon>(__b);
return static_cast<_Tp>(__res + (__res * static_cast<_UCommon>(__b) != static_cast<_UCommon>(__a)));
Expand All @@ -61,8 +60,8 @@ template <class _Tp,
_CUDA_VSTD::__enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up), int> = 0>
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp ceil_div(const _Tp __a, const _Up __b) noexcept
{
_LIBCUDACXX_DEBUG_ASSERT(__a >= _Tp(0), "cuda::ceil_div: a must be non negative");
_LIBCUDACXX_DEBUG_ASSERT(__b > _Tp(0), "cuda::ceil_div: b must be positive");
_CCCL_ASSERT(__a >= _Tp(0), "cuda::ceil_div: a must be non negative");
_CCCL_ASSERT(__b > _Up(0), "cuda::ceil_div: b must be positive");
using _UCommon = _CUDA_VSTD::__make_unsigned_t<_CUDA_VSTD::__common_type_t<_Tp, _Up>>;
// Due to the precondition `__a >= 0` we can safely cast to unsigned without danger of overflowing
return static_cast<_Tp>((static_cast<_UCommon>(__a) + static_cast<_UCommon>(__b) - 1) / static_cast<_UCommon>(__b));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,7 @@ class device_memory_resource
void deallocate(void* __ptr, const size_t, const size_t __alignment = default_cuda_malloc_alignment) const noexcept
{
// We need to ensure that the provided alignment matches the minimal provided alignment
_LIBCUDACXX_ASSERT(__is_valid_alignment(__alignment),
"Invalid alignment passed to device_memory_resource::deallocate.");
_CCCL_ASSERT(__is_valid_alignment(__alignment), "Invalid alignment passed to device_memory_resource::deallocate.");
_CCCL_ASSERT_CUDA_API(::cudaFree, "device_memory_resource::deallocate failed", __ptr);
(void) __alignment;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ class managed_memory_resource
constexpr managed_memory_resource(const unsigned int __flags = cudaMemAttachGlobal) noexcept
: __flags_(__flags & __available_flags)
{
_LIBCUDACXX_ASSERT(__flags_ == __flags, "Unexpected flags passed to managed_memory_resource");
_CCCL_ASSERT(__flags_ == __flags, "Unexpected flags passed to managed_memory_resource");
}

//! @brief Allocate CUDA unified memory of size at least \p __bytes.
Expand Down Expand Up @@ -80,8 +80,7 @@ class managed_memory_resource
void deallocate(void* __ptr, const size_t, const size_t __alignment = default_cuda_malloc_alignment) const noexcept
{
// We need to ensure that the provided alignment matches the minimal provided alignment
_LIBCUDACXX_ASSERT(__is_valid_alignment(__alignment),
"Invalid alignment passed to managed_memory_resource::deallocate.");
_CCCL_ASSERT(__is_valid_alignment(__alignment), "Invalid alignment passed to managed_memory_resource::deallocate.");
_CCCL_ASSERT_CUDA_API(::cudaFree, "managed_memory_resource::deallocate failed", __ptr);
(void) __alignment;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class pinned_memory_resource
constexpr pinned_memory_resource(const unsigned int __flags = cudaHostAllocDefault) noexcept
: __flags_(__flags & __available_flags)
{
_LIBCUDACXX_ASSERT(__flags_ == __flags, "Unexpected flags passed to pinned_memory_resource");
_CCCL_ASSERT(__flags_ == __flags, "Unexpected flags passed to pinned_memory_resource");
}

//! @brief Allocate host memory of size at least \p __bytes.
Expand Down Expand Up @@ -83,8 +83,7 @@ class pinned_memory_resource
deallocate(void* __ptr, const size_t, const size_t __alignment = default_cuda_malloc_host_alignment) const noexcept
{
// We need to ensure that the provided alignment matches the minimal provided alignment
_LIBCUDACXX_ASSERT(__is_valid_alignment(__alignment),
"Invalid alignment passed to pinned_memory_resource::deallocate.");
_CCCL_ASSERT(__is_valid_alignment(__alignment), "Invalid alignment passed to pinned_memory_resource::deallocate.");
_CCCL_ASSERT_CUDA_API(::cudaFreeHost, "pinned_memory_resource::deallocate failed", __ptr);
(void) __alignment;
}
Expand Down
9 changes: 5 additions & 4 deletions libcudacxx/include/cuda/annotated_ptr
Original file line number Diff line number Diff line change
Expand Up @@ -359,9 +359,10 @@ public:
_CCCL_HOST_DEVICE explicit annotated_ptr(pointer __p)
: __repr(__p)
{
NV_IF_TARGET(
NV_IS_DEVICE,
(_LIBCUDACXX_DEBUG_ASSERT((std::is_same<_Property, shared>::value && __isShared(__p) || __isGlobal(__p)), "");))
NV_IF_TARGET(NV_IS_DEVICE,
(_CCCL_ASSERT((std::is_same<_Property, access_property::shared>::value && __isShared((void*) __p))
|| __isGlobal((void*) __p),
"");))
}

template <typename _RuntimeProperty>
Expand All @@ -378,7 +379,7 @@ public:
|| std::is_same<_RuntimeProperty, access_property::persisting>::value
|| std::is_same<_RuntimeProperty, access_property>::value,
"This method requires RuntimeProperty=global|normal|streaming|persisting|access_property");
NV_IF_TARGET(NV_IS_DEVICE, (_LIBCUDACXX_DEBUG_ASSERT((__isGlobal(__p) == true), "");))
NV_IF_TARGET(NV_IS_DEVICE, (_CCCL_ASSERT((__isGlobal((void*) __p) == true), "");))
}

template <class _TTp, class _Prop>
Expand Down
12 changes: 6 additions & 6 deletions libcudacxx/include/cuda/barrier
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL
inline _CCCL_DEVICE void cp_async_bulk_global_to_shared(
void* __dest, const void* __src, _CUDA_VSTD::uint32_t __size, ::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
_LIBCUDACXX_DEBUG_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
_LIBCUDACXX_DEBUG_ASSERT(__isShared(__dest), "Destination must be shared memory address.");
_LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "Source must be global memory address.");
_CCCL_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
_CCCL_ASSERT(__isShared(__dest), "Destination must be shared memory address.");
_CCCL_ASSERT(__isGlobal(__src), "Source must be global memory address.");

_CUDA_VPTX::cp_async_bulk(
_CUDA_VPTX::space_cluster,
Expand All @@ -73,9 +73,9 @@ inline _CCCL_DEVICE void cp_async_bulk_global_to_shared(
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
inline _CCCL_DEVICE void cp_async_bulk_shared_to_global(void* __dest, const void* __src, _CUDA_VSTD::uint32_t __size)
{
_LIBCUDACXX_DEBUG_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
_LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__dest), "Destination must be global memory address.");
_LIBCUDACXX_DEBUG_ASSERT(__isShared(__src), "Source must be shared memory address.");
_CCCL_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
_CCCL_ASSERT(__isGlobal(__dest), "Destination must be global memory address.");
_CCCL_ASSERT(__isShared(__src), "Source must be shared memory address.");

_CUDA_VPTX::cp_async_bulk(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __dest, __src, __size);
}
Expand Down
3 changes: 1 addition & 2 deletions libcudacxx/include/cuda/std/__algorithm/clamp.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,14 @@
#endif // no system header

#include <cuda/std/__algorithm/comp.h>
#include <cuda/std/detail/libcxx/include/__assert>

_LIBCUDACXX_BEGIN_NAMESPACE_STD

template <class _Tp, class _Compare>
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 const _Tp&
clamp(const _Tp& __v, const _Tp& __lo, const _Tp& __hi, _Compare __comp)
{
_LIBCUDACXX_ASSERT(!__comp(__hi, __lo), "Bad bounds passed to std::clamp");
_CCCL_ASSERT(!__comp(__hi, __lo), "Bad bounds passed to std::clamp");
return __comp(__v, __lo) ? __lo : __comp(__hi, __v) ? __hi : __v;
}

Expand Down
5 changes: 2 additions & 3 deletions libcudacxx/include/cuda/std/__algorithm/comp_ref_type.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
#endif // no system header

#include <cuda/std/__utility/declval.h>
#include <cuda/std/detail/libcxx/include/__debug>

_LIBCUDACXX_BEGIN_NAMESPACE_STD

Expand Down Expand Up @@ -60,7 +59,7 @@ struct __debug_less
_CCCL_CONSTEXPR_CXX14 decltype((void) declval<_Compare&>()(declval<_LHS&>(), declval<_RHS&>()))
__do_compare_assert(int, _LHS& __l, _RHS& __r)
{
_LIBCUDACXX_DEBUG_ASSERT(!__comp_(__l, __r), "Comparator does not induce a strict weak ordering");
_CCCL_ASSERT(!__comp_(__l, __r), "Comparator does not induce a strict weak ordering");
(void) __l;
(void) __r;
}
Expand All @@ -72,7 +71,7 @@ struct __debug_less

// Pass the comparator by lvalue reference. Or in debug mode, using a
// debugging wrapper that stores a reference.
#ifdef _LIBCUDACXX_ENABLE_DEBUG_MODE
#ifdef _CCCL_ENABLE_DEBUG_MODE
template <class _Comp>
using __comp_ref_type = __debug_less<_Comp>;
#else
Expand Down
3 changes: 1 addition & 2 deletions libcudacxx/include/cuda/std/__algorithm/pop_heap.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
#include <cuda/std/__type_traits/is_copy_assignable.h>
#include <cuda/std/__type_traits/is_copy_constructible.h>
#include <cuda/std/__utility/move.h>
#include <cuda/std/detail/libcxx/include/__assert>

_LIBCUDACXX_BEGIN_NAMESPACE_STD

Expand All @@ -41,7 +40,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 void __pop_heap(
typename iterator_traits<_RandomAccessIterator>::difference_type __len)
{
// Calling `pop_heap` on an empty range is undefined behavior, but in practice it will be a no-op.
_LIBCUDACXX_ASSERT(__len > 0, "The heap given to pop_heap must be non-empty");
_CCCL_ASSERT(__len > 0, "The heap given to pop_heap must be non-empty");

__comp_ref_type<_Compare> __comp_ref = __comp;

Expand Down
3 changes: 1 addition & 2 deletions libcudacxx/include/cuda/std/__algorithm/sift_down.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
#include <cuda/std/__algorithm/iterator_operations.h>
#include <cuda/std/__iterator/iterator_traits.h>
#include <cuda/std/__utility/move.h>
#include <cuda/std/detail/libcxx/include/__assert>

_LIBCUDACXX_BEGIN_NAMESPACE_STD

Expand Down Expand Up @@ -99,7 +98,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _RandomAccessIterator __floyd_si
typename iterator_traits<_RandomAccessIterator>::difference_type __len)
{
using difference_type = typename iterator_traits<_RandomAccessIterator>::difference_type;
_LIBCUDACXX_ASSERT(__len >= 2, "shouldn't be called unless __len >= 2");
_CCCL_ASSERT(__len >= 2, "shouldn't be called unless __len >= 2");

_RandomAccessIterator __hole = __first;
_RandomAccessIterator __child_i = __first;
Expand Down
4 changes: 0 additions & 4 deletions libcudacxx/include/cuda/std/__algorithm/unwrap_iter.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,6 @@ struct __unwrap_iter_impl
}
};

#ifndef _LIBCUDACXX_ENABLE_DEBUG_MODE

// It's a contiguous iterator, so we can use a raw pointer instead
template <class _Iter>
struct __unwrap_iter_impl<_Iter, true>
Expand All @@ -71,8 +69,6 @@ struct __unwrap_iter_impl<_Iter, true>
}
};

#endif // !_LIBCUDACXX_ENABLE_DEBUG_MODE

template <class _Iter,
class _Impl = __unwrap_iter_impl<_Iter>,
__enable_if_t<is_copy_constructible<_Iter>::value, int> = 0>
Expand Down
Loading

0 comments on commit 3eee9b2

Please sign in to comment.