Unify assert handling in cccl #2382

Open. Wants to merge 20 commits into base: main.
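Reviewer summary: every `_LIBCUDACXX_ASSERT` and `_LIBCUDACXX_DEBUG_ASSERT` below becomes `_CCCL_ASSERT`, and includes of the legacy `<cuda/std/detail/libcxx/include/__assert>` / `__debug` headers are replaced with `<cuda/std/__cccl/assert.h>`. As a rough mental model only, with hypothetical names rather than CCCL's actual definition, a unified host/device assert macro has this shape:

```cpp
// Hypothetical sketch of a unified assert macro.
// Names are illustrative; this is NOT the real _CCCL_ASSERT definition.
#include <cstdio>
#include <cstdlib>

#ifdef UNIFIED_ASSERTS_ENABLED // stand-in for the real opt-in switch
#  define UNIFIED_ASSERT(expr, msg)                                  \
    ((expr) ? (void) 0                                               \
            : (::std::fprintf(stderr, "%s:%d: '%s' failed: %s\n",    \
                              __FILE__, __LINE__, #expr, msg),       \
               ::std::abort()))
#else
#  define UNIFIED_ASSERT(expr, msg) ((void) 0) // compiled out when disabled
#endif
```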
12 changes: 6 additions & 6 deletions cub/cub/device/dispatch/dispatch_transform.cuh
@@ -168,7 +168,7 @@ template <typename Integral>
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr auto round_up_to_po2_multiple(Integral x, Integral mult) -> Integral
{
#if _CCCL_STD_VER > 2011
-  _LIBCUDACXX_ASSERT(::cuda::std::has_single_bit(static_cast<::cuda::std::__make_unsigned_t<Integral>>(mult)), "");
+  _CCCL_ASSERT(::cuda::std::has_single_bit(static_cast<::cuda::std::__make_unsigned_t<Integral>>(mult)), "");
#endif // _CCCL_STD_VER > 2011
return (x + mult - 1) & ~(mult - 1);
}
@@ -177,7 +177,7 @@ template <typename T>
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE const char* round_down_ptr(const T* ptr, unsigned alignment)
{
#if _CCCL_STD_VER > 2011
-  _LIBCUDACXX_ASSERT(::cuda::std::has_single_bit(alignment), "");
+  _CCCL_ASSERT(::cuda::std::has_single_bit(alignment), "");
#endif // _CCCL_STD_VER > 2011
return reinterpret_cast<const char*>(
reinterpret_cast<::cuda::std::uintptr_t>(ptr) & ~::cuda::std::uintptr_t{alignment - 1});
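Reviewer note: both helpers here lean on the same power-of-two mask trick, which is exactly what the `has_single_bit` assertions guard. A standalone sketch, with the CUDA annotations dropped:

```cpp
#include <cstdint>

// Round x up to the next multiple of mult; valid only when mult is a
// power of two, hence the has_single_bit precondition above.
constexpr unsigned round_up_to_po2_multiple(unsigned x, unsigned mult)
{
  return (x + mult - 1) & ~(mult - 1);
}

// Round a pointer down to an alignment boundary by masking its low bits.
inline const char* round_down_ptr(const void* ptr, unsigned alignment)
{
  return reinterpret_cast<const char*>(
    reinterpret_cast<std::uintptr_t>(ptr) & ~std::uintptr_t{alignment - 1});
}

static_assert(round_up_to_po2_multiple(13u, 16u) == 16u, "");
static_assert(round_up_to_po2_multiple(32u, 16u) == 32u, "");
// With mult = 12 (not a power of two) the mask trick silently returns a
// wrong result, which is why the precondition is asserted rather than fixed.
```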
@@ -278,8 +278,8 @@ _CCCL_DEVICE void bulk_copy_tile(

const char* src = aligned_ptr.ptr + global_offset * sizeof(T);
char* dst = smem + smem_offset;
-  _LIBCUDACXX_ASSERT(reinterpret_cast<uintptr_t>(src) % bulk_copy_alignment == 0, "");
-  _LIBCUDACXX_ASSERT(reinterpret_cast<uintptr_t>(dst) % bulk_copy_alignment == 0, "");
+  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(src) % bulk_copy_alignment == 0, "");
+  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(dst) % bulk_copy_alignment == 0, "");

// TODO(bgruber): we could precompute bytes_to_copy on the host
const int bytes_to_copy = round_up_to_po2_multiple(
@@ -303,8 +303,8 @@ _CCCL_DEVICE void bulk_copy_tile_fallback(
{
const T* src = aligned_ptr.ptr_to_elements() + global_offset;
T* dst = reinterpret_cast<T*>(smem + smem_offset + aligned_ptr.head_padding);
-  _LIBCUDACXX_ASSERT(reinterpret_cast<uintptr_t>(src) % alignof(T) == 0, "");
-  _LIBCUDACXX_ASSERT(reinterpret_cast<uintptr_t>(dst) % alignof(T) == 0, "");
+  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(src) % alignof(T) == 0, "");
+  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(dst) % alignof(T) == 0, "");

const int bytes_to_copy = static_cast<int>(sizeof(T)) * tile_size;
cooperative_groups::memcpy_async(cooperative_groups::this_thread_block(), dst, src, bytes_to_copy);
4 changes: 2 additions & 2 deletions cudax/include/cuda/experimental/__async/stop_token.cuh
@@ -303,8 +303,8 @@ _CCCL_HOST_DEVICE inline void __inplace_stop_callback_base::__register_callback(

_CCCL_HOST_DEVICE inline inplace_stop_source::~inplace_stop_source()
{
-  _LIBCUDACXX_ASSERT((__state_.load(_CUDA_VSTD::memory_order_relaxed) & __locked_flag) == 0, "");
-  _LIBCUDACXX_ASSERT(__callbacks_ == nullptr, "");
+  _CCCL_ASSERT((__state_.load(_CUDA_VSTD::memory_order_relaxed) & __locked_flag) == 0, "");
+  _CCCL_ASSERT(__callbacks_ == nullptr, "");
}

_CCCL_HOST_DEVICE inline auto inplace_stop_source::request_stop() noexcept -> bool
8 changes: 4 additions & 4 deletions cudax/include/cuda/experimental/__async/variant.cuh
@@ -139,7 +139,7 @@ public:
{
// make this local in case destroying the sub-object destroys *this
const auto index = __self.__index_;
-  _LIBCUDACXX_ASSERT(index != __npos, "");
+  _CCCL_ASSERT(index != __npos, "");
((_Idx == index
? static_cast<_Fn&&>(__fn)(static_cast<_As&&>(__as)..., static_cast<_Self&&>(__self).template __get<_Idx>())
: void()),
@@ -149,21 +149,21 @@ public:
template <size_t _Ny>
_CCCL_HOST_DEVICE __at<_Ny>&& __get() && noexcept
{
-    _LIBCUDACXX_ASSERT(_Ny == __index_, "");
+    _CCCL_ASSERT(_Ny == __index_, "");
return static_cast<__at<_Ny>&&>(*static_cast<__at<_Ny>*>(__ptr()));
}

template <size_t _Ny>
_CCCL_HOST_DEVICE __at<_Ny>& __get() & noexcept
{
-    _LIBCUDACXX_ASSERT(_Ny == __index_, "");
+    _CCCL_ASSERT(_Ny == __index_, "");
return *static_cast<__at<_Ny>*>(__ptr());
}

template <size_t _Ny>
_CCCL_HOST_DEVICE const __at<_Ny>& __get() const& noexcept
{
-    _LIBCUDACXX_ASSERT(_Ny == __index_, "");
+    _CCCL_ASSERT(_Ny == __index_, "");
return *static_cast<const __at<_Ny>*>(__ptr());
}
};
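Reviewer aside: these assertions are the device-friendly analogue of the index check `std::get` performs on `std::variant`; device code cannot throw `bad_variant_access`, so the precondition is asserted instead. A host-side comparison (illustrative only):

```cpp
#include <cassert>
#include <variant>

int main()
{
  std::variant<int, float> v = 3.5f; // holds the float alternative
  assert(v.index() == 1);
  assert(std::get<1>(v) == 3.5f); // checked access: a wrong index would throw
}
```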
@@ -151,7 +151,7 @@ public:
nullptr, _CUDA_VSTD::exchange(__other.__static_vtable, nullptr))
, __vtable(__other)
{
-    _LIBCUDACXX_ASSERT(this->__static_vtable != nullptr, "copying from a moved-from object");
+    _CCCL_ASSERT(this->__static_vtable != nullptr, "copying from a moved-from object");
this->__static_vtable->__move_fn(&this->__object, &__other.__object);
}

@@ -162,7 +162,7 @@ public:
nullptr, _CUDA_VSTD::exchange(__other.__static_vtable, nullptr))
, __vtable(__other)
{
-    _LIBCUDACXX_ASSERT(this->__static_vtable != nullptr, "copying from a moved-from object");
+    _CCCL_ASSERT(this->__static_vtable != nullptr, "copying from a moved-from object");
this->__static_vtable->__move_fn(&this->__object, &__other.__object);
}

@@ -191,7 +191,7 @@ public:
: _CUDA_VMR::_Resource_base<_Alloc_type, _CUDA_VMR::_WrapperType::_Owning>(nullptr, __other.__static_vtable)
, __vtable(__other)
{
-    _LIBCUDACXX_ASSERT(this->__static_vtable != nullptr, "copying from a moved-from object");
+    _CCCL_ASSERT(this->__static_vtable != nullptr, "copying from a moved-from object");
this->__static_vtable->__copy_fn(&this->__object, &__other.__object);
}

@@ -164,8 +164,7 @@ public:
//! properly synchronize all relevant streams before calling `deallocate`.
void deallocate(void* __ptr, const size_t, const size_t __alignment = _CUDA_VMR::default_cuda_malloc_alignment)
{
-    _LIBCUDACXX_ASSERT(__is_valid_alignment(__alignment),
-                       "Invalid alignment passed to async_memory_resource::deallocate.");
+    _CCCL_ASSERT(__is_valid_alignment(__alignment), "Invalid alignment passed to async_memory_resource::deallocate.");
_CCCL_ASSERT_CUDA_API(
::cudaFreeAsync, "async_memory_resource::deallocate failed", __ptr, __async_memory_resource_sync_stream().get());
__async_memory_resource_sync_stream().wait();
@@ -221,8 +220,7 @@ public:
void deallocate_async(void* __ptr, const size_t __bytes, const size_t __alignment, const ::cuda::stream_ref __stream)
{
// We need to ensure that the provided alignment matches the minimal provided alignment
-    _LIBCUDACXX_ASSERT(__is_valid_alignment(__alignment),
-                       "Invalid alignment passed to async_memory_resource::deallocate.");
+    _CCCL_ASSERT(__is_valid_alignment(__alignment), "Invalid alignment passed to async_memory_resource::deallocate.");
deallocate_async(__ptr, __bytes, __stream);
(void) __alignment;
}
8 changes: 4 additions & 4 deletions libcudacxx/include/cuda/__cmath/ceil_div.h
@@ -21,6 +21,7 @@
# pragma system_header
#endif // no system header

+#include <cuda/std/__cccl/assert.h>
#include <cuda/std/__type_traits/common_type.h>
#include <cuda/std/__type_traits/enable_if.h>
#include <cuda/std/__type_traits/is_enum.h>
@@ -29,7 +30,6 @@
#include <cuda/std/__type_traits/is_unsigned.h>
#include <cuda/std/__type_traits/make_unsigned.h>
#include <cuda/std/__type_traits/underlying_type.h>
-#include <cuda/std/detail/libcxx/include/__debug>

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

@@ -44,7 +44,7 @@ template <class _Tp,
_CUDA_VSTD::__enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up), int> = 0>
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp ceil_div(const _Tp __a, const _Up __b) noexcept
{
-  _LIBCUDACXX_DEBUG_ASSERT(__b > _Tp(0), "cuda::ceil_div: b must be positive");
+  _CCCL_ASSERT(__b > _Tp(0), "cuda::ceil_div: b must be positive");
using _UCommon = _CUDA_VSTD::__make_unsigned_t<_CUDA_VSTD::__common_type_t<_Tp, _Up>>;
const auto __res = static_cast<_UCommon>(__a) / static_cast<_UCommon>(__b);
return static_cast<_Tp>(__res + (__res * static_cast<_UCommon>(__b) != static_cast<_UCommon>(__a)));
@@ -61,8 +61,8 @@ template <class _Tp,
_CUDA_VSTD::__enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up), int> = 0>
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp ceil_div(const _Tp __a, const _Up __b) noexcept
{
-  _LIBCUDACXX_DEBUG_ASSERT(__a >= _Tp(0), "cuda::ceil_div: a must be non negative");
-  _LIBCUDACXX_DEBUG_ASSERT(__b > _Tp(0), "cuda::ceil_div: b must be positive");
+  _CCCL_ASSERT(__a >= _Tp(0), "cuda::ceil_div: a must be non negative");
+  _CCCL_ASSERT(__b > _Tp(0), "cuda::ceil_div: b must be positive");
using _UCommon = _CUDA_VSTD::__make_unsigned_t<_CUDA_VSTD::__common_type_t<_Tp, _Up>>;
// Due to the precondition `__a >= 0` we can safely cast to unsigned without danger of overflowing
return static_cast<_Tp>((static_cast<_UCommon>(__a) + static_cast<_UCommon>(__b) - 1) / static_cast<_UCommon>(__b));
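Reviewer note: the two overloads deliberately use different formulas. The signed overload divides first and rounds up afterwards, so it never forms `a + b - 1`; the unsigned overload can use the classic form after widening to the unsigned common type. A standalone sketch of both strategies (C++14):

```cpp
// Divide-then-fix-up: never computes a + b - 1, so no overflow risk.
constexpr unsigned ceil_div_safe(unsigned a, unsigned b)
{
  const unsigned q = a / b;
  return q + (q * b != a); // add 1 iff the division truncated
}

// Classic form: shorter, but a + b - 1 can wrap when a is near the max.
constexpr unsigned ceil_div_classic(unsigned a, unsigned b)
{
  return (a + b - 1) / b;
}

static_assert(ceil_div_safe(7u, 2u) == 4u, "");
static_assert(ceil_div_safe(8u, 2u) == 4u, "");
static_assert(ceil_div_classic(7u, 2u) == 4u, "");
```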
@@ -85,8 +85,7 @@ class device_memory_resource
void deallocate(void* __ptr, const size_t, const size_t __alignment = default_cuda_malloc_alignment) const noexcept
{
// We need to ensure that the provided alignment matches the minimal provided alignment
-    _LIBCUDACXX_ASSERT(__is_valid_alignment(__alignment),
-                       "Invalid alignment passed to device_memory_resource::deallocate.");
+    _CCCL_ASSERT(__is_valid_alignment(__alignment), "Invalid alignment passed to device_memory_resource::deallocate.");
_CCCL_ASSERT_CUDA_API(::cudaFree, "device_memory_resource::deallocate failed", __ptr);
(void) __alignment;
}
@@ -51,7 +51,7 @@ class managed_memory_resource
constexpr managed_memory_resource(const unsigned int __flags = cudaMemAttachGlobal) noexcept
: __flags_(__flags & __available_flags)
{
-    _LIBCUDACXX_ASSERT(__flags_ == __flags, "Unexpected flags passed to managed_memory_resource");
+    _CCCL_ASSERT(__flags_ == __flags, "Unexpected flags passed to managed_memory_resource");
}

//! @brief Allocate CUDA unified memory of size at least \p __bytes.
@@ -80,8 +80,7 @@ class managed_memory_resource
void deallocate(void* __ptr, const size_t, const size_t __alignment = default_cuda_malloc_alignment) const noexcept
{
// We need to ensure that the provided alignment matches the minimal provided alignment
-    _LIBCUDACXX_ASSERT(__is_valid_alignment(__alignment),
-                       "Invalid alignment passed to managed_memory_resource::deallocate.");
+    _CCCL_ASSERT(__is_valid_alignment(__alignment), "Invalid alignment passed to managed_memory_resource::deallocate.");
_CCCL_ASSERT_CUDA_API(::cudaFree, "managed_memory_resource::deallocate failed", __ptr);
(void) __alignment;
}
@@ -53,7 +53,7 @@ class pinned_memory_resource
constexpr pinned_memory_resource(const unsigned int __flags = cudaHostAllocDefault) noexcept
: __flags_(__flags & __available_flags)
{
-    _LIBCUDACXX_ASSERT(__flags_ == __flags, "Unexpected flags passed to pinned_memory_resource");
+    _CCCL_ASSERT(__flags_ == __flags, "Unexpected flags passed to pinned_memory_resource");
}

//! @brief Allocate host memory of size at least \p __bytes.
@@ -83,8 +83,7 @@
deallocate(void* __ptr, const size_t, const size_t __alignment = default_cuda_malloc_host_alignment) const noexcept
{
// We need to ensure that the provided alignment matches the minimal provided alignment
-    _LIBCUDACXX_ASSERT(__is_valid_alignment(__alignment),
-                       "Invalid alignment passed to pinned_memory_resource::deallocate.");
+    _CCCL_ASSERT(__is_valid_alignment(__alignment), "Invalid alignment passed to pinned_memory_resource::deallocate.");
_CCCL_ASSERT_CUDA_API(::cudaFreeHost, "pinned_memory_resource::deallocate failed", __ptr);
(void) __alignment;
}
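Reviewer aside: all of these `deallocate` overloads assert the same `__is_valid_alignment` precondition. A plausible shape for that helper, as an assumption on my part rather than the actual member definition:

```cpp
#include <cstddef>

// Hypothetical stand-in for __is_valid_alignment: cudaMalloc-family
// allocations guarantee one fixed alignment (256 bytes), so a request is
// valid iff it is a nonzero power of two no larger than that guarantee.
constexpr std::size_t default_cuda_malloc_alignment = 256;

constexpr bool is_valid_alignment(std::size_t alignment)
{
  return alignment != 0 && alignment <= default_cuda_malloc_alignment
      && (alignment & (alignment - 1)) == 0; // power of two
}
```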
10 changes: 6 additions & 4 deletions libcudacxx/include/cuda/annotated_ptr
@@ -144,6 +144,7 @@

#include <cuda/barrier>
#include <cuda/discard_memory>
+#include <cuda/std/__cccl/assert.h> // all public C++ headers provide the assertion handler
#include <cuda/std/cstdint>
#include <cuda/std/detail/__access_property>

@@ -359,9 +360,10 @@ public:
_CCCL_HOST_DEVICE explicit annotated_ptr(pointer __p)
: __repr(__p)
{
-    NV_IF_TARGET(
-      NV_IS_DEVICE,
-      (_LIBCUDACXX_DEBUG_ASSERT((std::is_same<_Property, shared>::value && __isShared(__p) || __isGlobal(__p)), "");))
+    NV_IF_TARGET(NV_IS_DEVICE,
+                 (_CCCL_ASSERT((std::is_same<_Property, access_property::shared>::value && __isShared((void*) __p))
+                                 || __isGlobal((void*) __p),
+                               "");))
}
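For context, this constructor's device-side assertion typically fires (or not) in code like the following kernel sketch; `cuda::annotated_ptr` and `cuda::access_property` are the public names, the kernel itself is illustrative:

```cpp
#include <cuda/annotated_ptr>

__global__ void scale(const float* in, float* out, float s)
{
  // The wrapped pointer must be a global-space address; the device-side
  // _CCCL_ASSERT in the constructor verifies exactly that.
  cuda::annotated_ptr<const float, cuda::access_property::persisting> p{in};
  out[threadIdx.x] = p[threadIdx.x] * s;
}
```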

template <typename _RuntimeProperty>
@@ -378,7 +380,7 @@ public:
|| std::is_same<_RuntimeProperty, access_property::persisting>::value
|| std::is_same<_RuntimeProperty, access_property>::value,
"This method requires RuntimeProperty=global|normal|streaming|persisting|access_property");
-    NV_IF_TARGET(NV_IS_DEVICE, (_LIBCUDACXX_DEBUG_ASSERT((__isGlobal(__p) == true), "");))
+    NV_IF_TARGET(NV_IS_DEVICE, (_CCCL_ASSERT((__isGlobal((void*) __p) == true), "");))
}

template <class _TTp, class _Prop>
12 changes: 6 additions & 6 deletions libcudacxx/include/cuda/barrier
@@ -57,9 +57,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL
inline _CCCL_DEVICE void cp_async_bulk_global_to_shared(
void* __dest, const void* __src, _CUDA_VSTD::uint32_t __size, ::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
-  _LIBCUDACXX_DEBUG_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
-  _LIBCUDACXX_DEBUG_ASSERT(__isShared(__dest), "Destination must be shared memory address.");
-  _LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "Source must be global memory address.");
+  _CCCL_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
+  _CCCL_ASSERT(__isShared(__dest), "Destination must be shared memory address.");
+  _CCCL_ASSERT(__isGlobal(__src), "Source must be global memory address.");

_CUDA_VPTX::cp_async_bulk(
_CUDA_VPTX::space_cluster,
@@ -73,9 +73,9 @@ inline _CCCL_DEVICE void cp_async_bulk_global_to_shared(
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
inline _CCCL_DEVICE void cp_async_bulk_shared_to_global(void* __dest, const void* __src, _CUDA_VSTD::uint32_t __size)
{
-  _LIBCUDACXX_DEBUG_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
-  _LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__dest), "Destination must be global memory address.");
-  _LIBCUDACXX_DEBUG_ASSERT(__isShared(__src), "Source must be shared memory address.");
+  _CCCL_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
+  _CCCL_ASSERT(__isGlobal(__dest), "Destination must be global memory address.");
+  _CCCL_ASSERT(__isShared(__src), "Source must be shared memory address.");

_CUDA_VPTX::cp_async_bulk(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __dest, __src, __size);
}
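For context, the asserted preconditions (size a multiple of 16, destination in shared memory, source in global memory) come from the `cp.async.bulk` instruction itself. A usage sketch following the documented pattern, assuming an sm_90+ target:

```cpp
#include <cuda/barrier>
#include <cuda/std/utility> // cuda::std::move

using barrier_t = cuda::barrier<cuda::thread_scope_block>;
namespace cde   = cuda::device::experimental;

__global__ void bulk_load(const int* gmem)
{
  __shared__ alignas(16) int smem[1024]; // sizeof(smem) is a multiple of 16
  __shared__ barrier_t bar;
  if (threadIdx.x == 0)
  {
    init(&bar, blockDim.x);
  }
  __syncthreads();

  barrier_t::arrival_token token;
  if (threadIdx.x == 0)
  {
    // Shared-memory destination, global-memory source: the asserted
    // preconditions hold by construction here.
    cde::cp_async_bulk_global_to_shared(smem, gmem, sizeof(smem), bar);
    token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem));
  }
  else
  {
    token = bar.arrive();
  }
  bar.wait(cuda::std::move(token));
  // smem is now populated for the whole block.
}
```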
4 changes: 2 additions & 2 deletions libcudacxx/include/cuda/std/__algorithm/clamp.h
@@ -21,15 +21,15 @@
#endif // no system header

#include <cuda/std/__algorithm/comp.h>
-#include <cuda/std/detail/libcxx/include/__assert>
+#include <cuda/std/__cccl/assert.h>

_LIBCUDACXX_BEGIN_NAMESPACE_STD

template <class _Tp, class _Compare>
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 const _Tp&
clamp(const _Tp& __v, const _Tp& __lo, const _Tp& __hi, _Compare __comp)
{
-  _LIBCUDACXX_ASSERT(!__comp(__hi, __lo), "Bad bounds passed to std::clamp");
+  _CCCL_ASSERT(!__comp(__hi, __lo), "Bad bounds passed to std::clamp");
return __comp(__v, __lo) ? __lo : __comp(__hi, __v) ? __hi : __v;
}
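Reviewer aside: the precondition matters because `clamp` with inverted bounds returns a value satisfying neither bound. A host-side illustration with the standard algorithm (C++17):

```cpp
#include <algorithm>
#include <cassert>

int main()
{
  assert(std::clamp(5, 1, 10) == 5);   // inside the range: unchanged
  assert(std::clamp(-3, 1, 10) == 1);  // below: clamped to lo
  assert(std::clamp(42, 1, 10) == 10); // above: clamped to hi
  // std::clamp(5, 10, 1) violates the hi >= lo precondition; that is the
  // case the _CCCL_ASSERT above catches in debug builds.
}
```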

6 changes: 3 additions & 3 deletions libcudacxx/include/cuda/std/__algorithm/comp_ref_type.h
@@ -20,8 +20,8 @@
# pragma system_header
#endif // no system header

+#include <cuda/std/__cccl/assert.h> // all public C++ headers provide the assertion handler
#include <cuda/std/__utility/declval.h>
-#include <cuda/std/detail/libcxx/include/__debug>

_LIBCUDACXX_BEGIN_NAMESPACE_STD

@@ -60,7 +60,7 @@ struct __debug_less
_CCCL_CONSTEXPR_CXX14 decltype((void) declval<_Compare&>()(declval<_LHS&>(), declval<_RHS&>()))
__do_compare_assert(int, _LHS& __l, _RHS& __r)
{
-    _LIBCUDACXX_DEBUG_ASSERT(!__comp_(__l, __r), "Comparator does not induce a strict weak ordering");
+    _CCCL_ASSERT(!__comp_(__l, __r), "Comparator does not induce a strict weak ordering");
(void) __l;
(void) __r;
}
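Reviewer aside: `__debug_less` calls the comparator with swapped arguments after a successful compare and asserts that the reverse compare fails. The classic bug it catches is a comparator built on `<=`, which is not a strict weak ordering:

```cpp
#include <algorithm>
#include <vector>

// BAD: for equal elements cmp(a, b) and cmp(b, a) are both true, so this
// is not a strict weak ordering. Sorting with it is undefined behavior; a
// debug wrapper like __debug_less turns that into a clean assertion failure.
struct BadLess
{
  bool operator()(int a, int b) const { return a <= b; }
};

int main()
{
  std::vector<int> v{3, 1, 3, 2, 3};
  std::sort(v.begin(), v.end(), BadLess{}); // flagged in debug mode
}
```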
@@ -72,7 +72,7 @@ struct __debug_less

// Pass the comparator by lvalue reference. Or in debug mode, using a
// debugging wrapper that stores a reference.
-#ifdef _LIBCUDACXX_ENABLE_DEBUG_MODE
+#ifdef _CCCL_ENABLE_DEBUG_MODE
template <class _Comp>
using __comp_ref_type = __debug_less<_Comp>;
#else
4 changes: 2 additions & 2 deletions libcudacxx/include/cuda/std/__algorithm/pop_heap.h
@@ -25,11 +25,11 @@
#include <cuda/std/__algorithm/iterator_operations.h>
#include <cuda/std/__algorithm/push_heap.h>
#include <cuda/std/__algorithm/sift_down.h>
+#include <cuda/std/__cccl/assert.h>
#include <cuda/std/__iterator/iterator_traits.h>
#include <cuda/std/__type_traits/is_copy_assignable.h>
#include <cuda/std/__type_traits/is_copy_constructible.h>
#include <cuda/std/__utility/move.h>
-#include <cuda/std/detail/libcxx/include/__assert>

_LIBCUDACXX_BEGIN_NAMESPACE_STD

@@ -41,7 +41,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 void __pop_heap(
typename iterator_traits<_RandomAccessIterator>::difference_type __len)
{
// Calling `pop_heap` on an empty range is undefined behavior, but in practice it will be a no-op.
-  _LIBCUDACXX_ASSERT(__len > 0, "The heap given to pop_heap must be non-empty");
+  _CCCL_ASSERT(__len > 0, "The heap given to pop_heap must be non-empty");

__comp_ref_type<_Compare> __comp_ref = __comp;

4 changes: 2 additions & 2 deletions libcudacxx/include/cuda/std/__algorithm/sift_down.h
@@ -21,9 +21,9 @@
#endif // no system header

#include <cuda/std/__algorithm/iterator_operations.h>
+#include <cuda/std/__cccl/assert.h>
#include <cuda/std/__iterator/iterator_traits.h>
#include <cuda/std/__utility/move.h>
-#include <cuda/std/detail/libcxx/include/__assert>

_LIBCUDACXX_BEGIN_NAMESPACE_STD

@@ -99,7 +99,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _RandomAccessIterator __floyd_si
typename iterator_traits<_RandomAccessIterator>::difference_type __len)
{
using difference_type = typename iterator_traits<_RandomAccessIterator>::difference_type;
-  _LIBCUDACXX_ASSERT(__len >= 2, "shouldn't be called unless __len >= 2");
+  _CCCL_ASSERT(__len >= 2, "shouldn't be called unless __len >= 2");

_RandomAccessIterator __hole = __first;
_RandomAccessIterator __child_i = __first;
4 changes: 0 additions & 4 deletions libcudacxx/include/cuda/std/__algorithm/unwrap_iter.h
@@ -52,8 +52,6 @@ struct __unwrap_iter_impl
}
};

-#ifndef _LIBCUDACXX_ENABLE_DEBUG_MODE

// It's a contiguous iterator, so we can use a raw pointer instead
template <class _Iter>
struct __unwrap_iter_impl<_Iter, true>
Expand All @@ -71,8 +69,6 @@ struct __unwrap_iter_impl<_Iter, true>
}
};

-#endif // !_LIBCUDACXX_ENABLE_DEBUG_MODE

template <class _Iter,
class _Impl = __unwrap_iter_impl<_Iter>,
__enable_if_t<is_copy_constructible<_Iter>::value, int> = 0>