diff --git a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh new file mode 100644 index 0000000000..eea30c1b69 --- /dev/null +++ b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh @@ -0,0 +1,258 @@ +//===----------------------------------------------------------------------===// +// +// Part of the CUDA Toolkit, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __CUDAX__CONTAINERS_UNINITIALIZED_ASYNC_BUFFER_H +#define __CUDAX__CONTAINERS_UNINITIALIZED_ASYNC_BUFFER_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#if _CCCL_STD_VER >= 2014 && !defined(_CCCL_COMPILER_MSVC_2017) \ + && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) + +//! @file +//! The \c uninitialized_async_buffer class provides a typed buffer allocated in stream-order from a given memory +//! resource. +namespace cuda::experimental +{ + +//! @rst +//! .. _cudax-containers-uninitialized-async-buffer: +//! +//! Uninitialized stream-ordered type-safe memory storage +//! ------------------------------------------------------ +//! +//! ``uninitialized_async_buffer`` provides a typed buffer allocated in stream order from a given :ref:`async memory +//! resource `. It handles alignment and release of the allocation. +//! The memory is uninitialized, so that a user needs to ensure elements are properly constructed. +//! +//! In addition to being type safe, ``uninitialized_async_buffer`` also takes a set of :ref:`properties +//! ` to ensure that e.g. execution space constraints are checked +//! at compile time. However, only stateless properties can be forwarded. To use a stateful property, +//! implement :ref:`get_property(const uninitialized_async_buffer&, Property) +//! `. +//! +//! .. warning:: +//! +//! ``uninitialized_async_buffer`` uses `stream-ordered allocation +//! `__. It is the user's +//! resposibility to ensure the lifetime of both the provided async resource and the stream exceed the lifetime of +//! the buffer. +//! +//! @endrst +//! @tparam _T the type to be stored in the buffer +//! @tparam _Properties... The properties the allocated memory satisfies +template +class uninitialized_async_buffer +{ +private: + ::cuda::experimental::mr::async_any_resource<_Properties...> __mr_; + ::cuda::stream_ref __stream_ = {}; + size_t __count_ = 0; + void* __buf_ = nullptr; + + //! @brief Determines the allocation size given the alignment and size of `T` + _CCCL_NODISCARD static constexpr size_t __get_allocation_size(const size_t __count) noexcept + { + constexpr size_t __alignment = alignof(_Tp); + return (__count * sizeof(_Tp) + (__alignment - 1)) & ~(__alignment - 1); + } + + //! @brief Determines the properly aligned start of the buffer given the alignment and size of `T` + _CCCL_NODISCARD constexpr _Tp* __get_data() const noexcept + { + constexpr size_t __alignment = alignof(_Tp); + size_t __space = __get_allocation_size(__count_); + void* __ptr = __buf_; + return _CUDA_VSTD::launder( + reinterpret_cast<_Tp*>(_CUDA_VSTD::align(__alignment, __count_ * sizeof(_Tp), __ptr, __space))); + } + + //! @brief Causes the buffer to be treated as a span when passed to cudax::launch. + //! @pre The buffer must have the cuda::mr::device_accessible property. + _CCCL_NODISCARD_FRIEND _CUDA_VSTD::span<_Tp> + __cudax_launch_transform(::cuda::stream_ref, uninitialized_async_buffer& __self) noexcept + { + static_assert(_CUDA_VSTD::_One_of<_CUDA_VMR::device_accessible, _Properties...>, + "The buffer must be device accessible to be passed to `launch`"); + return {__self.__get_data(), __self.size()}; + } + + //! @brief Causes the buffer to be treated as a span when passed to cudax::launch + //! @pre The buffer must have the cuda::mr::device_accessible property. + _CCCL_NODISCARD_FRIEND _CUDA_VSTD::span + __cudax_launch_transform(::cuda::stream_ref, const uninitialized_async_buffer& __self) noexcept + { + static_assert(_CUDA_VSTD::_One_of<_CUDA_VMR::device_accessible, _Properties...>, + "The buffer must be device accessible to be passed to `launch`"); + return {__self.__get_data(), __self.size()}; + } + +public: + using value_type = _Tp; + using reference = _Tp&; + using pointer = _Tp*; + using size_type = size_t; + + //! @brief Constructs an \c uninitialized_async_buffer, allocating sufficient storage for \p __count elements using + //! \p __mr + //! @param __mr The async memory resource to allocate the buffer with. + //! @param __stream The CUDA stream used for stream-ordered allocation. + //! @param __count The desired size of the buffer. + //! @note Depending on the alignment requirements of `T` the size of the underlying allocation might be larger + //! than `count * sizeof(T)`. Only allocates memory when \p __count > 0 + uninitialized_async_buffer(::cuda::experimental::mr::async_any_resource<_Properties...> __mr, + const ::cuda::stream_ref __stream, + const size_t __count) + : __mr_(_CUDA_VSTD::move(__mr)) + , __stream_(__stream) + , __count_(__count) + , __buf_(__count_ == 0 ? nullptr : __mr_.allocate_async(__get_allocation_size(__count_), __stream_)) + {} + + uninitialized_async_buffer(const uninitialized_async_buffer&) = delete; + uninitialized_async_buffer& operator=(const uninitialized_async_buffer&) = delete; + + //! @brief Move construction + //! @param __other Another \c uninitialized_async_buffer + uninitialized_async_buffer(uninitialized_async_buffer&& __other) noexcept + : __mr_(_CUDA_VSTD::move(__other.__mr_)) + , __stream_(_CUDA_VSTD::exchange(__other.__stream_, {})) + , __count_(_CUDA_VSTD::exchange(__other.__count_, 0)) + , __buf_(_CUDA_VSTD::exchange(__other.__buf_, nullptr)) + {} + + //! @brief Move assignment + //! @param __other Another \c uninitialized_async_buffer + uninitialized_async_buffer& operator=(uninitialized_async_buffer&& __other) noexcept + { + if (this == _CUDA_VSTD::addressof(__other)) + { + return *this; + } + + if (__buf_) + { + __mr_.deallocate_async(__buf_, __get_allocation_size(__count_), __stream_); + } + __mr_ = __other.__mr_; + __stream_ = _CUDA_VSTD::exchange(__other.__stream_, {}); + __count_ = _CUDA_VSTD::exchange(__other.__count_, 0); + __buf_ = _CUDA_VSTD::exchange(__other.__buf_, nullptr); + return *this; + } + //! @brief Destroys an \c uninitialized_async_buffer and deallocates the buffer in stream order on the stream that was + //! used to create the buffer. + //! @warning The destructor does not destroy any objects that may or may not reside within the buffer. It is the + //! user's responsibility to ensure that all objects within the buffer have been properly destroyed. + ~uninitialized_async_buffer() + { + if (__buf_) + { + __mr_.deallocate_async(__buf_, __get_allocation_size(__count_), __stream_); + } + } + + //! @brief Returns an aligned pointer to the buffer + _CCCL_NODISCARD constexpr pointer begin() const noexcept + { + return __get_data(); + } + + //! @brief Returns an aligned pointer to the element following the last element of the buffer. + //! This element acts as a placeholder; attempting to access it results in undefined behavior. + _CCCL_NODISCARD constexpr pointer end() const noexcept + { + return __get_data() + __count_; + } + + //! @brief Returns an aligned pointer to the buffer + _CCCL_NODISCARD constexpr pointer data() const noexcept + { + return __get_data(); + } + + //! @brief Returns the size of the buffer + _CCCL_NODISCARD constexpr size_t size() const noexcept + { + return __count_; + } + + //! @rst + //! Returns an :ref:`asnyc_resource_ref ` to the resource used + //! to allocate the buffer + //! @endrst + _CCCL_NODISCARD _CUDA_VMR::async_resource_ref<_Properties...> get_resource() const noexcept + { + return _CUDA_VMR::async_resource_ref<_Properties...>{const_cast(this)->__mr_}; + } + + //! @brief Returns the stored stream + _CCCL_NODISCARD constexpr ::cuda::stream_ref get_stream() const noexcept + { + return __stream_; + } + + //! @brief Replaces the stored stream + //! @param __new_stream the new stream + //! @note Always synchronizes with the old stream + constexpr void change_stream(::cuda::stream_ref __new_stream) + { + if (__new_stream != __stream_) + { + __stream_.wait(); + } + __stream_ = __new_stream; + } + + //! @brief Swaps the contents with those of another \c uninitialized_async_buffer + //! @param __other The other \c uninitialized_async_buffer. + constexpr void swap(uninitialized_async_buffer& __other) noexcept + { + _CUDA_VSTD::swap(__mr_, __other.__mr_); + _CUDA_VSTD::swap(__count_, __other.__count_); + _CUDA_VSTD::swap(__buf_, __other.__buf_); + } + +# ifndef DOXYGEN_SHOULD_SKIP_THIS // friend functions are currently broken + //! @brief Forwards the passed properties + _LIBCUDACXX_TEMPLATE(class _Property) + _LIBCUDACXX_REQUIRES((!property_with_value<_Property>) _LIBCUDACXX_AND _CUDA_VSTD::_One_of<_Property, _Properties...>) + friend constexpr void get_property(const uninitialized_async_buffer&, _Property) noexcept {} +# endif // DOXYGEN_SHOULD_SKIP_THIS +}; + +template +using uninitialized_async_device_buffer = uninitialized_async_buffer<_Tp, _CUDA_VMR::device_accessible>; + +} // namespace cuda::experimental + +#endif // _CCCL_STD_VER >= 2014 && !_CCCL_COMPILER_MSVC_2017 && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE + +#endif //__CUDAX__CONTAINERS_UNINITIALIZED_ASYNC_BUFFER_H diff --git a/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh index 817b9782af..9c88df1d95 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -128,12 +129,9 @@ public: //! @param __other Another \c uninitialized_buffer uninitialized_buffer(uninitialized_buffer&& __other) noexcept : __mr_(_CUDA_VSTD::move(__other.__mr_)) - , __count_(__other.__count_) - , __buf_(__other.__buf_) - { - __other.__count_ = 0; - __other.__buf_ = nullptr; - } + , __count_(_CUDA_VSTD::exchange(__other.__count_, 0)) + , __buf_(_CUDA_VSTD::exchange(__other.__buf_, nullptr)) + {} //! @brief Move assignment //! @param __other Another \c uninitialized_buffer @@ -148,11 +146,9 @@ public: { __mr_.deallocate(__buf_, __get_allocation_size(__count_)); } - __mr_ = _CUDA_VSTD::move(__other.__mr_); - __count_ = __other.__count_; - __buf_ = __other.__buf_; - __other.__count_ = 0; - __other.__buf_ = nullptr; + __mr_ = _CUDA_VSTD::move(__other.__mr_); + __count_ = _CUDA_VSTD::exchange(__other.__count_, 0); + __buf_ = _CUDA_VSTD::exchange(__other.__buf_, nullptr); return *this; } @@ -196,7 +192,7 @@ public: //! allocate the buffer //! @endrst _CCCL_EXEC_CHECK_DISABLE - _CCCL_NODISCARD _CCCL_HOST_DEVICE _CUDA_VMR::resource_ref<_Properties...> resource() const noexcept + _CCCL_NODISCARD _CCCL_HOST_DEVICE _CUDA_VMR::resource_ref<_Properties...> get_resource() const noexcept { return _CUDA_VMR::resource_ref<_Properties...>{const_cast(this)->__mr_}; } diff --git a/cudax/include/cuda/experimental/__memory_resource/async_memory_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/async_memory_resource.cuh index fb2326dfab..b9fd038dd9 100644 --- a/cudax/include/cuda/experimental/__memory_resource/async_memory_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/async_memory_resource.cuh @@ -44,6 +44,9 @@ # if _CCCL_STD_VER >= 2014 +//! @file +//! The \c async_memory_pool class provides an asynchronous memory resource that allocates device memory in stream +//! order. namespace cuda::experimental::mr { diff --git a/cudax/include/cuda/experimental/buffer.cuh b/cudax/include/cuda/experimental/buffer.cuh index 93dc454d39..061b884e29 100644 --- a/cudax/include/cuda/experimental/buffer.cuh +++ b/cudax/include/cuda/experimental/buffer.cuh @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef __CUDAX_BUFFER -#define __CUDAX_BUFFER +#ifndef __CUDAX_BUFFER__ +#define __CUDAX_BUFFER__ #include @@ -20,6 +20,7 @@ # pragma system_header #endif // no system header +#include #include #endif // __CUDAX_BUFFER diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index d4ec714e07..a50ab0b1ce 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -88,6 +88,7 @@ foreach(cn_target IN LISTS cudax_TARGETS) cudax_add_catch2_test(test_target containers ${cn_target} containers/uninitialized_buffer.cu + containers/uninitialized_async_buffer.cu ) cudax_add_catch2_test(test_target memory_resource ${cn_target} diff --git a/cudax/test/containers/uninitialized_async_buffer.cu b/cudax/test/containers/uninitialized_async_buffer.cu new file mode 100644 index 0000000000..b32d322827 --- /dev/null +++ b/cudax/test/containers/uninitialized_async_buffer.cu @@ -0,0 +1,158 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "testing.cuh" +#include + +struct do_not_construct +{ + do_not_construct() + { + CHECK(false); + } +}; + +struct my_property +{ + using value_type = int; +}; +constexpr int get_property(const cuda::experimental::uninitialized_async_buffer&, my_property) +{ + return 42; +} + +TEMPLATE_TEST_CASE( + "uninitialized_async_buffer", "[container]", char, short, int, long, long long, float, double, do_not_construct) +{ + using uninitialized_async_buffer = cuda::experimental::uninitialized_async_buffer; + static_assert(!cuda::std::is_default_constructible::value, ""); + static_assert(!cuda::std::is_copy_constructible::value, ""); + static_assert(!cuda::std::is_copy_assignable::value, ""); + + cuda::experimental::mr::async_memory_resource resource{}; + cuda::experimental::stream stream{}; + + SECTION("construction") + { + { + uninitialized_async_buffer from_stream_count{resource, stream, 42}; + CUDAX_CHECK(from_stream_count.data() != nullptr); + CUDAX_CHECK(from_stream_count.size() == 42); + } + + { + uninitialized_async_buffer input{resource, stream, 42}; + const TestType* ptr = input.data(); + + uninitialized_async_buffer from_rvalue{cuda::std::move(input)}; + CUDAX_CHECK(from_rvalue.data() == ptr); + CUDAX_CHECK(from_rvalue.size() == 42); + CUDAX_CHECK(from_rvalue.get_stream() == stream); + + // Ensure that we properly reset the input buffer + CUDAX_CHECK(input.data() == nullptr); + CUDAX_CHECK(input.size() == 0); + CUDAX_CHECK(input.get_stream() == cuda::stream_ref{}); + } + } + + SECTION("assignment") + { + static_assert(!cuda::std::is_copy_assignable::value, ""); + + { + cuda::experimental::stream other_stream{}; + uninitialized_async_buffer input{resource, other_stream, 42}; + const TestType* ptr = input.data(); + + uninitialized_async_buffer assign_rvalue{resource, stream, 1337}; + assign_rvalue = cuda::std::move(input); + CUDAX_CHECK(assign_rvalue.data() == ptr); + CUDAX_CHECK(assign_rvalue.size() == 42); + CUDAX_CHECK(assign_rvalue.get_stream() == other_stream); + + // Ensure that we properly reset the input buffer + CUDAX_CHECK(input.data() == nullptr); + CUDAX_CHECK(input.size() == 0); + CUDAX_CHECK(input.get_stream() == cuda::stream_ref{}); + } + + { // Ensure self move assignment doesnt do anything + uninitialized_async_buffer buf{resource, stream, 42}; + const auto* old_ptr = buf.data(); + + buf = cuda::std::move(buf); + CUDAX_CHECK(buf.data() == old_ptr); + CUDAX_CHECK(buf.get_stream() == stream); + CUDAX_CHECK(buf.size() == 42); + } + } + + SECTION("access") + { + uninitialized_async_buffer buf{resource, stream, 42}; + CUDAX_CHECK(buf.data() != nullptr); + CUDAX_CHECK(buf.size() == 42); + CUDAX_CHECK(buf.begin() == buf.data()); + CUDAX_CHECK(buf.end() == buf.begin() + buf.size()); + CUDAX_CHECK(buf.get_stream() == stream); + CUDAX_CHECK(buf.get_resource() == resource); + + CUDAX_CHECK(cuda::std::as_const(buf).data() != nullptr); + CUDAX_CHECK(cuda::std::as_const(buf).size() == 42); + CUDAX_CHECK(cuda::std::as_const(buf).begin() == buf.data()); + CUDAX_CHECK(cuda::std::as_const(buf).end() == buf.begin() + buf.size()); + CUDAX_CHECK(cuda::std::as_const(buf).get_stream() == stream); + CUDAX_CHECK(cuda::std::as_const(buf).get_resource() == resource); + } + + SECTION("properties") + { + static_assert(cuda::has_property, + cuda::mr::device_accessible>, + ""); + static_assert(cuda::has_property, my_property>, + ""); + } + + SECTION("conversion to span") + { + uninitialized_async_buffer buf{resource, stream, 42}; + const cuda::std::span as_span{buf}; + CUDAX_CHECK(as_span.data() == buf.data()); + CUDAX_CHECK(as_span.size() == 42); + } + + SECTION("Actually use memory") + { + if constexpr (!cuda::std::is_same_v) + { + uninitialized_async_buffer buf{resource, stream, 42}; + stream.wait(); + thrust::fill(thrust::device, buf.begin(), buf.end(), TestType{2}); + const auto res = thrust::reduce(thrust::device, buf.begin(), buf.end(), TestType{0}, thrust::plus()); + CUDAX_CHECK(res == TestType{84}); + } + } +} diff --git a/cudax/test/containers/uninitialized_buffer.cu b/cudax/test/containers/uninitialized_buffer.cu index 0386588fea..c924750a8a 100644 --- a/cudax/test/containers/uninitialized_buffer.cu +++ b/cudax/test/containers/uninitialized_buffer.cu @@ -23,6 +23,7 @@ #include #include "testing.cuh" +#include struct do_not_construct { @@ -101,7 +102,7 @@ TEMPLATE_TEST_CASE( CUDAX_CHECK(buf.data() != old_ptr); CUDAX_CHECK(buf.data() == old_input_ptr); CUDAX_CHECK(buf.size() == 42); - CUDAX_CHECK(buf.resource() == other_resource); + CUDAX_CHECK(buf.get_resource() == other_resource); CUDAX_CHECK(input.data() == nullptr); CUDAX_CHECK(input.size() == 0); @@ -124,13 +125,13 @@ TEMPLATE_TEST_CASE( CUDAX_CHECK(buf.size() == 42); CUDAX_CHECK(buf.begin() == buf.data()); CUDAX_CHECK(buf.end() == buf.begin() + buf.size()); - CUDAX_CHECK(buf.resource() == resource); + CUDAX_CHECK(buf.get_resource() == resource); CUDAX_CHECK(cuda::std::as_const(buf).data() != nullptr); CUDAX_CHECK(cuda::std::as_const(buf).size() == 42); CUDAX_CHECK(cuda::std::as_const(buf).begin() == buf.data()); CUDAX_CHECK(cuda::std::as_const(buf).end() == buf.begin() + buf.size()); - CUDAX_CHECK(cuda::std::as_const(buf).resource() == resource); + CUDAX_CHECK(cuda::std::as_const(buf).get_resource() == resource); } SECTION("properties") diff --git a/docs/cudax/container.rst b/docs/cudax/container.rst index 66a2ec6ad8..968b85f56d 100644 --- a/docs/cudax/container.rst +++ b/docs/cudax/container.rst @@ -8,6 +8,7 @@ Containers library :maxdepth: 1 ${repo_docs_api_path}/class*uninitialized__buffer* + ${repo_docs_api_path}/class*uninitialized__async__buffer* The headers of the container library provide facilities to store elements on the heap. They are heavily inspired by the C++ `containers library `__ but deviate from the standard provided ones due to different requirements from @@ -21,5 +22,5 @@ annotations are checked by the type system. :header-rows: 0 * - :ref:` ` - - Facilities providing uninitialized *heterogeneous* storage satisfying a set of properties + - Facilities providing uninitialized *heterogeneous* potentially stream ordered storage satisfying a set of properties - cudax 2.7.0 / CCCL 2.7.0