From e3c2e2b7ffac6d37033a454ba47fc983dd5999b8 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 17 Sep 2024 19:43:47 +0200 Subject: [PATCH] [PoC]: Implement `cuda::experimental::uninitialized_async_buffer` (#1854) * Implement `cuda::uninitialized_async_buffer` This uninitialized buffer provides a stream ordered allocation of N elements of type T utilitzing a cuda::mr::async_resource to allocate the storage. The buffer takes care of alignment and deallocation of the storage. The user is required to ensure that the lifetime of the memory resource exceeds the lifetime of the buffer. Co-authored-by: Mark Harris <783069+harrism@users.noreply.github.com> --- .../uninitialized_async_buffer.cuh | 258 ++++++++++++++++++ .../__container/uninitialized_buffer.cuh | 20 +- .../async_memory_resource.cuh | 3 + cudax/include/cuda/experimental/buffer.cuh | 5 +- cudax/test/CMakeLists.txt | 1 + .../containers/uninitialized_async_buffer.cu | 158 +++++++++++ cudax/test/containers/uninitialized_buffer.cu | 7 +- docs/cudax/container.rst | 3 +- 8 files changed, 437 insertions(+), 18 deletions(-) create mode 100644 cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh create mode 100644 cudax/test/containers/uninitialized_async_buffer.cu diff --git a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh new file mode 100644 index 0000000000..eea30c1b69 --- /dev/null +++ b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh @@ -0,0 +1,258 @@ +//===----------------------------------------------------------------------===// +// +// Part of the CUDA Toolkit, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __CUDAX__CONTAINERS_UNINITIALIZED_ASYNC_BUFFER_H +#define __CUDAX__CONTAINERS_UNINITIALIZED_ASYNC_BUFFER_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#if _CCCL_STD_VER >= 2014 && !defined(_CCCL_COMPILER_MSVC_2017) \ + && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) + +//! @file +//! The \c uninitialized_async_buffer class provides a typed buffer allocated in stream-order from a given memory +//! resource. +namespace cuda::experimental +{ + +//! @rst +//! .. _cudax-containers-uninitialized-async-buffer: +//! +//! Uninitialized stream-ordered type-safe memory storage +//! ------------------------------------------------------ +//! +//! ``uninitialized_async_buffer`` provides a typed buffer allocated in stream order from a given :ref:`async memory +//! resource `. It handles alignment and release of the allocation. +//! The memory is uninitialized, so that a user needs to ensure elements are properly constructed. +//! +//! In addition to being type safe, ``uninitialized_async_buffer`` also takes a set of :ref:`properties +//! ` to ensure that e.g. execution space constraints are checked +//! at compile time. However, only stateless properties can be forwarded. To use a stateful property, +//! implement :ref:`get_property(const uninitialized_async_buffer&, Property) +//! `. +//! +//! .. warning:: +//! +//! ``uninitialized_async_buffer`` uses `stream-ordered allocation +//! `__. It is the user's +//! resposibility to ensure the lifetime of both the provided async resource and the stream exceed the lifetime of +//! the buffer. +//! +//! @endrst +//! @tparam _T the type to be stored in the buffer +//! @tparam _Properties... The properties the allocated memory satisfies +template +class uninitialized_async_buffer +{ +private: + ::cuda::experimental::mr::async_any_resource<_Properties...> __mr_; + ::cuda::stream_ref __stream_ = {}; + size_t __count_ = 0; + void* __buf_ = nullptr; + + //! @brief Determines the allocation size given the alignment and size of `T` + _CCCL_NODISCARD static constexpr size_t __get_allocation_size(const size_t __count) noexcept + { + constexpr size_t __alignment = alignof(_Tp); + return (__count * sizeof(_Tp) + (__alignment - 1)) & ~(__alignment - 1); + } + + //! @brief Determines the properly aligned start of the buffer given the alignment and size of `T` + _CCCL_NODISCARD constexpr _Tp* __get_data() const noexcept + { + constexpr size_t __alignment = alignof(_Tp); + size_t __space = __get_allocation_size(__count_); + void* __ptr = __buf_; + return _CUDA_VSTD::launder( + reinterpret_cast<_Tp*>(_CUDA_VSTD::align(__alignment, __count_ * sizeof(_Tp), __ptr, __space))); + } + + //! @brief Causes the buffer to be treated as a span when passed to cudax::launch. + //! @pre The buffer must have the cuda::mr::device_accessible property. + _CCCL_NODISCARD_FRIEND _CUDA_VSTD::span<_Tp> + __cudax_launch_transform(::cuda::stream_ref, uninitialized_async_buffer& __self) noexcept + { + static_assert(_CUDA_VSTD::_One_of<_CUDA_VMR::device_accessible, _Properties...>, + "The buffer must be device accessible to be passed to `launch`"); + return {__self.__get_data(), __self.size()}; + } + + //! @brief Causes the buffer to be treated as a span when passed to cudax::launch + //! @pre The buffer must have the cuda::mr::device_accessible property. + _CCCL_NODISCARD_FRIEND _CUDA_VSTD::span + __cudax_launch_transform(::cuda::stream_ref, const uninitialized_async_buffer& __self) noexcept + { + static_assert(_CUDA_VSTD::_One_of<_CUDA_VMR::device_accessible, _Properties...>, + "The buffer must be device accessible to be passed to `launch`"); + return {__self.__get_data(), __self.size()}; + } + +public: + using value_type = _Tp; + using reference = _Tp&; + using pointer = _Tp*; + using size_type = size_t; + + //! @brief Constructs an \c uninitialized_async_buffer, allocating sufficient storage for \p __count elements using + //! \p __mr + //! @param __mr The async memory resource to allocate the buffer with. + //! @param __stream The CUDA stream used for stream-ordered allocation. + //! @param __count The desired size of the buffer. + //! @note Depending on the alignment requirements of `T` the size of the underlying allocation might be larger + //! than `count * sizeof(T)`. Only allocates memory when \p __count > 0 + uninitialized_async_buffer(::cuda::experimental::mr::async_any_resource<_Properties...> __mr, + const ::cuda::stream_ref __stream, + const size_t __count) + : __mr_(_CUDA_VSTD::move(__mr)) + , __stream_(__stream) + , __count_(__count) + , __buf_(__count_ == 0 ? nullptr : __mr_.allocate_async(__get_allocation_size(__count_), __stream_)) + {} + + uninitialized_async_buffer(const uninitialized_async_buffer&) = delete; + uninitialized_async_buffer& operator=(const uninitialized_async_buffer&) = delete; + + //! @brief Move construction + //! @param __other Another \c uninitialized_async_buffer + uninitialized_async_buffer(uninitialized_async_buffer&& __other) noexcept + : __mr_(_CUDA_VSTD::move(__other.__mr_)) + , __stream_(_CUDA_VSTD::exchange(__other.__stream_, {})) + , __count_(_CUDA_VSTD::exchange(__other.__count_, 0)) + , __buf_(_CUDA_VSTD::exchange(__other.__buf_, nullptr)) + {} + + //! @brief Move assignment + //! @param __other Another \c uninitialized_async_buffer + uninitialized_async_buffer& operator=(uninitialized_async_buffer&& __other) noexcept + { + if (this == _CUDA_VSTD::addressof(__other)) + { + return *this; + } + + if (__buf_) + { + __mr_.deallocate_async(__buf_, __get_allocation_size(__count_), __stream_); + } + __mr_ = __other.__mr_; + __stream_ = _CUDA_VSTD::exchange(__other.__stream_, {}); + __count_ = _CUDA_VSTD::exchange(__other.__count_, 0); + __buf_ = _CUDA_VSTD::exchange(__other.__buf_, nullptr); + return *this; + } + //! @brief Destroys an \c uninitialized_async_buffer and deallocates the buffer in stream order on the stream that was + //! used to create the buffer. + //! @warning The destructor does not destroy any objects that may or may not reside within the buffer. It is the + //! user's responsibility to ensure that all objects within the buffer have been properly destroyed. + ~uninitialized_async_buffer() + { + if (__buf_) + { + __mr_.deallocate_async(__buf_, __get_allocation_size(__count_), __stream_); + } + } + + //! @brief Returns an aligned pointer to the buffer + _CCCL_NODISCARD constexpr pointer begin() const noexcept + { + return __get_data(); + } + + //! @brief Returns an aligned pointer to the element following the last element of the buffer. + //! This element acts as a placeholder; attempting to access it results in undefined behavior. + _CCCL_NODISCARD constexpr pointer end() const noexcept + { + return __get_data() + __count_; + } + + //! @brief Returns an aligned pointer to the buffer + _CCCL_NODISCARD constexpr pointer data() const noexcept + { + return __get_data(); + } + + //! @brief Returns the size of the buffer + _CCCL_NODISCARD constexpr size_t size() const noexcept + { + return __count_; + } + + //! @rst + //! Returns an :ref:`asnyc_resource_ref ` to the resource used + //! to allocate the buffer + //! @endrst + _CCCL_NODISCARD _CUDA_VMR::async_resource_ref<_Properties...> get_resource() const noexcept + { + return _CUDA_VMR::async_resource_ref<_Properties...>{const_cast(this)->__mr_}; + } + + //! @brief Returns the stored stream + _CCCL_NODISCARD constexpr ::cuda::stream_ref get_stream() const noexcept + { + return __stream_; + } + + //! @brief Replaces the stored stream + //! @param __new_stream the new stream + //! @note Always synchronizes with the old stream + constexpr void change_stream(::cuda::stream_ref __new_stream) + { + if (__new_stream != __stream_) + { + __stream_.wait(); + } + __stream_ = __new_stream; + } + + //! @brief Swaps the contents with those of another \c uninitialized_async_buffer + //! @param __other The other \c uninitialized_async_buffer. + constexpr void swap(uninitialized_async_buffer& __other) noexcept + { + _CUDA_VSTD::swap(__mr_, __other.__mr_); + _CUDA_VSTD::swap(__count_, __other.__count_); + _CUDA_VSTD::swap(__buf_, __other.__buf_); + } + +# ifndef DOXYGEN_SHOULD_SKIP_THIS // friend functions are currently broken + //! @brief Forwards the passed properties + _LIBCUDACXX_TEMPLATE(class _Property) + _LIBCUDACXX_REQUIRES((!property_with_value<_Property>) _LIBCUDACXX_AND _CUDA_VSTD::_One_of<_Property, _Properties...>) + friend constexpr void get_property(const uninitialized_async_buffer&, _Property) noexcept {} +# endif // DOXYGEN_SHOULD_SKIP_THIS +}; + +template +using uninitialized_async_device_buffer = uninitialized_async_buffer<_Tp, _CUDA_VMR::device_accessible>; + +} // namespace cuda::experimental + +#endif // _CCCL_STD_VER >= 2014 && !_CCCL_COMPILER_MSVC_2017 && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE + +#endif //__CUDAX__CONTAINERS_UNINITIALIZED_ASYNC_BUFFER_H diff --git a/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh index 817b9782af..9c88df1d95 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -128,12 +129,9 @@ public: //! @param __other Another \c uninitialized_buffer uninitialized_buffer(uninitialized_buffer&& __other) noexcept : __mr_(_CUDA_VSTD::move(__other.__mr_)) - , __count_(__other.__count_) - , __buf_(__other.__buf_) - { - __other.__count_ = 0; - __other.__buf_ = nullptr; - } + , __count_(_CUDA_VSTD::exchange(__other.__count_, 0)) + , __buf_(_CUDA_VSTD::exchange(__other.__buf_, nullptr)) + {} //! @brief Move assignment //! @param __other Another \c uninitialized_buffer @@ -148,11 +146,9 @@ public: { __mr_.deallocate(__buf_, __get_allocation_size(__count_)); } - __mr_ = _CUDA_VSTD::move(__other.__mr_); - __count_ = __other.__count_; - __buf_ = __other.__buf_; - __other.__count_ = 0; - __other.__buf_ = nullptr; + __mr_ = _CUDA_VSTD::move(__other.__mr_); + __count_ = _CUDA_VSTD::exchange(__other.__count_, 0); + __buf_ = _CUDA_VSTD::exchange(__other.__buf_, nullptr); return *this; } @@ -196,7 +192,7 @@ public: //! allocate the buffer //! @endrst _CCCL_EXEC_CHECK_DISABLE - _CCCL_NODISCARD _CCCL_HOST_DEVICE _CUDA_VMR::resource_ref<_Properties...> resource() const noexcept + _CCCL_NODISCARD _CCCL_HOST_DEVICE _CUDA_VMR::resource_ref<_Properties...> get_resource() const noexcept { return _CUDA_VMR::resource_ref<_Properties...>{const_cast(this)->__mr_}; } diff --git a/cudax/include/cuda/experimental/__memory_resource/async_memory_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/async_memory_resource.cuh index fb2326dfab..b9fd038dd9 100644 --- a/cudax/include/cuda/experimental/__memory_resource/async_memory_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/async_memory_resource.cuh @@ -44,6 +44,9 @@ # if _CCCL_STD_VER >= 2014 +//! @file +//! The \c async_memory_pool class provides an asynchronous memory resource that allocates device memory in stream +//! order. namespace cuda::experimental::mr { diff --git a/cudax/include/cuda/experimental/buffer.cuh b/cudax/include/cuda/experimental/buffer.cuh index 93dc454d39..061b884e29 100644 --- a/cudax/include/cuda/experimental/buffer.cuh +++ b/cudax/include/cuda/experimental/buffer.cuh @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef __CUDAX_BUFFER -#define __CUDAX_BUFFER +#ifndef __CUDAX_BUFFER__ +#define __CUDAX_BUFFER__ #include @@ -20,6 +20,7 @@ # pragma system_header #endif // no system header +#include #include #endif // __CUDAX_BUFFER diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index d4ec714e07..a50ab0b1ce 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -88,6 +88,7 @@ foreach(cn_target IN LISTS cudax_TARGETS) cudax_add_catch2_test(test_target containers ${cn_target} containers/uninitialized_buffer.cu + containers/uninitialized_async_buffer.cu ) cudax_add_catch2_test(test_target memory_resource ${cn_target} diff --git a/cudax/test/containers/uninitialized_async_buffer.cu b/cudax/test/containers/uninitialized_async_buffer.cu new file mode 100644 index 0000000000..b32d322827 --- /dev/null +++ b/cudax/test/containers/uninitialized_async_buffer.cu @@ -0,0 +1,158 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "testing.cuh" +#include + +struct do_not_construct +{ + do_not_construct() + { + CHECK(false); + } +}; + +struct my_property +{ + using value_type = int; +}; +constexpr int get_property(const cuda::experimental::uninitialized_async_buffer&, my_property) +{ + return 42; +} + +TEMPLATE_TEST_CASE( + "uninitialized_async_buffer", "[container]", char, short, int, long, long long, float, double, do_not_construct) +{ + using uninitialized_async_buffer = cuda::experimental::uninitialized_async_buffer; + static_assert(!cuda::std::is_default_constructible::value, ""); + static_assert(!cuda::std::is_copy_constructible::value, ""); + static_assert(!cuda::std::is_copy_assignable::value, ""); + + cuda::experimental::mr::async_memory_resource resource{}; + cuda::experimental::stream stream{}; + + SECTION("construction") + { + { + uninitialized_async_buffer from_stream_count{resource, stream, 42}; + CUDAX_CHECK(from_stream_count.data() != nullptr); + CUDAX_CHECK(from_stream_count.size() == 42); + } + + { + uninitialized_async_buffer input{resource, stream, 42}; + const TestType* ptr = input.data(); + + uninitialized_async_buffer from_rvalue{cuda::std::move(input)}; + CUDAX_CHECK(from_rvalue.data() == ptr); + CUDAX_CHECK(from_rvalue.size() == 42); + CUDAX_CHECK(from_rvalue.get_stream() == stream); + + // Ensure that we properly reset the input buffer + CUDAX_CHECK(input.data() == nullptr); + CUDAX_CHECK(input.size() == 0); + CUDAX_CHECK(input.get_stream() == cuda::stream_ref{}); + } + } + + SECTION("assignment") + { + static_assert(!cuda::std::is_copy_assignable::value, ""); + + { + cuda::experimental::stream other_stream{}; + uninitialized_async_buffer input{resource, other_stream, 42}; + const TestType* ptr = input.data(); + + uninitialized_async_buffer assign_rvalue{resource, stream, 1337}; + assign_rvalue = cuda::std::move(input); + CUDAX_CHECK(assign_rvalue.data() == ptr); + CUDAX_CHECK(assign_rvalue.size() == 42); + CUDAX_CHECK(assign_rvalue.get_stream() == other_stream); + + // Ensure that we properly reset the input buffer + CUDAX_CHECK(input.data() == nullptr); + CUDAX_CHECK(input.size() == 0); + CUDAX_CHECK(input.get_stream() == cuda::stream_ref{}); + } + + { // Ensure self move assignment doesnt do anything + uninitialized_async_buffer buf{resource, stream, 42}; + const auto* old_ptr = buf.data(); + + buf = cuda::std::move(buf); + CUDAX_CHECK(buf.data() == old_ptr); + CUDAX_CHECK(buf.get_stream() == stream); + CUDAX_CHECK(buf.size() == 42); + } + } + + SECTION("access") + { + uninitialized_async_buffer buf{resource, stream, 42}; + CUDAX_CHECK(buf.data() != nullptr); + CUDAX_CHECK(buf.size() == 42); + CUDAX_CHECK(buf.begin() == buf.data()); + CUDAX_CHECK(buf.end() == buf.begin() + buf.size()); + CUDAX_CHECK(buf.get_stream() == stream); + CUDAX_CHECK(buf.get_resource() == resource); + + CUDAX_CHECK(cuda::std::as_const(buf).data() != nullptr); + CUDAX_CHECK(cuda::std::as_const(buf).size() == 42); + CUDAX_CHECK(cuda::std::as_const(buf).begin() == buf.data()); + CUDAX_CHECK(cuda::std::as_const(buf).end() == buf.begin() + buf.size()); + CUDAX_CHECK(cuda::std::as_const(buf).get_stream() == stream); + CUDAX_CHECK(cuda::std::as_const(buf).get_resource() == resource); + } + + SECTION("properties") + { + static_assert(cuda::has_property, + cuda::mr::device_accessible>, + ""); + static_assert(cuda::has_property, my_property>, + ""); + } + + SECTION("conversion to span") + { + uninitialized_async_buffer buf{resource, stream, 42}; + const cuda::std::span as_span{buf}; + CUDAX_CHECK(as_span.data() == buf.data()); + CUDAX_CHECK(as_span.size() == 42); + } + + SECTION("Actually use memory") + { + if constexpr (!cuda::std::is_same_v) + { + uninitialized_async_buffer buf{resource, stream, 42}; + stream.wait(); + thrust::fill(thrust::device, buf.begin(), buf.end(), TestType{2}); + const auto res = thrust::reduce(thrust::device, buf.begin(), buf.end(), TestType{0}, thrust::plus()); + CUDAX_CHECK(res == TestType{84}); + } + } +} diff --git a/cudax/test/containers/uninitialized_buffer.cu b/cudax/test/containers/uninitialized_buffer.cu index 0386588fea..c924750a8a 100644 --- a/cudax/test/containers/uninitialized_buffer.cu +++ b/cudax/test/containers/uninitialized_buffer.cu @@ -23,6 +23,7 @@ #include #include "testing.cuh" +#include struct do_not_construct { @@ -101,7 +102,7 @@ TEMPLATE_TEST_CASE( CUDAX_CHECK(buf.data() != old_ptr); CUDAX_CHECK(buf.data() == old_input_ptr); CUDAX_CHECK(buf.size() == 42); - CUDAX_CHECK(buf.resource() == other_resource); + CUDAX_CHECK(buf.get_resource() == other_resource); CUDAX_CHECK(input.data() == nullptr); CUDAX_CHECK(input.size() == 0); @@ -124,13 +125,13 @@ TEMPLATE_TEST_CASE( CUDAX_CHECK(buf.size() == 42); CUDAX_CHECK(buf.begin() == buf.data()); CUDAX_CHECK(buf.end() == buf.begin() + buf.size()); - CUDAX_CHECK(buf.resource() == resource); + CUDAX_CHECK(buf.get_resource() == resource); CUDAX_CHECK(cuda::std::as_const(buf).data() != nullptr); CUDAX_CHECK(cuda::std::as_const(buf).size() == 42); CUDAX_CHECK(cuda::std::as_const(buf).begin() == buf.data()); CUDAX_CHECK(cuda::std::as_const(buf).end() == buf.begin() + buf.size()); - CUDAX_CHECK(cuda::std::as_const(buf).resource() == resource); + CUDAX_CHECK(cuda::std::as_const(buf).get_resource() == resource); } SECTION("properties") diff --git a/docs/cudax/container.rst b/docs/cudax/container.rst index 66a2ec6ad8..968b85f56d 100644 --- a/docs/cudax/container.rst +++ b/docs/cudax/container.rst @@ -8,6 +8,7 @@ Containers library :maxdepth: 1 ${repo_docs_api_path}/class*uninitialized__buffer* + ${repo_docs_api_path}/class*uninitialized__async__buffer* The headers of the container library provide facilities to store elements on the heap. They are heavily inspired by the C++ `containers library `__ but deviate from the standard provided ones due to different requirements from @@ -21,5 +22,5 @@ annotations are checked by the type system. :header-rows: 0 * - :ref:` ` - - Facilities providing uninitialized *heterogeneous* storage satisfying a set of properties + - Facilities providing uninitialized *heterogeneous* potentially stream ordered storage satisfying a set of properties - cudax 2.7.0 / CCCL 2.7.0