diff --git a/c/include/cccl/reduce.h b/c/include/cccl/c/reduce.h similarity index 85% rename from c/include/cccl/reduce.h rename to c/include/cccl/c/reduce.h index 5047625a85..1da7b51f01 100644 --- a/c/include/cccl/reduce.h +++ b/c/include/cccl/c/reduce.h @@ -11,12 +11,12 @@ #pragma once #ifndef CCCL_C_EXPERIMENTAL -# warning "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this warning." -#else // ^^^ !CCCL_C_EXPERIMENTAL ^^^ / vvv CCCL_C_EXPERIMENTAL vvv +# error "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this notice." +#endif // !CCCL_C_EXPERIMENTAL -# include +#include -# include +#include struct cccl_device_reduce_build_result_t { @@ -55,5 +55,3 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce( CUstream stream) noexcept; extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr); - -#endif // CCCL_C_EXPERIMENTAL diff --git a/c/include/cccl/types.h b/c/include/cccl/c/types.h similarity index 78% rename from c/include/cccl/types.h rename to c/include/cccl/c/types.h index 781b9f9ea6..f34466c755 100644 --- a/c/include/cccl/types.h +++ b/c/include/cccl/c/types.h @@ -11,14 +11,14 @@ #pragma once #ifndef CCCL_C_EXPERIMENTAL -# warning "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this warning." -#else // ^^^ !CCCL_C_EXPERIMENTAL ^^^ / vvv CCCL_C_EXPERIMENTAL vvv +# error "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this notice." 
+#endif // !CCCL_C_EXPERIMENTAL -# if defined(_WIN32) -# define CCCL_C_API __declspec(dllexport) -# else -# define CCCL_C_API __attribute__((visibility("default"))) -# endif +#if defined(_WIN32) +# define CCCL_C_API __declspec(dllexport) +#else // ^^^ _WIN32 ^^^ / vvv !_WIN32 vvv +# define CCCL_C_API __attribute__((visibility("default"))) +#endif // !_WIN32 enum class cccl_type_enum { @@ -81,5 +81,3 @@ struct cccl_iterator_t cccl_type_info value_type; void* state; }; - -#endif // CCCL_C_EXPERIMENTAL diff --git a/c/src/reduce.cu b/c/src/reduce.cu index 4badcd1ff0..fc88dd31a9 100644 --- a/c/src/reduce.cu +++ b/c/src/reduce.cu @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include diff --git a/c/test/c2h.h b/c/test/c2h.h index e2b26895a8..e044d2e17a 100644 --- a/c/test/c2h.h +++ b/c/test/c2h.h @@ -23,7 +23,7 @@ #include #include -#include +#include #include static std::string inspect_sass(const void* cubin, size_t cubin_size) diff --git a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh index 63afe3844c..2f06a666e8 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh @@ -204,7 +204,7 @@ public: } //! @rst - //! Returns a \c const reference to the :ref:`any_async_resource ` + //! Returns a \c const reference to the :ref:`any_async_resource ` //! that holds the memory resource used to allocate the buffer //! 
@endrst _CCCL_NODISCARD const __async_resource& get_resource() const noexcept diff --git a/cudax/include/cuda/experimental/__memory_resource/shared_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/shared_resource.cuh new file mode 100644 index 0000000000..bdd774f216 --- /dev/null +++ b/cudax/include/cuda/experimental/__memory_resource/shared_resource.cuh @@ -0,0 +1,273 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX__MEMORY_RESOURCE_SHARED_RESOURCE_H +#define _CUDAX__MEMORY_RESOURCE_SHARED_RESOURCE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// If the memory resource header was included without the experimental flag, +// tell the user to define the experimental flag. +#if defined(_CUDA_MEMORY_RESOURCE) && !defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) +# error "To use the experimental memory resource, define LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE" +#endif + +// cuda::mr is unavailable on MSVC 2017 +#if defined(_CCCL_COMPILER_MSVC_2017) +# error "The shared_resource header is not supported on MSVC 2017" +#endif + +#if !defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) +# define LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE +#endif + +#include +#include +#include +#include +#include +#include +#include + +namespace cuda::experimental::mr +{ + +//! 
@rst +//! .. _cudax-memory-resource-shared-resource: +//! +//! Resource wrapper to share ownership of a resource +//! -------------------------------------------------- +//! +//! ``shared_resource`` holds a reference counted instance of a memory resource. This allows +//! the user to pass a resource around with reference semantics while avoiding lifetime issues. +//! +//! @note ``shared_resource`` satisfies the ``cuda::mr::async_resource`` concept iff \tparam _Resource satisfies it. +//! @tparam _Resource The resource type to hold. +//! @endrst +template +struct shared_resource +{ + static_assert(_CUDA_VMR::resource<_Resource>, ""); + + //! @brief Constructs a \c shared_resource referring to an object of type \c _Resource + //! that has been constructed with arguments \c __args. The \c _Resource object is + //! dynamically allocated with \c new. + //! @param __args The arguments to be passed to the \c _Resource constructor. + template + explicit shared_resource(_Args&&... __args) + : __control_block(new _Control_block{_Resource{_CUDA_VSTD::forward<_Args>(__args)...}, 1}) + {} + + //! @brief Copy-constructs a \c shared_resource object resulting in a copy that shares + //! ownership of the wrapped resource with \c __other. + //! @param __other The \c shared_resource object to copy from. + shared_resource(const shared_resource& __other) noexcept + : __control_block(__other.__control_block) + { + if (__control_block) + { + __control_block->__ref_count.fetch_add(1, _CUDA_VSTD::memory_order_relaxed); + } + } + + //! @brief Move-constructs a \c shared_resource assuming ownership of the resource stored + //! in \c __other. + //! @param __other The \c shared_resource object to move from. + //! @post \c __other is left in a valid but unspecified state. + shared_resource(shared_resource&& __other) noexcept + : __control_block(_CUDA_VSTD::exchange(__other.__control_block, nullptr)) + {} + + //! @brief Releases the reference held by this \c shared_resource object. 
If this is the + //! last reference to the wrapped resource, the resource is deleted. + ~shared_resource() + { + if (__control_block && __control_block->__ref_count.fetch_sub(1, _CUDA_VSTD::memory_order_acq_rel) == 1) + { + delete __control_block; + } + } + + //! @brief Copy-assigns from \c __other. Self-assignment is a no-op. Otherwise, the reference + //! held by this \c shared_resource object is released and a new reference is acquired to the + //! wrapped resource of \c __other, if any. + //! @param __other The \c shared_resource object to copy from. + shared_resource& operator=(const shared_resource& __other) noexcept + { + if (this != &__other) + { + shared_resource(__other).swap(*this); + } + + return *this; + } + + //! @brief Move-assigns from \c __other. Self-assignment is a no-op. Otherwise, the reference + //! held by this \c shared_resource object is released, while the reference held by \c __other + //! is transferred to this object. + //! @param __other The \c shared_resource object to move from. + //! @post \c __other is left in a valid but unspecified state. + shared_resource& operator=(shared_resource&& __other) noexcept + { + if (this != &__other) + { + shared_resource(_CUDA_VSTD::move(__other)).swap(*this); + } + + return *this; + } + + //! @brief Swaps a \c shared_resource with another one. + //! @param __other The other \c shared_resource. + void swap(shared_resource& __other) noexcept + { + _CUDA_VSTD::swap(__control_block, __other.__control_block); + } + + //! @brief Swaps two \c shared_resource objects by forwarding to the member \c swap. + //! @param __left,__right The \c shared_resource objects to exchange. + friend void swap(shared_resource& __left, shared_resource& __right) noexcept + { + __left.swap(__right); + } + + //! @brief Allocate memory of size at least \p __bytes using the stored resource. + //! @param __bytes The size in bytes of the allocation. + //! @param __alignment The requested alignment of the allocation. + //! 
@return Pointer to the newly allocated memory + _CCCL_NODISCARD void* allocate(size_t __bytes, size_t __alignment = alignof(_CUDA_VSTD::max_align_t)) + { + return __control_block->__resource.allocate(__bytes, __alignment); + } + + //! @brief Deallocate memory pointed to by \p __ptr using the stored resource. + //! @param __ptr Pointer to be deallocated. Must have been allocated through a call to `allocate` + //! @param __bytes The number of bytes that was passed to the `allocate` call that returned \p __ptr. + //! @param __alignment The alignment that was passed to the `allocate` call that returned \p __ptr. + void deallocate(void* __ptr, size_t __bytes, size_t __alignment = alignof(_CUDA_VSTD::max_align_t)) noexcept + { + __control_block->__resource.deallocate(__ptr, __bytes, __alignment); + } + + //! @brief Enqueues an allocation of memory of size at least \p __bytes using + //! the wrapped resource. The allocation is performed asynchronously on stream \c __stream. + //! @pre \c _Resource must satisfy \c async_resource. + //! @param __bytes The size in bytes of the allocation. + //! @param __alignment The requested alignment of the allocation. + //! @return Pointer to the newly allocated memory. + //! @note The caller is responsible for ensuring that the memory is not accessed until the + //! operation has completed. + _LIBCUDACXX_TEMPLATE(class _ThisResource = _Resource) + _LIBCUDACXX_REQUIRES(_CUDA_VMR::async_resource<_ThisResource>) + _CCCL_NODISCARD void* async_allocate(size_t __bytes, size_t __alignment, ::cuda::stream_ref __stream) + { + return this->__control_block->__resource.async_allocate(__bytes, __alignment, __stream); + } + + //! @brief Enqueues the deallocation of memory pointed to by \c __ptr. The deallocation is + //! performed asynchronously on stream \c __stream. + //! @pre \c _Resource must satisfy \c async_resource. + //! @param __bytes The number of bytes that was passed to the `async_allocate` call that returned + //! \p __ptr. + //! 
@param __alignment The alignment that was passed to the `async_allocate` call that returned + //! \p __ptr. + //! @note The caller is responsible for ensuring that the memory is not accessed after the + //! operation has completed. + _LIBCUDACXX_TEMPLATE(class _ThisResource = _Resource) + _LIBCUDACXX_REQUIRES(_CUDA_VMR::async_resource<_ThisResource>) + void async_deallocate(void* __ptr, size_t __bytes, size_t __alignment, ::cuda::stream_ref __stream) + { + this->__control_block->__resource.async_deallocate(__ptr, __bytes, __alignment, __stream); + } + + //! @brief Equality comparison between two \c shared_resource + //! @param __lhs The first \c shared_resource + //! @param __rhs The other \c shared_resource + //! @return \c true if both share one control block (or both hold none) or the wrapped resources compare equal. + _CCCL_NODISCARD_FRIEND bool operator==(const shared_resource& __lhs, const shared_resource& __rhs) + { + if (__lhs.__control_block == __rhs.__control_block) + { + return true; + } + + if (__lhs.__control_block == nullptr || __rhs.__control_block == nullptr) + { + return false; + } + + return __lhs.__control_block->__resource == __rhs.__control_block->__resource; + } + + //! @brief Inequality comparison between two \c shared_resource + //! @param __lhs The first \c shared_resource + //! @param __rhs The other \c shared_resource + //! @return The negation of \c operator== applied to \c __lhs and \c __rhs. + _CCCL_NODISCARD_FRIEND bool operator!=(const shared_resource& __lhs, const shared_resource& __rhs) + { + return !(__lhs == __rhs); + } + + //! @brief Forwards the stateless properties + _LIBCUDACXX_TEMPLATE(class _Property) + _LIBCUDACXX_REQUIRES((!property_with_value<_Property>) _LIBCUDACXX_AND(has_property<_Resource, _Property>)) + friend void get_property(const shared_resource&, _Property) noexcept {} + + //! 
@brief Forwards the stateful properties + _LIBCUDACXX_TEMPLATE(class _Property) + _LIBCUDACXX_REQUIRES(property_with_value<_Property> _LIBCUDACXX_AND(has_property<_Resource, _Property>)) + _CCCL_NODISCARD_FRIEND __property_value_t<_Property> get_property(const shared_resource& __self, _Property) noexcept + { + return get_property(__self.__control_block->__resource, _Property{}); + } + +private: + // Use a custom shared_ptr implementation because (a) we don't need to support weak_ptr so we only + // need one pointer, not two, and (b) this implementation can work on device also. + struct _Control_block + { + _Resource __resource; + _CUDA_VSTD::atomic __ref_count; + }; + + _Control_block* __control_block; +}; + +//! @rst +//! .. _cudax-memory-resource-make-shared-resource: +//! +//! Factory function for `shared_resource` objects +//! ----------------------------------------------- +//! +//! ``make_shared_resource`` constructs a :ref:`shared_resource ` object that wraps +//! a newly constructed instance of the given resource type. The resource type must satisfy the ``cuda::mr::resource`` +//! concept. +//! +//! @param __args The arguments used to construct the instance of the resource type. +//! +//! @endrst +template +auto make_shared_resource(_Args&&... 
__args) -> shared_resource<_Resource> +{ + static_assert(_CUDA_VMR::resource<_Resource>, "_Resource does not satisfy the cuda::mr::resource concept"); + return shared_resource<_Resource>{_CUDA_VSTD::forward<_Args>(__args)...}; +} + +} // namespace cuda::experimental::mr + +#endif // _CUDAX__MEMORY_RESOURCE_SHARED_RESOURCE_H diff --git a/cudax/include/cuda/experimental/memory_resource.cuh b/cudax/include/cuda/experimental/memory_resource.cuh index 3ebce76451..d84559142e 100644 --- a/cudax/include/cuda/experimental/memory_resource.cuh +++ b/cudax/include/cuda/experimental/memory_resource.cuh @@ -14,5 +14,6 @@ #include #include #include +#include #endif // __CUDAX_MEMORY_RESOURCE___ diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index ebb8f88c4a..a1b554af92 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -96,6 +96,7 @@ foreach(cn_target IN LISTS cudax_TARGETS) memory_resource/any_resource.cu memory_resource/async_memory_pool.cu memory_resource/async_memory_resource.cu + memory_resource/shared_resource.cu ) cudax_add_catch2_test(test_target async_tests ${cn_target} diff --git a/cudax/test/memory_resource/shared_resource.cu b/cudax/test/memory_resource/shared_resource.cu new file mode 100644 index 0000000000..74d8376f55 --- /dev/null +++ b/cudax/test/memory_resource/shared_resource.cu @@ -0,0 +1,168 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include +#include + +#include "test_resource.h" +#include +#include + +TEMPLATE_TEST_CASE_METHOD(test_fixture, "shared_resource", "[container][resource]", big_resource, small_resource) +{ + using TestResource = TestType; + + SECTION("construct and destruct") + { + Counts expected{}; + CHECK(this->counts == expected); + { + cudax::mr::shared_resource mr{42, this}; + ++expected.object_count; + CHECK(this->counts == expected); + } + + --expected.object_count; + CHECK(this->counts == expected); + } + + // Reset the counters: + this->counts = Counts(); + + SECTION("copy and move") + { + Counts expected{}; + CHECK(this->counts == expected); + { + cudax::mr::shared_resource mr{42, this}; + ++expected.object_count; + CHECK(this->counts == expected); + + auto mr2 = mr; + CHECK(this->counts == expected); + CHECK(mr == mr2); // pointers compare equal, no call to TestResource::operator== + CHECK(this->counts == expected); + + auto mr3 = std::move(mr); + CHECK(this->counts == expected); + CHECK(mr2 == mr3); // pointers compare equal, no call to TestResource::operator== + CHECK(this->counts == expected); + + cudax::mr::shared_resource mr4{TestResource{42, this}}; + ++expected.object_count; + ++expected.move_count; + CHECK(mr3 == mr4); // pointers are not equal, calls TestResource::operator== + ++expected.equal_to_count; + CHECK(this->counts == expected); + } + + expected.object_count -= 2; + CHECK(this->counts == expected); + } + + // Reset the counters: + this->counts = Counts(); + + SECTION("allocate and deallocate") + { + Counts expected{}; + CHECK(this->counts == expected); + { + cudax::mr::shared_resource mr{42, this}; + ++expected.object_count; + CHECK(this->counts == expected); + + void* ptr = mr.allocate(bytes(50), align(8)); + CHECK(ptr == this); + ++expected.allocate_count; + CHECK(this->counts == expected); + + mr.deallocate(ptr, bytes(50), align(8)); + 
++expected.deallocate_count; + CHECK(this->counts == expected); + } + + --expected.object_count; + CHECK(this->counts == expected); + } + + // Reset the counters: + this->counts = Counts(); + + SECTION("conversion to resource_ref") + { + Counts expected{}; + { + cudax::mr::shared_resource mr{42, this}; + ++expected.object_count; + CHECK(this->counts == expected); + + cuda::mr::resource_ref<> ref = mr; + + CHECK(this->counts == expected); + auto* ptr = ref.allocate(bytes(100), align(8)); + CHECK(ptr == this); + ++expected.allocate_count; + CHECK(this->counts == expected); + ref.deallocate(ptr, bytes(0), align(0)); + ++expected.deallocate_count; + CHECK(this->counts == expected); + } + --expected.object_count; + CHECK(this->counts == expected); + } + + // Reset the counters: + this->counts = Counts(); + + SECTION("basic sanity test about shared resource handling") + { + Counts expected{}; + align(alignof(int) * 4); + { + bytes(42 * sizeof(int)); + cudax::uninitialized_buffer buffer{cudax::mr::shared_resource(42, this), 42}; + ++expected.object_count; + ++expected.allocate_count; + CHECK(this->counts == expected); + + // copying the shared_resource should not copy the stored resource + { + // accounting for new storage + bytes(1337 * sizeof(int)); + cudax::uninitialized_buffer other_buffer{buffer.get_resource(), 1337}; + ++expected.allocate_count; + CHECK(this->counts == expected); + } + + // The original resource is still alive, but the second allocation was released + bytes(42 * sizeof(int)); + ++expected.deallocate_count; + CHECK(this->counts == expected); + + { + // Moving the resource should not do anything + cudax::uninitialized_buffer third_buffer = ::cuda::std::move(buffer); + CHECK(this->counts == expected); + } + + // The original shared_resource has been moved from so everything is gone already + --expected.object_count; + ++expected.deallocate_count; + CHECK(this->counts == expected); + } + + // Nothing changes here as the first shared_resources has been 
moved from + CHECK(this->counts == expected); + } + + // Reset the counters: + this->counts = Counts(); +} diff --git a/docs/cudax/memory_resource.rst b/docs/cudax/memory_resource.rst index ed576b6ccd..e37c16c30a 100644 --- a/docs/cudax/memory_resource.rst +++ b/docs/cudax/memory_resource.rst @@ -12,6 +12,7 @@ Memory Resources ${repo_docs_api_path}/struct*async__memory__pool__properties* ${repo_docs_api_path}/class*async__memory__pool* ${repo_docs_api_path}/class*async__memory__resource* + ${repo_docs_api_path}/*shared__resource* The ```` header provides: - :ref:`any_resource ` and @@ -22,6 +23,9 @@ The ```` header provides: *stream-ordered* memory allocation tailored to the needs of CUDA C++ developers. This design builds off of the success of the `RAPIDS Memory Manager (RMM) `__ project and evolves the design based on lessons learned. + - :ref:`shared_resource ` a reference counted wrapper around a memory resource. + In contrast to :ref:`any_resource ` it provides shared ownership + semantics. ```` is not intended to replace RMM, but instead moves the definition of the memory allocation interface to a more centralized home in CCCL. 
RMM will remain as a collection of implementations of diff --git a/libcudacxx/test/libcudacxx/CMakeLists.txt b/libcudacxx/test/libcudacxx/CMakeLists.txt index b699b4b89f..605c98e888 100644 --- a/libcudacxx/test/libcudacxx/CMakeLists.txt +++ b/libcudacxx/test/libcudacxx/CMakeLists.txt @@ -112,7 +112,8 @@ if (NOT LIBCUDACXX_TEST_WITH_NVRTC) add_custom_target(libcudacxx.test.lit.precompile DEPENDS libcudacxx.test.public_headers libcudacxx.test.internal_headers libcudacxx.test.public_headers_host_only COMMAND "${CMAKE_COMMAND}" -E env "LIBCUDACXX_SITE_CONFIG=${lit_site_cfg_path}" - "${libcudacxx_LIT}" -vv --no-progress-bar ${libcudacxx_LIT_FLAGS} "-Dexecutor=\"NoopExecutor()\"" "${libcudacxx_SOURCE_DIR}/test/libcudacxx" + "${libcudacxx_LIT}" -vv --no-progress-bar --time-tests ${libcudacxx_LIT_FLAGS} + "-Dexecutor=\"NoopExecutor()\"" "${libcudacxx_SOURCE_DIR}/test/libcudacxx" ) endif() @@ -125,12 +126,11 @@ set(libcudacxx_LIT_PARALLEL_LEVEL 8 CACHE STRING add_test(NAME libcudacxx.test.lit COMMAND "${CMAKE_COMMAND}" -E env "LIBCUDACXX_SITE_CONFIG=${lit_site_cfg_path}" - "${libcudacxx_LIT}" -vv --no-progress-bar ${libcudacxx_LIT_FLAGS} - -j "${libcudacxx_LIT_PARALLEL_LEVEL}" - "${libcudacxx_SOURCE_DIR}/test/libcudacxx" + "${libcudacxx_LIT}" -vv --no-progress-bar --time-tests ${libcudacxx_LIT_FLAGS} + -j "${libcudacxx_LIT_PARALLEL_LEVEL}" "${libcudacxx_SOURCE_DIR}/test/libcudacxx" ) set_tests_properties(libcudacxx.test.lit PROPERTIES - TIMEOUT 4800 + TIMEOUT 5000 RUN_SERIAL TRUE )