From 7bd04adda86e3f00e4a36387d20df1f2823c9ece Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Thu, 19 Sep 2024 19:23:48 +0200
Subject: [PATCH 1/3] Implement `cudax::shared_resource` (#2398)

* Implement `cudax::shared_resource`

We currently have two basic building blocks around memory resources, `any_resource` and `resource_ref`.

However, while they make owning and sharing resources much easier, we can still run into lifetime issues.

If a user wants to pass a resource into a library function that might exceed the lifetime of the resource, they would need to move it into an any_resource.

However, they also might want to share that resource among multiple functions, e.g. a pool allocator. We need a way to properly share a resource in those circumstances.

Enter `shared_resource`. Rather than storing an `any_resource` this holds a `shared_ptr<any_resource>`.  With that we can happily copy / move them around without touching the stored resource.

Co-authored-by: Eric Niebler <eniebler@nvidia.com>
Co-authored-by: Mark Harris <783069+harrism@users.noreply.github.com>
---
 .../__memory_resource/shared_resource.cuh     | 273 ++++++++++++++++++
 .../cuda/experimental/memory_resource.cuh     |   1 +
 cudax/test/CMakeLists.txt                     |   1 +
 cudax/test/memory_resource/any_resource.cu    | 177 +-----------
 cudax/test/memory_resource/shared_resource.cu | 168 +++++++++++
 cudax/test/memory_resource/test_resource.h    | 179 ++++++++++++
 docs/cudax/memory_resource.rst                |   4 +
 7 files changed, 627 insertions(+), 176 deletions(-)
 create mode 100644 cudax/include/cuda/experimental/__memory_resource/shared_resource.cuh
 create mode 100644 cudax/test/memory_resource/shared_resource.cu
 create mode 100644 cudax/test/memory_resource/test_resource.h

diff --git a/cudax/include/cuda/experimental/__memory_resource/shared_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/shared_resource.cuh
new file mode 100644
index 0000000000..bdd774f216
--- /dev/null
+++ b/cudax/include/cuda/experimental/__memory_resource/shared_resource.cuh
@@ -0,0 +1,273 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDAX__MEMORY_RESOURCE_SHARED_RESOURCE_H
+#define _CUDAX__MEMORY_RESOURCE_SHARED_RESOURCE_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+// If the memory resource header was included without the experimental flag,
+// tell the user to define the experimental flag.
+#if defined(_CUDA_MEMORY_RESOURCE) && !defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE)
+#  error "To use the experimental memory resource, define LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE"
+#endif
+
+// cuda::mr is unavailable on MSVC 2017
+#if defined(_CCCL_COMPILER_MSVC_2017)
+#  error "The shared_resource header is not supported on MSVC 2017"
+#endif
+
+#if !defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE)
+#  define LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE
+#endif
+
+#include <cuda/__memory_resource/resource.h>
+#include <cuda/std/__new_>
+#include <cuda/std/__type_traits/is_swappable.h>
+#include <cuda/std/__utility/exchange.h>
+#include <cuda/std/__utility/forward.h>
+#include <cuda/std/__utility/move.h>
+#include <cuda/std/atomic>
+
+namespace cuda::experimental::mr
+{
+
+//! @rst
+//! .. _cudax-memory-resource-shared-resource:
+//!
+//! Resource wrapper to share ownership of a resource
+//! --------------------------------------------------
+//!
+//! ``shared_resource`` holds a reference counted instance of a memory resource. This allows
+//! the user to pass a resource around with reference semantics while avoiding lifetime issues.
+//!
+//! @note ``shared_resource`` satisfies the ``cuda::mr::async_resource`` concept iff \tparam _Resource satisfies it.
+//! @tparam _Resource The resource type to hold.
+//! @endrst
+template <class _Resource>
+struct shared_resource
+{
+  static_assert(_CUDA_VMR::resource<_Resource>, "");
+
+  //! @brief Constructs a \c shared_resource referring to an object of type \c _Resource
+  //! that has been constructed with arguments \c __args. The \c _Resource object is
+  //! dynamically allocated with \c new.
+  //! @param __args The arguments to be passed to the \c _Resource constructor.
+  template <class... _Args>
+  explicit shared_resource(_Args&&... __args)
+      : __control_block(new _Control_block{_Resource{_CUDA_VSTD::forward<_Args>(__args)...}, 1})
+  {}
+
+  //! @brief Copy-constructs a \c shared_resource object resulting in a copy that shares
+  //! ownership of the wrapped resource with \c __other.
+  //! @param __other The \c shared_resource object to copy from.
+  shared_resource(const shared_resource& __other) noexcept
+      : __control_block(__other.__control_block)
+  {
+    if (__control_block)
+    {
+      __control_block->__ref_count.fetch_add(1, _CUDA_VSTD::memory_order_relaxed);
+    }
+  }
+
+  //! @brief Move-constructs a \c shared_resource assuming ownership of the resource stored
+  //! in \c __other.
+  //! @param __other The \c shared_resource object to move from.
+  //! @post \c __other is left in a valid but unspecified state.
+  shared_resource(shared_resource&& __other) noexcept
+      : __control_block(_CUDA_VSTD::exchange(__other.__control_block, nullptr))
+  {}
+
+  //! @brief Releases the reference held by this \c shared_resource object. If this is the
+  //! last reference to the wrapped resource, the resource is deleted.
+  ~shared_resource()
+  {
+    if (__control_block && __control_block->__ref_count.fetch_sub(1, _CUDA_VSTD::memory_order_acq_rel) == 1)
+    {
+      delete __control_block;
+    }
+  }
+
+  //! @brief Copy-assigns from \c __other. Self-assignment is a no-op. Otherwise, the reference
+  //! held by this \c shared_resource object is released and a new reference is acquired to the
+  //! wrapped resource of \c __other, if any.
+  //! @param __other The \c shared_resource object to copy from.
+  shared_resource& operator=(const shared_resource& __other) noexcept
+  {
+    if (this != &__other)
+    {
+      shared_resource(__other).swap(*this);
+    }
+
+    return *this;
+  }
+
+  //! @brief Move-assigns from \c __other. Self-assignment is a no-op. Otherwise, the reference
+  //! held by this \c shared_resource object is released, while the reference held by \c __other
+  //! is transferred to this object.
+  //! @param __other The \c shared_resource object to move from.
+  //! @post \c __other is left in a valid but unspecified state.
+  shared_resource& operator=(shared_resource&& __other) noexcept
+  {
+    if (this != &__other)
+    {
+      shared_resource(_CUDA_VSTD::move(__other)).swap(*this);
+    }
+
+    return *this;
+  }
+
+  //! @brief Swaps a \c shared_resource with another one.
+  //! @param __other The other \c shared_resource.
+  void swap(shared_resource& __other) noexcept
+  {
+    _CUDA_VSTD::swap(__control_block, __other.__control_block);
+  }
+
+  //! @brief Swaps a \c shared_resource with another one.
+  //! @param __left,__right The two \c shared_resource objects to swap.
+  friend void swap(shared_resource& __left, shared_resource& __right) noexcept
+  {
+    __left.swap(__right);
+  }
+
+  //! @brief Allocate memory of size at least \p __bytes using the stored resource.
+  //! @param __bytes The size in bytes of the allocation.
+  //! @param __alignment The requested alignment of the allocation.
+  //! @return Pointer to the newly allocated memory
+  _CCCL_NODISCARD void* allocate(size_t __bytes, size_t __alignment = alignof(_CUDA_VSTD::max_align_t))
+  {
+    return __control_block->__resource.allocate(__bytes, __alignment);
+  }
+
+  //! @brief Deallocate memory pointed to by \p __ptr using the stored resource.
+  //! @param __ptr Pointer to be deallocated. Must have been allocated through a call to `allocate`
+  //! @param __bytes The number of bytes that was passed to the `allocate` call that returned \p __ptr.
+  //! @param __alignment The alignment that was passed to the `allocate` call that returned \p __ptr.
+  void deallocate(void* __ptr, size_t __bytes, size_t __alignment = alignof(_CUDA_VSTD::max_align_t)) noexcept
+  {
+    __control_block->__resource.deallocate(__ptr, __bytes, __alignment);
+  }
+
+  //! @brief Enqueues an allocation of memory of size at least \p __bytes using
+  //! the wrapped resource. The allocation is performed asynchronously on stream \c __stream.
+  //! @pre \c _Resource must satisfy \c async_resource.
+  //! @param __bytes The size in bytes of the allocation.
+  //! @param __alignment The requested alignment of the allocation.
+  //! @return Pointer to the newly allocated memory.
+  //! @note The caller is responsible for ensuring that the memory is not accessed until the
+  //! operation has completed.
+  _LIBCUDACXX_TEMPLATE(class _ThisResource = _Resource)
+  _LIBCUDACXX_REQUIRES(_CUDA_VMR::async_resource<_ThisResource>)
+  _CCCL_NODISCARD void* async_allocate(size_t __bytes, size_t __alignment, ::cuda::stream_ref __stream)
+  {
+    return this->__control_block->__resource.async_allocate(__bytes, __alignment, __stream);
+  }
+
+  //! @brief Enqueues the deallocation of memory pointed to by \c __ptr. The deallocation is
+  //! performed asynchronously on stream \c __stream.
+  //! @pre \c _Resource must satisfy \c async_resource.
+  //! @param __bytes The number of bytes that was passed to the `async_allocate` call that returned
+  //! \p __ptr.
+  //! @param __alignment The alignment that was passed to the `async_allocate` call that returned
+  //! \p __ptr.
+  //! @note The caller is responsible for ensuring that the memory is not accessed after the
+  //! operation has completed.
+  _LIBCUDACXX_TEMPLATE(class _ThisResource = _Resource)
+  _LIBCUDACXX_REQUIRES(_CUDA_VMR::async_resource<_ThisResource>)
+  void async_deallocate(void* __ptr, size_t __bytes, size_t __alignment, ::cuda::stream_ref __stream)
+  {
+    this->__control_block->__resource.async_deallocate(__ptr, __bytes, __alignment, __stream);
+  }
+
+  //! @brief Equality comparison between two \c shared_resource
+  //! @param __lhs The first \c shared_resource
+  //! @param __rhs The other \c shared_resource
+  //! @return Checks whether the objects refer to resources that compare equal.
+  _CCCL_NODISCARD_FRIEND bool operator==(const shared_resource& __lhs, const shared_resource& __rhs)
+  {
+    if (__lhs.__control_block == __rhs.__control_block)
+    {
+      return true;
+    }
+
+    if (__lhs.__control_block == nullptr || __rhs.__control_block == nullptr)
+    {
+      return false;
+    }
+
+    return __lhs.__control_block->__resource == __rhs.__control_block->__resource;
+  }
+
+  //! @brief Inequality comparison between two \c shared_resource
+  //! @param __lhs The first \c shared_resource
+  //! @param __rhs The other \c shared_resource
+  //! @return Checks whether the objects refer to resources that compare unequal.
+  _CCCL_NODISCARD_FRIEND bool operator!=(const shared_resource& __lhs, const shared_resource& __rhs)
+  {
+    return !(__lhs == __rhs);
+  }
+
+  //! @brief Forwards the stateless properties
+  _LIBCUDACXX_TEMPLATE(class _Property)
+  _LIBCUDACXX_REQUIRES((!property_with_value<_Property>) _LIBCUDACXX_AND(has_property<_Resource, _Property>))
+  friend void get_property(const shared_resource&, _Property) noexcept {}
+
+  //! @brief Forwards the stateful properties
+  _LIBCUDACXX_TEMPLATE(class _Property)
+  _LIBCUDACXX_REQUIRES(property_with_value<_Property> _LIBCUDACXX_AND(has_property<_Resource, _Property>))
+  _CCCL_NODISCARD_FRIEND __property_value_t<_Property> get_property(const shared_resource& __self, _Property) noexcept
+  {
+    return get_property(__self.__control_block->__resource, _Property{});
+  }
+
+private:
+  // Use a custom shared_ptr implementation because (a) we don't need to support weak_ptr so we only
+  // need one pointer, not two, and (b) this implementation can work on device also.
+  struct _Control_block
+  {
+    _Resource __resource;
+    _CUDA_VSTD::atomic<int> __ref_count;
+  };
+
+  _Control_block* __control_block;
+};
+
+//! @rst
+//! .. _cudax-memory-resource-make-shared-resource:
+//!
+//! Factory function for `shared_resource` objects
+//! -----------------------------------------------
+//!
+//! ``make_shared_resource`` constructs a :ref:`shared_resource <cudax-memory-resource-shared-resource>` object that wraps
+//! a newly constructed instance of the given resource type. The resource type must satisfy the ``cuda::mr::resource``
+//! concept. The arguments are forwarded to the constructor of the resource type.
+//!
+//! @param __args The arguments used to construct the instance of the resource type.
+//!
+//! @endrst
+template <class _Resource, class... _Args>
+auto make_shared_resource(_Args&&... __args) -> shared_resource<_Resource>
+{
+  static_assert(_CUDA_VMR::resource<_Resource>, "_Resource does not satisfy the cuda::mr::resource concept");
+  return shared_resource<_Resource>{_CUDA_VSTD::forward<_Args>(__args)...};
+}
+
+} // namespace cuda::experimental::mr
+
+#endif // _CUDAX__MEMORY_RESOURCE_SHARED_RESOURCE_H
diff --git a/cudax/include/cuda/experimental/memory_resource.cuh b/cudax/include/cuda/experimental/memory_resource.cuh
index 3ebce76451..d84559142e 100644
--- a/cudax/include/cuda/experimental/memory_resource.cuh
+++ b/cudax/include/cuda/experimental/memory_resource.cuh
@@ -14,5 +14,6 @@
 #include <cuda/experimental/__memory_resource/any_resource.cuh>
 #include <cuda/experimental/__memory_resource/async_memory_pool.cuh>
 #include <cuda/experimental/__memory_resource/async_memory_resource.cuh>
+#include <cuda/experimental/__memory_resource/shared_resource.cuh>
 
 #endif // __CUDAX_MEMORY_RESOURCE___
diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt
index a50ab0b1ce..3f9b29b1ab 100644
--- a/cudax/test/CMakeLists.txt
+++ b/cudax/test/CMakeLists.txt
@@ -95,6 +95,7 @@ foreach(cn_target IN LISTS cudax_TARGETS)
     memory_resource/any_resource.cu
     memory_resource/async_memory_pool.cu
     memory_resource/async_memory_resource.cu
+    memory_resource/shared_resource.cu
   )
 
   cudax_add_catch2_test(test_target async_tests ${cn_target}
diff --git a/cudax/test/memory_resource/any_resource.cu b/cudax/test/memory_resource/any_resource.cu
index ef6d1ef948..cc76f1755a 100644
--- a/cudax/test/memory_resource/any_resource.cu
+++ b/cudax/test/memory_resource/any_resource.cu
@@ -10,185 +10,10 @@
 
 #include <cuda/experimental/memory_resource.cuh>
 
-#include <cstddef>
-#include <cstdint>
-
-#include "cuda/std/detail/libcxx/include/cstddef"
+#include "test_resource.h"
 #include <catch2/catch.hpp>
 #include <testing.cuh>
 
-using std::size_t;
-using std::uintptr_t;
-
-struct Counts
-{
-  int object_count     = 0;
-  int move_count       = 0;
-  int copy_count       = 0;
-  int allocate_count   = 0;
-  int deallocate_count = 0;
-  int equal_to_count   = 0;
-  int new_count        = 0;
-  int delete_count     = 0;
-
-  friend std::ostream& operator<<(std::ostream& os, const Counts& counts)
-  {
-    return os
-        << "object: " << counts.object_count << ", " //
-        << "move: " << counts.move_count << ", " //
-        << "copy: " << counts.copy_count << ", " //
-        << "allocate: " << counts.allocate_count << ", " //
-        << "deallocate: " << counts.deallocate_count << ", " //
-        << "equal_to: " << counts.equal_to_count << ", " //
-        << "new: " << counts.new_count << ", " //
-        << "delete: " << counts.delete_count;
-  }
-
-  friend bool operator==(const Counts& lhs, const Counts& rhs) noexcept
-  {
-    return lhs.object_count == rhs.object_count && //
-           lhs.move_count == rhs.move_count && //
-           lhs.copy_count == rhs.copy_count && //
-           lhs.allocate_count == rhs.allocate_count && //
-           lhs.deallocate_count == rhs.deallocate_count && //
-           lhs.equal_to_count == rhs.equal_to_count && //
-           lhs.new_count == rhs.new_count && //
-           lhs.delete_count == rhs.delete_count; //
-  }
-
-  friend bool operator!=(const Counts& lhs, const Counts& rhs) noexcept
-  {
-    return !(lhs == rhs);
-  }
-};
-
-struct test_fixture_
-{
-  Counts counts;
-  size_t bytes_ = 0;
-  size_t align_ = 0;
-  static thread_local Counts* counts_;
-
-  test_fixture_() noexcept
-      : counts()
-  {
-    counts_ = &counts;
-  }
-
-  size_t bytes(size_t sz) noexcept
-  {
-    bytes_ = sz;
-    return bytes_;
-  }
-
-  size_t align(size_t align) noexcept
-  {
-    align_ = align;
-    return align_;
-  }
-};
-
-thread_local Counts* test_fixture_::counts_ = nullptr;
-
-template <class>
-using test_fixture = test_fixture_;
-
-template <class T>
-struct test_resource
-{
-  int data;
-  test_fixture_* fixture;
-  T cookie[2] = {0xDEADBEEF, 0xDEADBEEF};
-
-  explicit test_resource(int i, test_fixture_* fix) noexcept
-      : data(i)
-      , fixture(fix)
-  {
-    ++fixture->counts.object_count;
-  }
-
-  test_resource(test_resource&& other) noexcept
-      : data(other.data)
-      , fixture(other.fixture)
-  {
-    other._assert_valid();
-    ++fixture->counts.move_count;
-    ++fixture->counts.object_count;
-    other.cookie[0] = other.cookie[1] = 0x0C07FEFE;
-  }
-
-  test_resource(const test_resource& other) noexcept
-      : data(other.data)
-      , fixture(other.fixture)
-  {
-    other._assert_valid();
-    ++fixture->counts.copy_count;
-    ++fixture->counts.object_count;
-  }
-
-  ~test_resource()
-  {
-    --fixture->counts.object_count;
-  }
-
-  void* allocate(std::size_t bytes, std::size_t align)
-  {
-    _assert_valid();
-    CHECK(bytes == fixture->bytes_);
-    CHECK(align == fixture->align_);
-    ++fixture->counts.allocate_count;
-    return fixture;
-  }
-
-  void deallocate(void* ptr, std::size_t bytes, std::size_t align) noexcept
-  {
-    _assert_valid();
-    CHECK(ptr == fixture);
-    CHECK(bytes == fixture->bytes_);
-    CHECK(align == fixture->align_);
-    ++fixture->counts.deallocate_count;
-    return;
-  }
-
-  friend bool operator==(const test_resource& lhs, const test_resource& rhs)
-  {
-    lhs._assert_valid();
-    rhs._assert_valid();
-    ++lhs.fixture->counts.equal_to_count;
-    return lhs.data == rhs.data;
-  }
-
-  friend bool operator!=(const test_resource& lhs, const test_resource& rhs)
-  {
-    FAIL("any_resource should only be calling operator==");
-    return lhs.data != rhs.data;
-  }
-
-  void _assert_valid() const noexcept
-  {
-    REQUIRE(cookie[0] == 0xDEADBEEF);
-    REQUIRE(cookie[1] == 0xDEADBEEF);
-  }
-
-  static void* operator new(::cuda::std::size_t size)
-  {
-    ++test_fixture_::counts_->new_count;
-    return ::operator new(size);
-  }
-
-  static void operator delete(void* pv) noexcept
-  {
-    ++test_fixture_::counts_->delete_count;
-    return ::operator delete(pv);
-  }
-};
-
-using big_resource   = test_resource<uintptr_t>;
-using small_resource = test_resource<unsigned int>;
-
-static_assert(sizeof(big_resource) > sizeof(cuda::mr::_AnyResourceStorage));
-static_assert(sizeof(small_resource) <= sizeof(cuda::mr::_AnyResourceStorage));
-
 TEMPLATE_TEST_CASE_METHOD(test_fixture, "any_resource", "[container][resource]", big_resource, small_resource)
 {
   using TestResource    = TestType;
diff --git a/cudax/test/memory_resource/shared_resource.cu b/cudax/test/memory_resource/shared_resource.cu
new file mode 100644
index 0000000000..74d8376f55
--- /dev/null
+++ b/cudax/test/memory_resource/shared_resource.cu
@@ -0,0 +1,168 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/experimental/buffer.cuh>
+#include <cuda/experimental/memory_resource.cuh>
+
+#include "test_resource.h"
+#include <catch2/catch.hpp>
+#include <testing.cuh>
+
+TEMPLATE_TEST_CASE_METHOD(test_fixture, "shared_resource", "[container][resource]", big_resource, small_resource)
+{
+  using TestResource = TestType;
+
+  SECTION("construct and destruct")
+  {
+    Counts expected{};
+    CHECK(this->counts == expected);
+    {
+      cudax::mr::shared_resource<TestResource> mr{42, this};
+      ++expected.object_count;
+      CHECK(this->counts == expected);
+    }
+
+    --expected.object_count;
+    CHECK(this->counts == expected);
+  }
+
+  // Reset the counters:
+  this->counts = Counts();
+
+  SECTION("copy and move")
+  {
+    Counts expected{};
+    CHECK(this->counts == expected);
+    {
+      cudax::mr::shared_resource<TestResource> mr{42, this};
+      ++expected.object_count;
+      CHECK(this->counts == expected);
+
+      auto mr2 = mr;
+      CHECK(this->counts == expected);
+      CHECK(mr == mr2); // pointers compare equal, no call to TestResource::operator==
+      CHECK(this->counts == expected);
+
+      auto mr3 = std::move(mr);
+      CHECK(this->counts == expected);
+      CHECK(mr2 == mr3); // pointers compare equal, no call to TestResource::operator==
+      CHECK(this->counts == expected);
+
+      cudax::mr::shared_resource<TestResource> mr4{TestResource{42, this}};
+      ++expected.object_count;
+      ++expected.move_count;
+      CHECK(mr3 == mr4); // pointers are not equal, calls TestResource::operator==
+      ++expected.equal_to_count;
+      CHECK(this->counts == expected);
+    }
+
+    expected.object_count -= 2;
+    CHECK(this->counts == expected);
+  }
+
+  // Reset the counters:
+  this->counts = Counts();
+
+  SECTION("allocate and deallocate")
+  {
+    Counts expected{};
+    CHECK(this->counts == expected);
+    {
+      cudax::mr::shared_resource<TestResource> mr{42, this};
+      ++expected.object_count;
+      CHECK(this->counts == expected);
+
+      void* ptr = mr.allocate(bytes(50), align(8));
+      CHECK(ptr == this);
+      ++expected.allocate_count;
+      CHECK(this->counts == expected);
+
+      mr.deallocate(ptr, bytes(50), align(8));
+      ++expected.deallocate_count;
+      CHECK(this->counts == expected);
+    }
+
+    --expected.object_count;
+    CHECK(this->counts == expected);
+  }
+
+  // Reset the counters:
+  this->counts = Counts();
+
+  SECTION("conversion to resource_ref")
+  {
+    Counts expected{};
+    {
+      cudax::mr::shared_resource<TestResource> mr{42, this};
+      ++expected.object_count;
+      CHECK(this->counts == expected);
+
+      cuda::mr::resource_ref<> ref = mr;
+
+      CHECK(this->counts == expected);
+      auto* ptr = ref.allocate(bytes(100), align(8));
+      CHECK(ptr == this);
+      ++expected.allocate_count;
+      CHECK(this->counts == expected);
+      ref.deallocate(ptr, bytes(0), align(0));
+      ++expected.deallocate_count;
+      CHECK(this->counts == expected);
+    }
+    --expected.object_count;
+    CHECK(this->counts == expected);
+  }
+
+  // Reset the counters:
+  this->counts = Counts();
+
+  SECTION("basic sanity test about shared resource handling")
+  {
+    Counts expected{};
+    align(alignof(int) * 4);
+    {
+      bytes(42 * sizeof(int));
+      cudax::uninitialized_buffer<int> buffer{cudax::mr::shared_resource<TestResource>(42, this), 42};
+      ++expected.object_count;
+      ++expected.allocate_count;
+      CHECK(this->counts == expected);
+
+      // copying the shared_resource should not copy the stored resource
+      {
+        // accounting for new storage
+        bytes(1337 * sizeof(int));
+        cudax::uninitialized_buffer<int> other_buffer{buffer.get_resource(), 1337};
+        ++expected.allocate_count;
+        CHECK(this->counts == expected);
+      }
+
+      // The original resource is still alive, but the second allocation was released
+      bytes(42 * sizeof(int));
+      ++expected.deallocate_count;
+      CHECK(this->counts == expected);
+
+      {
+        // Moving the resource should not do anything
+        cudax::uninitialized_buffer<int> third_buffer = ::cuda::std::move(buffer);
+        CHECK(this->counts == expected);
+      }
+
+      // The original shared_resource has been moved from so everything is gone already
+      --expected.object_count;
+      ++expected.deallocate_count;
+      CHECK(this->counts == expected);
+    }
+
+    // Nothing changes here as the first shared_resource has been moved from
+    CHECK(this->counts == expected);
+  }
+
+  // Reset the counters:
+  this->counts = Counts();
+}
diff --git a/cudax/test/memory_resource/test_resource.h b/cudax/test/memory_resource/test_resource.h
new file mode 100644
index 0000000000..dfc103b1ac
--- /dev/null
+++ b/cudax/test/memory_resource/test_resource.h
@@ -0,0 +1,179 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#include <catch2/catch.hpp>
+#include <testing.cuh>
+
+using std::size_t;
+using std::uintptr_t;
+
+struct Counts
+{
+  int object_count     = 0;
+  int move_count       = 0;
+  int copy_count       = 0;
+  int allocate_count   = 0;
+  int deallocate_count = 0;
+  int equal_to_count   = 0;
+  int new_count        = 0;
+  int delete_count     = 0;
+
+  friend std::ostream& operator<<(std::ostream& os, const Counts& counts)
+  {
+    return os
+        << "object: " << counts.object_count << ", " //
+        << "move: " << counts.move_count << ", " //
+        << "copy: " << counts.copy_count << ", " //
+        << "allocate: " << counts.allocate_count << ", " //
+        << "deallocate: " << counts.deallocate_count << ", " //
+        << "equal_to: " << counts.equal_to_count << ", " //
+        << "new: " << counts.new_count << ", " //
+        << "delete: " << counts.delete_count;
+  }
+
+  friend bool operator==(const Counts& lhs, const Counts& rhs) noexcept
+  {
+    return lhs.object_count == rhs.object_count && //
+           lhs.move_count == rhs.move_count && //
+           lhs.copy_count == rhs.copy_count && //
+           lhs.allocate_count == rhs.allocate_count && //
+           lhs.deallocate_count == rhs.deallocate_count && //
+           lhs.equal_to_count == rhs.equal_to_count && //
+           lhs.new_count == rhs.new_count && //
+           lhs.delete_count == rhs.delete_count; //
+  }
+
+  friend bool operator!=(const Counts& lhs, const Counts& rhs) noexcept
+  {
+    return !(lhs == rhs);
+  }
+};
+
+struct test_fixture_
+{
+  Counts counts;
+  size_t bytes_ = 0;
+  size_t align_ = 0;
+  static thread_local Counts* counts_;
+
+  test_fixture_() noexcept
+      : counts()
+  {
+    counts_ = &counts;
+  }
+
+  size_t bytes(size_t sz) noexcept
+  {
+    bytes_ = sz;
+    return bytes_;
+  }
+
+  size_t align(size_t align) noexcept
+  {
+    align_ = align;
+    return align_;
+  }
+};
+
+inline thread_local Counts* test_fixture_::counts_ = nullptr;
+
+template <class>
+using test_fixture = test_fixture_;
+
+template <class T>
+struct test_resource
+{
+  int data;
+  test_fixture_* fixture;
+  T cookie[2] = {0xDEADBEEF, 0xDEADBEEF};
+
+  explicit test_resource(int i, test_fixture_* fix) noexcept
+      : data(i)
+      , fixture(fix)
+  {
+    ++fixture->counts.object_count;
+  }
+
+  test_resource(test_resource&& other) noexcept
+      : data(other.data)
+      , fixture(other.fixture)
+  {
+    other._assert_valid();
+    ++fixture->counts.move_count;
+    ++fixture->counts.object_count;
+    other.cookie[0] = other.cookie[1] = 0x0C07FEFE;
+  }
+
+  test_resource(const test_resource& other) noexcept
+      : data(other.data)
+      , fixture(other.fixture)
+  {
+    other._assert_valid();
+    ++fixture->counts.copy_count;
+    ++fixture->counts.object_count;
+  }
+
+  ~test_resource()
+  {
+    --fixture->counts.object_count;
+  }
+
+  void* allocate(std::size_t bytes, std::size_t align)
+  {
+    _assert_valid();
+    CHECK(bytes == fixture->bytes_);
+    CHECK(align == fixture->align_);
+    ++fixture->counts.allocate_count;
+    return fixture;
+  }
+
+  void deallocate(void* ptr, std::size_t bytes, std::size_t align) noexcept
+  {
+    _assert_valid();
+    CHECK(ptr == fixture);
+    CHECK(bytes == fixture->bytes_);
+    CHECK(align == fixture->align_);
+    ++fixture->counts.deallocate_count;
+    return;
+  }
+
+  friend bool operator==(const test_resource& lhs, const test_resource& rhs)
+  {
+    lhs._assert_valid();
+    rhs._assert_valid();
+    ++lhs.fixture->counts.equal_to_count;
+    return lhs.data == rhs.data;
+  }
+
+  friend bool operator!=(const test_resource& lhs, const test_resource& rhs)
+  {
+    FAIL("any_resource should only be calling operator==");
+    return lhs.data != rhs.data;
+  }
+
+  void _assert_valid() const noexcept
+  {
+    REQUIRE(cookie[0] == 0xDEADBEEF);
+    REQUIRE(cookie[1] == 0xDEADBEEF);
+  }
+
+  static void* operator new(::cuda::std::size_t size)
+  {
+    ++test_fixture_::counts_->new_count;
+    return ::operator new(size);
+  }
+
+  static void operator delete(void* pv) noexcept
+  {
+    ++test_fixture_::counts_->delete_count;
+    return ::operator delete(pv);
+  }
+};
+
+using big_resource   = test_resource<uintptr_t>;
+using small_resource = test_resource<unsigned int>;
+
+static_assert(sizeof(big_resource) > sizeof(cuda::mr::_AnyResourceStorage));
+static_assert(sizeof(small_resource) <= sizeof(cuda::mr::_AnyResourceStorage));
diff --git a/docs/cudax/memory_resource.rst b/docs/cudax/memory_resource.rst
index 580fe6cd23..63736806e9 100644
--- a/docs/cudax/memory_resource.rst
+++ b/docs/cudax/memory_resource.rst
@@ -12,6 +12,7 @@ Memory Resources
    ${repo_docs_api_path}/struct*async__memory__pool__properties*
    ${repo_docs_api_path}/class*async__memory__pool*
    ${repo_docs_api_path}/class*async__memory__resource*
+   ${repo_docs_api_path}/*shared__resource*
 
 The ``<cuda/experimental/memory_resource.cuh>`` header provides:
    -  :ref:`any_resource <cudax-memory-resource-any-resource>` and
@@ -22,6 +23,9 @@ The ``<cuda/experimental/memory_resource.cuh>`` header provides:
       *stream-ordered* memory allocation tailored to the needs of CUDA C++ developers. This design builds off of the
       success of the `RAPIDS Memory Manager (RMM) <https://github.com/rapidsai/rmm>`__ project and evolves the design
       based on lessons learned.
+   -  :ref:`shared_resource <cudax-memory-resource-shared-resource>` a reference counted wrapper around a memory resource.
+      In contrast to :ref:`any_resource <cudax-memory-resource-any-resource>` it additionally provides shared ownership
+      semantics.
 
 ``<cuda/experimental/memory_resource.cuh>`` is not intended to replace RMM, but instead moves the definition of the
 memory allocation interface to a more centralized home in CCCL. RMM will remain as a collection of implementations of

From 5e14128f6b6e1cd522e618ad36782449c01158a9 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Thu, 19 Sep 2024 19:38:39 +0200
Subject: [PATCH 2/3] Increase the libcu++ timeout (#2435)

* Increase the libcu++ timeout

We are frequently running into the current test duration limit of 01:20, so add another 20 minutes to the timeout.

---------

Co-authored-by: Allison Piper <alliepiper16@gmail.com>
---
 libcudacxx/test/libcudacxx/CMakeLists.txt | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/libcudacxx/test/libcudacxx/CMakeLists.txt b/libcudacxx/test/libcudacxx/CMakeLists.txt
index b699b4b89f..605c98e888 100644
--- a/libcudacxx/test/libcudacxx/CMakeLists.txt
+++ b/libcudacxx/test/libcudacxx/CMakeLists.txt
@@ -112,7 +112,8 @@ if (NOT LIBCUDACXX_TEST_WITH_NVRTC)
   add_custom_target(libcudacxx.test.lit.precompile
     DEPENDS libcudacxx.test.public_headers libcudacxx.test.internal_headers libcudacxx.test.public_headers_host_only
     COMMAND "${CMAKE_COMMAND}" -E env "LIBCUDACXX_SITE_CONFIG=${lit_site_cfg_path}"
-    "${libcudacxx_LIT}" -vv --no-progress-bar ${libcudacxx_LIT_FLAGS} "-Dexecutor=\"NoopExecutor()\"" "${libcudacxx_SOURCE_DIR}/test/libcudacxx"
+    "${libcudacxx_LIT}" -vv --no-progress-bar --time-tests ${libcudacxx_LIT_FLAGS}
+    "-Dexecutor=\"NoopExecutor()\"" "${libcudacxx_SOURCE_DIR}/test/libcudacxx"
   )
 
 endif()
@@ -125,12 +126,11 @@ set(libcudacxx_LIT_PARALLEL_LEVEL 8 CACHE STRING
 add_test(NAME libcudacxx.test.lit COMMAND
   "${CMAKE_COMMAND}" -E env
     "LIBCUDACXX_SITE_CONFIG=${lit_site_cfg_path}"
-  "${libcudacxx_LIT}" -vv --no-progress-bar ${libcudacxx_LIT_FLAGS}
-    -j "${libcudacxx_LIT_PARALLEL_LEVEL}"
-      "${libcudacxx_SOURCE_DIR}/test/libcudacxx"
+  "${libcudacxx_LIT}" -vv --no-progress-bar --time-tests ${libcudacxx_LIT_FLAGS}
+    -j "${libcudacxx_LIT_PARALLEL_LEVEL}" "${libcudacxx_SOURCE_DIR}/test/libcudacxx"
 )
 
 set_tests_properties(libcudacxx.test.lit PROPERTIES
-  TIMEOUT 4800
+  TIMEOUT 6000 # seconds: the previous 4800 s (01:20) limit plus another 20 minutes
   RUN_SERIAL TRUE
 )

From 2fe09c8399bc1ea52d9681ad084ff87a6eaa3f79 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 19 Sep 2024 11:18:23 -0700
Subject: [PATCH 3/3] Move c/include/cccl/*.h files to c/include/cccl/c/*.h
 (#2428)

* Move c/include/cccl/*.h files to c/include/cccl/c/*.h

* Change `#warning` to `#error` (to improve the user experience).

* Add comments to preprocessor conditionals.

Co-authored-by: Michael Schellenberger Costa <miscco@nvidia.com>

* Add comments to preprocessor conditionals.

Co-authored-by: Michael Schellenberger Costa <miscco@nvidia.com>

* Add comment to preprocessor conditional.

---------

Co-authored-by: Michael Schellenberger Costa <miscco@nvidia.com>
Co-authored-by: Allison Piper <alliepiper16@gmail.com>
---
 c/include/cccl/{ => c}/reduce.h | 10 ++++------
 c/include/cccl/{ => c}/types.h  | 16 +++++++---------
 c/src/reduce.cu                 |  2 +-
 c/test/c2h.h                    |  2 +-
 4 files changed, 13 insertions(+), 17 deletions(-)
 rename c/include/cccl/{ => c}/reduce.h (85%)
 rename c/include/cccl/{ => c}/types.h (78%)

diff --git a/c/include/cccl/reduce.h b/c/include/cccl/c/reduce.h
similarity index 85%
rename from c/include/cccl/reduce.h
rename to c/include/cccl/c/reduce.h
index 5047625a85..1da7b51f01 100644
--- a/c/include/cccl/reduce.h
+++ b/c/include/cccl/c/reduce.h
@@ -11,12 +11,12 @@
 #pragma once
 
 #ifndef CCCL_C_EXPERIMENTAL
-#  warning "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this warning."
-#else // ^^^ !CCCL_C_EXPERIMENTAL ^^^ / vvv CCCL_C_EXPERIMENTAL vvv
+#  error "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this notice."
+#endif // !CCCL_C_EXPERIMENTAL
 
-#  include <cuda.h>
+#include <cuda.h>
 
-#  include <cccl/types.h>
+#include <cccl/c/types.h>
 
 struct cccl_device_reduce_build_result_t
 {
@@ -55,5 +55,3 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce(
   CUstream stream) noexcept;
 
 extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr);
-
-#endif // CCCL_C_EXPERIMENTAL
diff --git a/c/include/cccl/types.h b/c/include/cccl/c/types.h
similarity index 78%
rename from c/include/cccl/types.h
rename to c/include/cccl/c/types.h
index 781b9f9ea6..f34466c755 100644
--- a/c/include/cccl/types.h
+++ b/c/include/cccl/c/types.h
@@ -11,14 +11,14 @@
 #pragma once
 
 #ifndef CCCL_C_EXPERIMENTAL
-#  warning "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this warning."
-#else // ^^^ !CCCL_C_EXPERIMENTAL ^^^ / vvv CCCL_C_EXPERIMENTAL vvv
+#  error "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this notice."
+#endif // !CCCL_C_EXPERIMENTAL
 
-#  if defined(_WIN32)
-#    define CCCL_C_API __declspec(dllexport)
-#  else
-#    define CCCL_C_API __attribute__((visibility("default")))
-#  endif
+#if defined(_WIN32)
+#  define CCCL_C_API __declspec(dllexport)
+#else // ^^^ _WIN32 ^^^ / vvv !_WIN32 vvv
+#  define CCCL_C_API __attribute__((visibility("default")))
+#endif // !_WIN32
 
 enum class cccl_type_enum
 {
@@ -81,5 +81,3 @@ struct cccl_iterator_t
   cccl_type_info value_type;
   void* state;
 };
-
-#endif // CCCL_C_EXPERIMENTAL
diff --git a/c/src/reduce.cu b/c/src/reduce.cu
index 4badcd1ff0..fc88dd31a9 100644
--- a/c/src/reduce.cu
+++ b/c/src/reduce.cu
@@ -19,7 +19,7 @@
 #include <iostream>
 #include <memory>
 
-#include <cccl/reduce.h>
+#include <cccl/c/reduce.h>
 #include <nvJitLink.h>
 #include <nvrtc.h>
 
diff --git a/c/test/c2h.h b/c/test/c2h.h
index e2b26895a8..e044d2e17a 100644
--- a/c/test/c2h.h
+++ b/c/test/c2h.h
@@ -23,7 +23,7 @@
 #include <vector>
 
 #include <catch2/catch.hpp>
-#include <cccl/reduce.h>
+#include <cccl/c/reduce.h>
 #include <nvrtc.h>
 
 static std::string inspect_sass(const void* cubin, size_t cubin_size)