From 4a32b1c28ba24dcfe56ef7bbe7f3990e06dd1f9a Mon Sep 17 00:00:00 2001
From: Eric Niebler <eniebler@nvidia.com>
Date: Fri, 6 Sep 2024 07:22:50 -0700
Subject: [PATCH] [CUDAX] add a small c++17 implementation of `std::execution`
 (aka P2300) (#2301)

---
 .clang-format                                 |   3 +
 cudax/cmake/cudaxHeaderTesting.cmake          |   6 +
 cudax/cmake/header_test.in.cu                 |   4 +-
 .../cuda/experimental/__async/async.cuh       |  48 ++
 .../experimental/__async/basic_sender.cuh     | 255 ++++++
 .../__async/completion_signatures.cuh         | 336 ++++++++
 .../cuda/experimental/__async/conditional.cuh | 238 ++++++
 .../cuda/experimental/__async/config.cuh      |  46 ++
 .../cuda/experimental/__async/continue_on.cuh | 288 +++++++
 .../cuda/experimental/__async/cpos.cuh        | 220 +++++
 .../include/cuda/experimental/__async/env.cuh | 192 +++++
 .../cuda/experimental/__async/epilogue.cuh    |  16 +
 .../cuda/experimental/__async/exception.cuh   |  46 ++
 .../cuda/experimental/__async/fwd_rcvr.cuh    |  71 ++
 .../cuda/experimental/__async/just.cuh        | 134 ++++
 .../cuda/experimental/__async/just_from.cuh   | 163 ++++
 .../cuda/experimental/__async/lazy.cuh        | 158 ++++
 .../cuda/experimental/__async/let_value.cuh   | 326 ++++++++
 .../cuda/experimental/__async/meta.cuh        | 753 ++++++++++++++++++
 .../cuda/experimental/__async/prologue.cuh    |  20 +
 .../cuda/experimental/__async/queries.cuh     | 167 ++++
 .../cuda/experimental/__async/rcvr_ref.cuh    |  51 ++
 .../experimental/__async/rcvr_with_env.cuh    | 141 ++++
 .../cuda/experimental/__async/read_env.cuh    | 155 ++++
 .../cuda/experimental/__async/run_loop.cuh    | 274 +++++++
 .../cuda/experimental/__async/sequence.cuh    | 151 ++++
 .../experimental/__async/start_detached.cuh   | 104 +++
 .../cuda/experimental/__async/start_on.cuh    | 150 ++++
 .../cuda/experimental/__async/stop_token.cuh  | 488 ++++++++++++
 .../cuda/experimental/__async/sync_wait.cuh   | 207 +++++
 .../cuda/experimental/__async/then.cuh        | 303 +++++++
 .../cuda/experimental/__async/thread.cuh      |  85 ++
 .../experimental/__async/thread_context.cuh   |  73 ++
 .../cuda/experimental/__async/tuple.cuh       | 104 +++
 .../cuda/experimental/__async/type_traits.cuh | 258 ++++++
 .../cuda/experimental/__async/utility.cuh     | 208 +++++
 .../cuda/experimental/__async/variant.cuh     | 192 +++++
 .../cuda/experimental/__async/when_all.cuh    | 650 +++++++++++++++
 .../cuda/experimental/__async/write_env.cuh   | 118 +++
 cudax/test/CMakeLists.txt                     |  12 +
 cudax/test/async/common/checked_receiver.cuh  | 125 +++
 cudax/test/async/common/error_scheduler.cuh   | 101 +++
 cudax/test/async/common/impulse_scheduler.cuh | 200 +++++
 cudax/test/async/common/inline_scheduler.cuh  |  81 ++
 cudax/test/async/common/stopped_scheduler.cuh |  85 ++
 cudax/test/async/common/utility.cuh           | 189 +++++
 cudax/test/async/test_conditional.cu          |  61 ++
 cudax/test/async/test_continue_on.cu          | 220 +++++
 cudax/test/async/test_just.cu                 |  18 +
 cudax/test/async/test_sequence.cu             |  54 ++
 cudax/test/async/test_when_all.cu             | 265 ++++++
 cudax/test/common/testing.cuh                 |   7 +-
 52 files changed, 8618 insertions(+), 2 deletions(-)
 create mode 100644 cudax/include/cuda/experimental/__async/async.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/basic_sender.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/completion_signatures.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/conditional.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/config.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/continue_on.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/cpos.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/env.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/epilogue.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/exception.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/fwd_rcvr.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/just.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/just_from.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/lazy.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/let_value.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/meta.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/prologue.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/queries.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/rcvr_ref.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/rcvr_with_env.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/read_env.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/run_loop.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/sequence.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/start_detached.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/start_on.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/stop_token.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/sync_wait.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/then.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/thread.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/thread_context.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/tuple.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/type_traits.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/utility.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/variant.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/when_all.cuh
 create mode 100644 cudax/include/cuda/experimental/__async/write_env.cuh
 create mode 100755 cudax/test/async/common/checked_receiver.cuh
 create mode 100755 cudax/test/async/common/error_scheduler.cuh
 create mode 100755 cudax/test/async/common/impulse_scheduler.cuh
 create mode 100755 cudax/test/async/common/inline_scheduler.cuh
 create mode 100755 cudax/test/async/common/stopped_scheduler.cuh
 create mode 100755 cudax/test/async/common/utility.cuh
 create mode 100755 cudax/test/async/test_conditional.cu
 create mode 100755 cudax/test/async/test_continue_on.cu
 create mode 100755 cudax/test/async/test_just.cu
 create mode 100755 cudax/test/async/test_sequence.cu
 create mode 100755 cudax/test/async/test_when_all.cu

diff --git a/.clang-format b/.clang-format
index 974957913f..6cba3dca4b 100644
--- a/.clang-format
+++ b/.clang-format
@@ -90,6 +90,9 @@ IfMacros: [
 IndentWrappedFunctionNames: false
 IncludeBlocks:   Regroup
 IncludeCategories:
+  - Regex:           '^<cuda/experimental/__async/prologue.cuh>'
+    Priority:            0x7FFFFFFF
+    SortPriority:        0x7FFFFFFF
   - Regex:           '^<(cuda/std/detail/__config|cub/config.cuh|thrust/detail/config.h|thrust/system/cuda/config.h)'
     Priority:            0
     SortPriority:        0
diff --git a/cudax/cmake/cudaxHeaderTesting.cmake b/cudax/cmake/cudaxHeaderTesting.cmake
index 29a3bd58ca..824b1a4fda 100644
--- a/cudax/cmake/cudaxHeaderTesting.cmake
+++ b/cudax/cmake/cudaxHeaderTesting.cmake
@@ -14,6 +14,12 @@ file(GLOB_RECURSE headers
   "${cudax_SOURCE_DIR}/include/*.h"
 )
 
+# The following internal headers are not required to compile independently:
+list(REMOVE_ITEM headers
+  "cuda/experimental/__async/prologue.cuh"
+  "cuda/experimental/__async/epilogue.cuh"
+)
+
 set(headertest_srcs)
 foreach (header IN LISTS headers)
   set(headertest_src "headers/${header}.cu")
diff --git a/cudax/cmake/header_test.in.cu b/cudax/cmake/header_test.in.cu
index 771ca319db..fd2df1987d 100644
--- a/cudax/cmake/header_test.in.cu
+++ b/cudax/cmake/header_test.in.cu
@@ -34,7 +34,9 @@
 #define I CUDAX_MACRO_CHECK('I', complex.h)
 
 // windows.h conflicts
-#define small CUDAX_MACRO_CHECK('small', windows.h)
+// @eniebler 2024-08-30: This test is disabled because it causes build
+// failures in some configurations.
+// #define small CUDAX_MACRO_CHECK('small', windows.h)
 // We can't enable these checks without breaking some builds -- some standard
 // library implementations unconditionally `#undef` these macros, which then
 // causes random failures later.
diff --git a/cudax/include/cuda/experimental/__async/async.cuh b/cudax/include/cuda/experimental/__async/async.cuh
new file mode 100644
index 0000000000..ed53717bca
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/async.cuh
@@ -0,0 +1,48 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_ASYNC
+#define __CUDAX_ASYNC_DETAIL_ASYNC
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+// Include this first
+#include <cuda/experimental/__async/config.cuh>
+
+// Include the other implementation headers:
+#include <cuda/experimental/__async/basic_sender.cuh>
+#include <cuda/experimental/__async/conditional.cuh>
+#include <cuda/experimental/__async/continue_on.cuh>
+#include <cuda/experimental/__async/cpos.cuh>
+#include <cuda/experimental/__async/just.cuh>
+#include <cuda/experimental/__async/just_from.cuh>
+#include <cuda/experimental/__async/let_value.cuh>
+#include <cuda/experimental/__async/queries.cuh>
+#include <cuda/experimental/__async/read_env.cuh>
+#include <cuda/experimental/__async/run_loop.cuh>
+#include <cuda/experimental/__async/sequence.cuh>
+#include <cuda/experimental/__async/start_detached.cuh>
+#include <cuda/experimental/__async/start_on.cuh>
+#include <cuda/experimental/__async/stop_token.cuh>
+#include <cuda/experimental/__async/sync_wait.cuh>
+#include <cuda/experimental/__async/then.cuh>
+#include <cuda/experimental/__async/thread_context.cuh>
+#include <cuda/experimental/__async/when_all.cuh>
+#include <cuda/experimental/__async/write_env.cuh>
+
+#endif // __CUDAX_ASYNC_DETAIL_ASYNC
diff --git a/cudax/include/cuda/experimental/__async/basic_sender.cuh b/cudax/include/cuda/experimental/__async/basic_sender.cuh
new file mode 100644
index 0000000000..5730078ecc
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/basic_sender.cuh
@@ -0,0 +1,255 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_BASIC_SENDER
+#define __CUDAX_ASYNC_DETAIL_BASIC_SENDER
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__type_traits/is_same.h>
+
+#include <cuda/experimental/__async/completion_signatures.cuh>
+#include <cuda/experimental/__async/config.cuh>
+#include <cuda/experimental/__async/cpos.cuh>
+#include <cuda/experimental/__async/utility.cuh>
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+template <class _Data, class _Rcvr>
+struct __state // bundles an operation's data with the receiver it was connected to
+{
+  _Data __data_; // handed to the _Data::receiver_tag handlers as their first argument
+  _Rcvr __receiver_; // handed to the _Data::receiver_tag handlers as their second argument
+};
+
+struct receiver_defaults // pass-through handlers: each one forwards the completion to __rcvr
+{
+  using receiver_concept = __async::receiver_t;
+
+  template <class _Rcvr, class... _Args>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE static auto set_value(__ignore, _Rcvr& __rcvr, _Args&&... __args) noexcept
+    -> __async::completion_signatures<__async::set_value_t(_Args...)>
+  { // __rcvr is taken by lvalue reference but handed on as an rvalue
+    __async::set_value(static_cast<_Rcvr&&>(__rcvr), static_cast<_Args&&>(__args)...);
+    return {}; // the return *type* carries the completion signature; the value is an empty tag
+  }
+
+  template <class _Rcvr, class _Error>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE static auto set_error(__ignore, _Rcvr& __rcvr, _Error&& __error) noexcept
+    -> __async::completion_signatures<__async::set_error_t(_Error)>
+  {
+    __async::set_error(static_cast<_Rcvr&&>(__rcvr), static_cast<_Error&&>(__error));
+    return {};
+  }
+
+  template <class _Rcvr>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE static auto
+  set_stopped(__ignore, _Rcvr& __rcvr) noexcept -> __async::completion_signatures<__async::set_stopped_t()>
+  {
+    __async::set_stopped(static_cast<_Rcvr&&>(__rcvr));
+    return {};
+  }
+
+  template <class _Rcvr> // the first (data) argument is ignored; the receiver's own environment is returned
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE static decltype(auto) get_env(__ignore, const _Rcvr& __rcvr) noexcept
+  {
+    return __async::get_env(__rcvr);
+  }
+};
+
+template <class _Data, class _Rcvr>
+struct basic_receiver // receiver connected to the child sender; dispatches to _Data::receiver_tag
+{
+  using receiver_concept = __async::receiver_t;
+  using __rcvr_t         = typename _Data::receiver_tag;
+  __state<_Data, _Rcvr>& __state_; // non-owning: refers to the state held by __basic_opstate
+
+  template <class... _Args>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void set_value(_Args&&... __args) noexcept
+  { // the handler's return value (a signature tag) is intentionally discarded
+    __rcvr_t::set_value(__state_.__data_, __state_.__receiver_, static_cast<_Args&&>(__args)...);
+  }
+
+  template <class _Error>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void set_error(_Error&& __error) noexcept
+  {
+    __rcvr_t::set_error(__state_.__data_, __state_.__receiver_, static_cast<_Error&&>(__error));
+  }
+
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void set_stopped() noexcept
+  {
+    __rcvr_t::set_stopped(__state_.__data_, __state_.__receiver_);
+  }
+
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE decltype(auto) get_env() const noexcept
+  {
+    return __rcvr_t::get_env(__state_.__data_, __state_.__receiver_);
+  }
+};
+
+template <class _Rcvr> // true only when _Rcvr is exactly receiver_archetype
+_CCCL_INLINE_VAR constexpr bool has_no_environment = _CUDA_VSTD::is_same_v<_Rcvr, receiver_archetype>;
+
+template <bool _HasStopped, class _Data, class _Rcvr>
+struct __mk_completions // primary template: used as-is when _HasStopped is false
+{
+  using __rcvr_t = typename _Data::receiver_tag;
+
+  template <class... _Args> // handler result types are probed with receiver_archetype, not with _Rcvr
+  using __set_value_t =
+    decltype(+*__rcvr_t::set_value(__declval<_Data&>(), __declval<receiver_archetype&>(), __declval<_Args>()...));
+
+  template <class _Error>
+  using __set_error_t =
+    decltype(+*__rcvr_t::set_error(__declval<_Data&>(), __declval<receiver_archetype&>(), __declval<_Error>()));
+
+  using __set_stopped_t = __async::completion_signatures<>; // contributes no stopped completion
+};
+
+template <class _Data, class _Rcvr> // _HasStopped == true: reuse value/error aliases, replace __set_stopped_t
+struct __mk_completions<true, _Data, _Rcvr> : __mk_completions<false, _Data, _Rcvr>
+{
+  using __rcvr_t = typename _Data::receiver_tag;
+
+  using __set_stopped_t = decltype(+*__rcvr_t::set_stopped(__declval<_Data&>(), __declval<receiver_archetype&>()));
+};
+
+template <class...>
+using __ignore_value_signature = __async::completion_signatures<>; // maps every value completion to nothing
+
+template <class>
+using __ignore_error_signature = __async::completion_signatures<>; // maps every error completion to nothing
+
+template <class _Completions> // true when something remains after value/error completions are dropped
+_CCCL_INLINE_VAR constexpr bool __has_stopped =
+  !_CUDA_VSTD::is_same_v<__async::completion_signatures<>,
+                         __async::transform_completion_signatures<_Completions,
+                                                                  __async::completion_signatures<>,
+                                                                  __ignore_value_signature,
+                                                                  __ignore_error_signature>>;
+
+template <bool _PotentiallyThrowing, class _Rcvr> // NOTE(review): no _CCCL_HOST_DEVICE here - confirm host-only
+void set_current_exception_if([[maybe_unused]] _Rcvr& __rcvr) noexcept
+{
+  if constexpr (_PotentiallyThrowing) // discarded when false, which is why __rcvr is [[maybe_unused]]
+  {
+    __async::set_error(static_cast<_Rcvr&&>(__rcvr), ::std::current_exception());
+  }
+}
+
+// Operation state that owns the {data, receiver} pair and the child operation obtained
+// by connecting _Sndr to a basic_receiver over that pair; `start` starts the child.
+template <class _Sndr, class _Data, class _Rcvr>
+struct __basic_opstate
+{
+  using __rcvr_t        = basic_receiver<_Data, _Rcvr>;
+  using __completions_t = completion_signatures_of_t<_Sndr, __rcvr_t>;
+  using __traits_t      = __mk_completions<__has_stopped<__completions_t>, _Data, _Rcvr>;
+
+  using completion_signatures = //
+    transform_completion_signatures<__completions_t,
+                                    // TODO: add set_error_t(exception_ptr) if constructing
+                                    // the state or connecting the sender is potentially throwing.
+                                    __async::completion_signatures<>,
+                                    __traits_t::template __set_value_t,
+                                    __traits_t::template __set_error_t,
+                                    typename __traits_t::__set_stopped_t>;
+
+  _CCCL_HOST_DEVICE __basic_opstate(_Sndr&& __sndr, _Data __data, _Rcvr __rcvr)
+      : __state_{static_cast<_Data&&>(__data), static_cast<_Rcvr&&>(__rcvr)}
+      , __op_(__async::connect(static_cast<_Sndr&&>(__sndr), __rcvr_t{__state_})) // receiver refers to __state_
+  {}
+
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void start() noexcept
+  {
+    __async::start(__op_);
+  }
+
+  __state<_Data, _Rcvr> __state_; // must be declared before __op_: __op_'s receiver holds a reference to it
+  __async::connect_result_t<_Sndr, __rcvr_t> __op_;
+};
+
+template <class _Sndr, class _Rcvr> // both arguments are taken by value and moved into the returned state
+_CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto __make_opstate(_Sndr __sndr, _Rcvr __rcvr)
+{
+  auto [__tag, __data, __child] = static_cast<_Sndr&&>(__sndr); // decompose the sender into tag, data and child
+  using __data_t                = decltype(__data);
+  using __child_t               = decltype(__child);
+  (void) __tag; // the tag is not needed to build the operation state
+  return __basic_opstate(
+    static_cast<__child_t&&>(__child), static_cast<__data_t&&>(__data), static_cast<_Rcvr&&>(__rcvr));
+}
+
+template <class _Data, class... _Sndrs> // preferred overload: viable only if __data.get_attrs(__sndrs...) is valid
+_CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto
+__get_attrs(int, const _Data& __data, const _Sndrs&... __sndrs) noexcept -> decltype(__data.get_attrs(__sndrs...))
+{
+  return __data.get_attrs(__sndrs...);
+}
+
+template <class _Data, class... _Sndrs> // fallback: `long` ranks below `int` for the literal 0 that get_env passes
+_CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto
+__get_attrs(long, const _Data&, const _Sndrs&... __sndrs) noexcept -> decltype(__async::get_env(__sndrs...))
+{
+  return __async::get_env(__sndrs...);
+}
+
+template <class _Data, class... _Sndrs> // only the single-child specialization is defined in this header
+struct basic_sender;
+
+template <class _Data, class _Sndr>
+struct basic_sender<_Data, _Sndr>
+{
+  using sender_concept = __async::sender_t;
+  using __tag_t        = typename _Data::sender_tag;
+  using __rcvr_t       = typename _Data::receiver_tag;
+
+  _CCCL_NO_UNIQUE_ADDRESS __tag_t __tag_;
+  _Data __data_;
+  _Sndr __sndr_;
+
+  // Connect the sender to the receiver (the continuation) and
+  // return the operation state built by __make_opstate.
+  template <class _Rcvr> // rvalue overload: *this is moved into __make_opstate
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto connect(_Rcvr __rcvr) &&
+  {
+    return __make_opstate(static_cast<basic_sender&&>(*this), static_cast<_Rcvr&&>(__rcvr));
+  }
+
+  template <class _Rcvr> // const lvalue overload: __make_opstate takes its sender by value, so *this is copied
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto connect(_Rcvr __rcvr) const&
+  {
+    return __make_opstate(*this, static_cast<_Rcvr&&>(__rcvr));
+  }
+
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE decltype(auto) get_env() const noexcept
+  {
+    return __async::__get_attrs(0, __data_, __sndr_); // 0 selects the `int` overload when it is viable
+  }
+};
+
+template <class _Data, class... _Sndrs> // the first (tag) argument does not take part in the deduced type
+basic_sender(__ignore, _Data, _Sndrs...) -> basic_sender<_Data, _Sndrs...>;
+
+} // namespace cuda::experimental::__async
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/completion_signatures.cuh b/cudax/include/cuda/experimental/__async/completion_signatures.cuh
new file mode 100644
index 0000000000..c4edf4b618
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/completion_signatures.cuh
@@ -0,0 +1,336 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_COMPLETION_SIGNATURES
+#define __CUDAX_ASYNC_DETAIL_COMPLETION_SIGNATURES
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__type_traits/conditional.h>
+
+#include <cuda/experimental/__async/cpos.cuh>
+#include <cuda/experimental/__async/exception.cuh>
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+// An empty tag type used as a typelist of completion signatures
+template <class... _Ts>
+struct completion_signatures
+{};
+
+// True only for the function types set_value_t(Ts...), set_error_t(E) and set_stopped_t()
+template <class>
+_CCCL_INLINE_VAR constexpr bool __is_valid_signature = false;
+
+template <class... _Ts>
+_CCCL_INLINE_VAR constexpr bool __is_valid_signature<set_value_t(_Ts...)> = true;
+
+template <class _Error>
+_CCCL_INLINE_VAR constexpr bool __is_valid_signature<set_error_t(_Error)> = true;
+
+template <>
+_CCCL_INLINE_VAR constexpr bool __is_valid_signature<set_stopped_t()> = true;
+
+// The implementation of transform_completion_signatures starts here
+template <class _Sig, template <class...> class _Vy, template <class...> class _Ey, class _Sy>
+extern __undefined<_Sig> __transform_sig;
+
+template <class... _Values, template <class...> class _Vy, template <class...> class _Ey, class _Sy>
+extern __fn_t<_Vy<_Values...>>* __transform_sig<set_value_t(_Values...), _Vy, _Ey, _Sy>;
+
+template <class _Error, template <class...> class _Vy, template <class...> class _Ey, class _Sy>
+extern __fn_t<_Ey<_Error>>* __transform_sig<set_error_t(_Error), _Vy, _Ey, _Sy>;
+
+template <template <class...> class _Vy, template <class...> class _Ey, class _Sy>
+extern __fn_t<_Sy>* __transform_sig<set_stopped_t(), _Vy, _Ey, _Sy>;
+
+template <class _Sig, template <class...> class _Vy, template <class...> class _Ey, class _Sy>
+using __transform_sig_t = decltype(__transform_sig<_Sig, _Vy, _Ey, _Sy>());
+
+template <class _Sigs,
+          template <class...>
+          class _Vy,
+          template <class...>
+          class _Ey,
+          class _Sy,
+          template <class...>
+          class _Variant,
+          class... _More>
+extern _DIAGNOSTIC<_Sigs> __transform_completion_signatures_v;
+
+template <class... _What,
+          template <class...>
+          class _Vy,
+          template <class...>
+          class _Ey,
+          class _Sy,
+          template <class...>
+          class _Variant,
+          class... _More>
+extern __fn_t<_ERROR<_What...>>*
+  __transform_completion_signatures_v<_ERROR<_What...>, _Vy, _Ey, _Sy, _Variant, _More...>;
+
+template <class... _Sigs,
+          template <class...>
+          class _Vy,
+          template <class...>
+          class _Ey,
+          class _Sy,
+          template <class...>
+          class _Variant,
+          class... _More>
+extern __fn_t<_Variant<__transform_sig_t<_Sigs, _Vy, _Ey, _Sy>..., _More...>>*
+  __transform_completion_signatures_v<completion_signatures<_Sigs...>, _Vy, _Ey, _Sy, _Variant, _More...>;
+
+template <class _Sigs,
+          template <class...>
+          class _Vy,
+          template <class...>
+          class _Ey,
+          class _Sy,
+          template <class...>
+          class _Variant,
+          class... _More>
+using __transform_completion_signatures =
+  decltype(__transform_completion_signatures_v<_Sigs, _Vy, _Ey, _Sy, _Variant, _More...>());
+
+template <class _WantedTag>
+struct __gather_sigs_fn;
+
+template <>
+struct __gather_sigs_fn<set_value_t>
+{
+  template <class _Sigs,
+            template <class...>
+            class _Then,
+            template <class...>
+            class _Else,
+            template <class...>
+            class _Variant,
+            class... _More>
+  using __f = __transform_completion_signatures<
+    _Sigs,
+    _Then,
+    __mbind_front_q<_Else, set_error_t>::template __f,
+    _Else<set_stopped_t>,
+    _Variant,
+    _More...>;
+};
+
+template <>
+struct __gather_sigs_fn<set_error_t>
+{
+  template <class _Sigs,
+            template <class...>
+            class _Then,
+            template <class...>
+            class _Else,
+            template <class...>
+            class _Variant,
+            class... _More>
+  using __f = __transform_completion_signatures<
+    _Sigs,
+    __mbind_front_q<_Else, set_value_t>::template __f,
+    _Then,
+    _Else<set_stopped_t>,
+    _Variant,
+    _More...>;
+};
+
+template <>
+struct __gather_sigs_fn<set_stopped_t>
+{
+  template <class _Sigs,
+            template <class...>
+            class _Then,
+            template <class...>
+            class _Else,
+            template <class...>
+            class _Variant,
+            class... _More>
+  using __f = __transform_completion_signatures<
+    _Sigs,
+    __mbind_front_q<_Else, set_value_t>::template __f,
+    __mbind_front_q<_Else, set_error_t>::template __f,
+    _Then<>,
+    _Variant,
+    _More...>;
+};
+
+template <class _Sigs,
+          class _WantedTag,
+          template <class...>
+          class _Then,
+          template <class...>
+          class _Else,
+          template <class...>
+          class _Variant,
+          class... _More>
+using __gather_completion_signatures =
+  typename __gather_sigs_fn<_WantedTag>::template __f<_Sigs, _Then, _Else, _Variant, _More...>;
+
+template <class... _Ts>
+using __set_value_transform_t = completion_signatures<set_value_t(_Ts...)>;
+
+template <class _Ty>
+using __set_error_transform_t = completion_signatures<set_error_t(_Ty)>;
+
+template <class... _Ts, class... _Us>
+auto operator*(__mset<_Ts...>&, __undefined<completion_signatures<_Us...>>&) -> __mset_insert<__mset<_Ts...>, _Us...>&;
+
+template <class... _Ts, class... _What>
+auto operator*(__mset<_Ts...>&, __undefined<_ERROR<_What...>>&) -> _ERROR<_What...>&;
+
+template <class... _What, class... _Us>
+auto operator*(_ERROR<_What...>&, __undefined<completion_signatures<_Us...>>&) -> _ERROR<_What...>&;
+
+template <class... _Sigs>
+using __concat_completion_signatures = //
+  __mapply_q<completion_signatures, __mconcat_into_q<__mmake_set>::__f<_Sigs...>>;
+
+template <class _Tag, class... _Ts>
+using __default_completions = completion_signatures<_Tag(_Ts...)>;
+
+template <class _Sigs,
+          class _MoreSigs                           = completion_signatures<>,
+          template <class...> class _ValueTransform = __set_value_transform_t,
+          template <class> class _ErrorTransform    = __set_error_transform_t,
+          class _StoppedSigs                        = completion_signatures<set_stopped_t()>>
+using transform_completion_signatures = //
+  __transform_completion_signatures<_Sigs,
+                                    _ValueTransform,
+                                    _ErrorTransform,
+                                    _StoppedSigs,
+                                    __mtry_quote<__concat_completion_signatures>::__f,
+                                    _MoreSigs>;
+
+template <class _Sndr,
+          class _Rcvr,
+          class _MoreSigs                           = completion_signatures<>,
+          template <class...> class _ValueTransform = __set_value_transform_t,
+          template <class> class _ErrorTransform    = __set_error_transform_t,
+          class _StoppedSigs                        = completion_signatures<set_stopped_t()>>
+using transform_completion_signatures_of = //
+  transform_completion_signatures<completion_signatures_of_t<_Sndr, _Rcvr>,
+                                  _MoreSigs,
+                                  _ValueTransform,
+                                  _ErrorTransform,
+                                  _StoppedSigs>;
+
+template <class _Sigs,
+          template <class...>
+          class _Tuple,
+          template <class...>
+          class _Variant>
+using __value_types = //
+  __transform_completion_signatures<_Sigs,
+                                    __mcompose_q<__mlist, _Tuple>::template __f,
+                                    __malways<__mlist<>>::__f,
+                                    __mlist<>,
+                                    __mconcat_into_q<_Variant>::template __f>;
+
+template <class _Sndr, class _Rcvr, template <class...> class _Tuple, template <class...> class _Variant>
+using value_types_of_t =
+  __value_types<completion_signatures_of_t<_Sndr, _Rcvr>, _Tuple, __mtry_quote<_Variant>::template __f>;
+
+template <class _Sigs,
+          template <class...>
+          class _Variant>
+using __error_types = //
+  __transform_completion_signatures<_Sigs,
+                                    __malways<__mlist<>>::__f,
+                                    __mlist,
+                                    __mlist<>,
+                                    __mconcat_into_q<_Variant>::template __f>;
+
+template <class _Sndr, class _Rcvr, template <class...> class _Variant>
+using error_types_of_t = __error_types<completion_signatures_of_t<_Sndr, _Rcvr>, _Variant>;
+
+template <class _Sigs>
+_CCCL_INLINE_VAR constexpr bool __sends_stopped = //
+  __transform_completion_signatures<_Sigs, __malways<__mfalse>::__f, __malways<__mfalse>::__f, __mtrue, __mor>::__value;
+
+template <class _Sndr, class _Rcvr = receiver_archetype>
+_CCCL_INLINE_VAR constexpr bool sends_stopped = //
+  __sends_stopped<completion_signatures_of_t<_Sndr, _Rcvr>>;
+
+using __eptr_completion = completion_signatures<set_error_t(::std::exception_ptr)>;
+
+template <bool _NoExcept> // empty list when _NoExcept is true, otherwise set_error_t(::std::exception_ptr)
+using __eptr_completion_if = _CUDA_VSTD::_If<_NoExcept, completion_signatures<>, __eptr_completion>;
+
+template <class>
+_CCCL_INLINE_VAR constexpr bool __is_completion_signatures = false;
+
+template <class... _Sigs>
+_CCCL_INLINE_VAR constexpr bool __is_completion_signatures<completion_signatures<_Sigs...>> = true;
+
+template <class _Sndr>
+using __is_non_dependent_detail_ = //
+  __mif<__is_completion_signatures<completion_signatures_of_t<_Sndr>>>;
+
+template <class _Sndr>
+_CCCL_INLINE_VAR constexpr bool __is_non_dependent_sender = __mvalid_q<__is_non_dependent_detail_, _Sndr>;
+
+namespace __csig
+{
+struct __dep
+{};
+
+template <class... _Sigs>
+struct __sigs;
+
+template <class... _As, class... _Bs>
+auto operator+(__sigs<_As...>&, __sigs<_Bs...>&) -> __sigs<_As..., _Bs...>&;
+
+template <class... _Sigs>
+auto operator+(__sigs<_Sigs...>&) //
+  -> __concat_completion_signatures<completion_signatures<_Sigs...>>;
+
+template <class _Other>
+auto __to_sigs(_Other&) -> _Other&;
+
+template <class... _Sigs>
+auto __to_sigs(completion_signatures<_Sigs...>&) -> __sigs<_Sigs...>&;
+} // namespace __csig
+
+using dependent_completions = __csig::__dep;
+
+namespace meta
+{
+template <class... _Sigs>
+using sigs = __csig::__sigs<_Sigs...>*;
+
+template <class _Tag, class... _Args>
+auto completion(_Tag, _Args&&...) -> __csig::__sigs<_Tag(_Args...)>&;
+
+template <class _Sndr, class _Rcvr = receiver_archetype>
+auto completions_of(_Sndr&&,
+                    _Rcvr = {}) -> decltype(__csig::__to_sigs(__declval<completion_signatures_of_t<_Sndr, _Rcvr>&>()));
+
+template <bool _PotentiallyThrowing>
+auto eptr_completion_if()
+  -> _CUDA_VSTD::_If<_PotentiallyThrowing, __csig::__sigs<set_error_t(::std::exception_ptr)>, __csig::__sigs<>>&;
+} // namespace meta
+} // namespace cuda::experimental::__async
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/conditional.cuh b/cudax/include/cuda/experimental/__async/conditional.cuh
new file mode 100644
index 0000000000..3a02e4eec2
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/conditional.cuh
@@ -0,0 +1,238 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_CONDITIONAL
+#define __CUDAX_ASYNC_DETAIL_CONDITIONAL
+
+#include <cuda/experimental/__async/completion_signatures.cuh>
+#include <cuda/experimental/__async/config.cuh>
+#include <cuda/experimental/__async/just_from.cuh>
+#include <cuda/experimental/__async/meta.cuh>
+#include <cuda/experimental/__async/type_traits.cuh>
+#include <cuda/experimental/__async/variant.cuh>
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+//! \file conditional.cuh
+//! This file defines the \c conditional sender. \c conditional is a sender that
+//! selects between two continuations based on the result of a predecessor. It
+//! accepts a predecessor, a predicate, and two continuations. It passes the
+//! result of the predecessor to the predicate. If the predicate returns \c true,
+//! the result is passed to the first continuation; otherwise, it is passed to
+//! the second continuation.
+//!
+//! By "continuation", we mean a so-called sender adaptor closure: a unary function
+//! that takes a sender and returns a new sender. The expression `then(f)` is an
+//! example of a continuation.
+
+namespace cuda::experimental::__async
+{
+struct __cond_t // function-object type of the `conditional` sender adaptor
+{
+  template <class _Pred, class _Then, class _Else>
+  struct __data // the predicate and the two continuations, held by value
+  {
+    _Pred __pred_;
+    _Then __then_;
+    _Else __else_;
+  };
+  // Returns a callable that feeds __args to a sink; __args is captured by reference and must outlive the callable.
+  template <class... _Args>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE static auto __mk_complete_fn(_Args&&... __args) noexcept
+  {
+    return [&](auto __sink) noexcept {
+      return __sink(static_cast<_Args&&>(__args)...);
+    };
+  }
+  // The sender type that just_from yields for such a callable over arguments of types _Args...
+  template <class... _Args>
+  using __just_from_t = decltype(just_from(__cond_t::__mk_complete_fn(__declval<_Args>()...)));
+  // Operation state. It is also the receiver of the predecessor: `this` is what gets connected to _Sndr.
+  template <class _Sndr, class _Rcvr, class _Pred, class _Then, class _Else>
+  struct __opstate
+  {
+    using operation_state_concept = operation_state_t;
+    // Environment lookup on the pointer-receiver: forwards to the downstream receiver's environment.
+    _CCCL_HOST_DEVICE friend env_of_t<_Rcvr> get_env(const __opstate* __self) noexcept
+    {
+      return get_env(__self->__rcvr_);
+    }
+    // For one set of predecessor values: the then-branch completions combined with the else-branch completions.
+    template <class... _Args>
+    using __value_t = //
+      transform_completion_signatures<
+        completion_signatures_of_t<__call_result_t<_Then, __just_from_t<_Args...>>, __rcvr_ref_t<_Rcvr&>>,
+        completion_signatures_of_t<__call_result_t<_Else, __just_from_t<_Args...>>, __rcvr_ref_t<_Rcvr&>>>;
+    // For one set of predecessor values: the operation-state types of the then- and else-branches.
+    template <class... _Args>
+    using __opstate_t = //
+      __mlist< //
+        connect_result_t<__call_result_t<_Then, __just_from_t<_Args...>>, __rcvr_ref_t<_Rcvr&>>,
+        connect_result_t<__call_result_t<_Else, __just_from_t<_Args...>>, __rcvr_ref_t<_Rcvr&>>>;
+    // A variant over the branch operation states, gathered across every value completion of the predecessor.
+    using __next_ops_variant_t = //
+      __value_types<completion_signatures_of_t<_Sndr, __opstate*>, __opstate_t, __mconcat_into_q<__variant>::__f>;
+    // The predecessor's completions, with each value completion replaced by its __value_t.
+    using completion_signatures = //
+      transform_completion_signatures_of<_Sndr, __opstate*, __async::completion_signatures<>, __value_t>;
+    // Takes ownership of the receiver and the callables, then connects the predecessor to `this`.
+    _CCCL_HOST_DEVICE __opstate(_Sndr&& __sndr, _Rcvr&& __rcvr, __data<_Pred, _Then, _Else>&& __data)
+        : __rcvr_{static_cast<_Rcvr&&>(__rcvr)}
+        , __data_{static_cast<__cond_t::__data<_Pred, _Then, _Else>&&>(__data)} // moved, not copied
+        , __op_{__async::connect(static_cast<_Sndr&&>(__sndr), this)}
+    {}
+    // Starts the predecessor; a branch is started later, from set_value.
+    _CCCL_HOST_DEVICE void start() noexcept
+    {
+      __async::start(__op_);
+    }
+    // Value completion of the predecessor: test the predicate, then connect and start the chosen branch in __ops_.
+    template <class... _Args>
+    _CCCL_HOST_DEVICE void set_value(_Args&&... __args) noexcept
+    {
+      if (static_cast<_Pred&&>(__data_.__pred_)(__args...))
+      {
+        auto& __op = __ops_.__emplace_from(
+          connect,
+          static_cast<_Then&&>(__data_.__then_)(just_from(__cond_t::__mk_complete_fn(static_cast<_Args&&>(__args)...))),
+          __rcvr_ref(__rcvr_));
+        __async::start(__op);
+      }
+      else
+      {
+        auto& __op = __ops_.__emplace_from(
+          connect,
+          static_cast<_Else&&>(__data_.__else_)(just_from(__cond_t::__mk_complete_fn(static_cast<_Args&&>(__args)...))),
+          __rcvr_ref(__rcvr_));
+        __async::start(__op);
+      }
+    }
+    // Error completions of the predecessor pass straight through to the downstream receiver.
+    template <class _Error>
+    _CCCL_HOST_DEVICE void set_error(_Error&& __error) noexcept
+    {
+      __async::set_error(static_cast<_Rcvr&&>(__rcvr_), static_cast<_Error&&>(__error));
+    }
+    // A stopped completion of the predecessor passes straight through as well.
+    _CCCL_HOST_DEVICE void set_stopped() noexcept
+    {
+      __async::set_stopped(static_cast<_Rcvr&&>(__rcvr_));
+    }
+    // Declaration order is initialization order: __op_ is connected only after __rcvr_ and __data_ are set.
+    _Rcvr __rcvr_;
+    __cond_t::__data<_Pred, _Then, _Else> __data_;
+    connect_result_t<_Sndr, __opstate*> __op_;
+    __next_ops_variant_t __ops_;
+  };
+  // The sender type; its definition follows this struct.
+  template <class _Sndr, class _Pred, class _Then, class _Else>
+  struct __sndr_t;
+  // What conditional(pred, then, else) returns; usable as closure(sndr) or as sndr | closure.
+  template <class _Pred, class _Then, class _Else>
+  struct __closure
+  {
+    __cond_t::__data<_Pred, _Then, _Else> __data_;
+    // Builds the sender; the out-of-line definition moves __data_ out of this closure.
+    template <class _Sndr>
+    _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto __mk_sender(_Sndr&& __sndr) //
+      -> __sndr_t<_Sndr, _Pred, _Then, _Else>;
+    // closure(sndr)
+    template <class _Sndr>
+    _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto operator()(_Sndr __sndr) //
+      -> __sndr_t<_Sndr, _Pred, _Then, _Else>
+    {
+      return __mk_sender(static_cast<_Sndr&&>(__sndr));
+    }
+    // sndr | closure, for rvalue closures only
+    template <class _Sndr>
+    _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE friend auto operator|(_Sndr __sndr, __closure&& __self) //
+      -> __sndr_t<_Sndr, _Pred, _Then, _Else>
+    {
+      return __self.__mk_sender(static_cast<_Sndr&&>(__sndr));
+    }
+  };
+  // conditional(sndr, pred, then, else): builds the sender directly; defined after __sndr_t.
+  template <class _Sndr, class _Pred, class _Then, class _Else>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto operator()(_Sndr __sndr, _Pred __pred, _Then __then, _Else __else) const //
+    -> __sndr_t<_Sndr, _Pred, _Then, _Else>;
+  // conditional(pred, then, else): packages the callables into a closure.
+  template <class _Pred, class _Then, class _Else>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto operator()(_Pred __pred, _Then __then, _Else __else) const
+  {
+    return __closure<_Pred, _Then, _Else>{
+      {static_cast<_Pred&&>(__pred), static_cast<_Then&&>(__then), static_cast<_Else&&>(__else)}};
+  }
+};
+
+template <class _Sndr, class _Pred, class _Then, class _Else>
+struct __cond_t::__sndr_t // the sender produced by `conditional`
+{
+  __cond_t __tag_;
+  __cond_t::__data<_Pred, _Then, _Else> __data_;
+  _Sndr __sndr_;
+  // Consuming connect: the predecessor and the callables are moved into the operation state.
+  template <class _Rcvr>
+  _CCCL_HOST_DEVICE auto connect(_Rcvr __rcvr) && -> __opstate<_Sndr, _Rcvr, _Pred, _Then, _Else>
+  {
+    return {static_cast<_Sndr&&>(__sndr_),
+            static_cast<_Rcvr&&>(__rcvr),
+            static_cast<__cond_t::__data<_Pred, _Then, _Else>&&>(__data_)};
+  }
+  // Non-consuming connect: __data_ is const here and cannot be moved from, so the operation state gets a copy.
+  template <class _Rcvr>
+  _CCCL_HOST_DEVICE auto connect(_Rcvr __rcvr) const& -> __opstate<_Sndr const&, _Rcvr, _Pred, _Then, _Else>
+  {
+    return {__sndr_, static_cast<_Rcvr&&>(__rcvr), __cond_t::__data<_Pred, _Then, _Else>(__data_)};
+  }
+  // The sender's environment is the predecessor's environment.
+  _CCCL_HOST_DEVICE env_of_t<_Sndr> get_env() const noexcept
+  {
+    return __async::get_env(__sndr_);
+  }
+};
+
+template <class _Sndr, class _Pred, class _Then, class _Else>
+_CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto
+__cond_t::operator()(_Sndr __sndr, _Pred __pred, _Then __then, _Else __else) const //
+  -> __sndr_t<_Sndr, _Pred, _Then, _Else>
+{
+  using __result_t = __sndr_t<_Sndr, _Pred, _Then, _Else>;
+  // When the predecessor's completions are known without a concrete receiver, compute the new
+  // sender's completions right away and require them to be a completion_signatures<...>.
+  if constexpr (__is_non_dependent_sender<_Sndr>)
+  {
+    static_assert(__is_completion_signatures<completion_signatures_of_t<__result_t>>);
+  }
+  return __result_t{{}, // __tag_
+                    {static_cast<_Pred&&>(__pred), static_cast<_Then&&>(__then), static_cast<_Else&&>(__else)},
+                    static_cast<_Sndr&&>(__sndr)};
+}
+
+template <class _Pred, class _Then, class _Else>
+template <class _Sndr>
+_CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto __cond_t::__closure<_Pred, _Then, _Else>::__mk_sender(_Sndr&& __sndr) //
+  -> __sndr_t<_Sndr, _Pred, _Then, _Else>
+{
+  using __result_t = __sndr_t<_Sndr, _Pred, _Then, _Else>;
+  // Validate the resulting completions eagerly when they do not depend on a concrete receiver.
+  if constexpr (__is_non_dependent_sender<_Sndr>)
+  {
+    static_assert(__is_completion_signatures<completion_signatures_of_t<__result_t>>);
+  }
+  // The closure's callables are moved (not copied) into the sender.
+  return __result_t{{}, static_cast<__cond_t::__data<_Pred, _Then, _Else>&&>(__data_), static_cast<_Sndr&&>(__sndr)};
+}
+
+_CCCL_GLOBAL_CONSTANT __cond_t conditional{}; // the `conditional` function object described at the top of this file
+} // namespace cuda::experimental::__async
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/config.cuh b/cudax/include/cuda/experimental/__async/config.cuh
new file mode 100644
index 0000000000..06cb16cca8
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/config.cuh
@@ -0,0 +1,46 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_CONFIG
+#define __CUDAX_ASYNC_DETAIL_CONFIG
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+namespace cuda::experimental::__async
+{
+}
+
+// Debuggers do not step into functions marked with __attribute__((__artificial__)).
+// This is useful for small wrapper functions that just dispatch to other functions and
+// that are inlined into the caller.
+#if __has_attribute(__artificial__) && !defined(__CUDACC__) // never enabled when __CUDACC__ is defined
+#  define _CUDAX_ARTIFICIAL __attribute__((__artificial__))
+#else
+#  define _CUDAX_ARTIFICIAL
+#endif
+// Marker for such wrappers: forced inlining plus the artificial attribute (where enabled) and _LIBCUDACXX_NODEBUG.
+#define _CUDAX_ALWAYS_INLINE _CCCL_ALWAYS_INLINE _CUDAX_ARTIFICIAL _LIBCUDACXX_NODEBUG inline
+
+// GCC struggles with guaranteed copy elision of immovable types.
+#if defined(_CCCL_COMPILER_GCC) // GCC: the move constructor is declared, and this macro gives it no definition
+#  define _CUDAX_IMMOVABLE(_XP) _XP(_XP&&)
+#else // other compilers: the move constructor is deleted outright
+#  define _CUDAX_IMMOVABLE(_XP) _XP(_XP&&) = delete
+#endif
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/continue_on.cuh b/cudax/include/cuda/experimental/__async/continue_on.cuh
new file mode 100644
index 0000000000..4f8cabbd97
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/continue_on.cuh
@@ -0,0 +1,288 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_CONTINUE_ON
+#define __CUDAX_ASYNC_DETAIL_CONTINUE_ON
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__type_traits/conditional.h>
+
+#include <cuda/experimental/__async/completion_signatures.cuh>
+#include <cuda/experimental/__async/cpos.cuh>
+#include <cuda/experimental/__async/exception.cuh>
+#include <cuda/experimental/__async/meta.cuh>
+#include <cuda/experimental/__async/queries.cuh>
+#include <cuda/experimental/__async/tuple.cuh>
+#include <cuda/experimental/__async/utility.cuh>
+#include <cuda/experimental/__async/variant.cuh>
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+struct continue_on_t
+{
+#if !defined(_CCCL_CUDA_COMPILER_NVCC)
+private:
+#endif // _CCCL_CUDA_COMPILER_NVCC
+  template <class... _As>
+  using __set_value_tuple_t = __tuple<set_value_t, __decay_t<_As>...>; // stored value result: (tag, decayed values...)
+  template <class _Error>
+  using __set_error_tuple_t = __tuple<set_error_t, __decay_t<_Error>>; // stored error result
+  using __set_stopped_tuple_t = __tuple<set_stopped_t>; // stored stopped result
+  using __complete_fn = void (*)(void*) noexcept; // replays the stored result; the argument is the __rcvr_t
+
+  template <class... _Ts> // an exception_ptr error completion is added when the decay-copy may throw
+  using __set_value_completion =
+    _CUDA_VSTD::_If<__nothrow_decay_copyable<_Ts...>,
+                    completion_signatures<set_value_t(__decay_t<_Ts>...)>,
+                    completion_signatures<set_value_t(__decay_t<_Ts>...), set_error_t(::std::exception_ptr)>>;
+  template <class _Error> // same rule for error results
+  using __set_error_completion =
+    _CUDA_VSTD::_If<__nothrow_decay_copyable<_Error>,
+                    completion_signatures<set_error_t(__decay_t<_Error>)>,
+                    completion_signatures<set_error_t(__decay_t<_Error>), set_error_t(::std::exception_ptr)>>;
+  // Receiver for the scheduler's sender; it owns the downstream receiver and the stored result.
+  template <class _Rcvr, class _Result>
+  struct __rcvr_t
+  {
+    using receiver_concept = receiver_t;
+    _Rcvr __rcvr_;
+    _Result __result_;
+    __complete_fn __complete_;
+
+    template <class _Tag, class... _As>
+    _CCCL_HOST_DEVICE void operator()(_Tag, _As&... __as) noexcept // callee of __tuple::__apply in __complete_
+    {
+      _Tag()(static_cast<_Rcvr&&>(__rcvr_), static_cast<_As&&>(__as)...);
+    }
+    // Stores (tag, decayed args...) in __result_ and records how to replay it. Returns whether storing succeeded.
+    template <class _Tag, class... _As>
+    _CCCL_HOST_DEVICE bool __set_result(_Tag, _As&&... __as) noexcept
+    {
+      using __tupl_t = __tuple<_Tag, __decay_t<_As>...>;
+      if constexpr (__nothrow_decay_copyable<_As...>)
+      {
+        __result_.template __emplace<__tupl_t>(_Tag(), static_cast<_As&&>(__as)...);
+      }
+      else
+      {
+        _CUDAX_TRY( //
+          ({ //
+            __result_.template __emplace<__tupl_t>(_Tag(), static_cast<_As&&>(__as)...);
+          }),
+          _CUDAX_CATCH(...)( //
+            { // the receiver is completed right here; it must not be completed again by a later replay
+              __async::set_error(static_cast<_Rcvr&&>(__rcvr_), ::std::current_exception());
+              return false;
+            }))
+      }
+      __complete_ = +[](void* __ptr) noexcept {
+        auto& __self = *static_cast<__rcvr_t*>(__ptr);
+        auto& __tupl = *static_cast<__tupl_t*>(__self.__result_.__ptr());
+        __tupl.__apply(__self, __tupl);
+      };
+      return true;
+    }
+
+    _CCCL_HOST_DEVICE void set_value() noexcept // the scheduler's sender completed: replay the stored result
+    {
+      __complete_(this);
+    }
+
+    template <class _Error>
+    _CCCL_HOST_DEVICE void set_error(_Error&& __error) noexcept
+    {
+      __async::set_error(static_cast<_Rcvr&&>(__rcvr_), static_cast<_Error&&>(__error));
+    }
+
+    _CCCL_HOST_DEVICE void set_stopped() noexcept
+    {
+      __async::set_stopped(static_cast<_Rcvr&&>(__rcvr_));
+    }
+
+    _CCCL_HOST_DEVICE env_of_t<_Rcvr> get_env() const noexcept
+    {
+      return __async::get_env(__rcvr_);
+    }
+  };
+  // Operation state; a pointer to it is the receiver that the predecessor sender is connected to.
+  template <class _Rcvr, class _CvSndr, class _Sch>
+  struct __opstate_t
+  {
+    _CCCL_HOST_DEVICE friend auto get_env(const __opstate_t* __self) noexcept -> env_of_t<_Rcvr>
+    {
+      return __async::get_env(__self->__rcvr_.__rcvr_); // __rcvr_t names its downstream receiver __rcvr_
+    }
+
+    using operation_state_concept = operation_state_t;
+    using __result_t =
+      __transform_completion_signatures<completion_signatures_of_t<_CvSndr, __opstate_t*>,
+                                        __set_value_tuple_t,
+                                        __set_error_tuple_t,
+                                        __set_stopped_tuple_t,
+                                        __variant>;
+
+    // The scheduler contributes error and stopped completions.
+    // This causes its set_value_t() completion to be ignored.
+    using __scheduler_completions = //
+      transform_completion_signatures<completion_signatures_of_t<schedule_result_t<_Sch>, __rcvr_t<_Rcvr, __result_t>*>,
+                                      __async::completion_signatures<>,
+                                      __malways<__async::completion_signatures<>>::__f>;
+
+    // The continue_on completions are the scheduler's error
+    // and stopped completions, plus the sender's completions
+    // with all the result data types decayed.
+    using completion_signatures = //
+      transform_completion_signatures<completion_signatures_of_t<_CvSndr, __opstate_t*>,
+                                      __scheduler_completions,
+                                      __set_value_completion,
+                                      __set_error_completion>;
+
+    __rcvr_t<_Rcvr, __result_t> __rcvr_;
+    connect_result_t<_CvSndr, __opstate_t*> __opstate1_;
+    connect_result_t<schedule_result_t<_Sch>, __rcvr_t<_Rcvr, __result_t>*> __opstate2_;
+
+    _CCCL_HOST_DEVICE __opstate_t(_CvSndr&& __sndr, _Sch __sch, _Rcvr __rcvr)
+        : __rcvr_{static_cast<_Rcvr&&>(__rcvr), {}, nullptr}
+        , __opstate1_{__async::connect(static_cast<_CvSndr&&>(__sndr), this)}
+        , __opstate2_{__async::connect(schedule(__sch), &__rcvr_)}
+    {}
+    _CUDAX_IMMOVABLE(__opstate_t); // both child operations were connected to pointers into this object
+
+    _CCCL_HOST_DEVICE void start() noexcept
+    {
+      __async::start(__opstate1_);
+    }
+
+    template <class... _As>
+    _CCCL_HOST_DEVICE void set_value(_As&&... __as) noexcept
+    {
+      if (__rcvr_.__set_result(set_value_t(), static_cast<_As&&>(__as)...))
+      {
+        __async::start(__opstate2_);
+      }
+    }
+
+    template <class _Error>
+    _CCCL_HOST_DEVICE void set_error(_Error&& __error) noexcept
+    {
+      if (__rcvr_.__set_result(set_error_t(), static_cast<_Error&&>(__error)))
+      {
+        __async::start(__opstate2_);
+      }
+    }
+
+    _CCCL_HOST_DEVICE void set_stopped() noexcept
+    {
+      if (__rcvr_.__set_result(set_stopped_t()))
+      {
+        __async::start(__opstate2_);
+      }
+    }
+  };
+
+  template <class _Sndr, class _Sch>
+  struct __sndr_t;
+  template <class _Sch>
+  struct __closure_t;
+
+public:
+  template <class _Sndr, class _Sch>
+  _CCCL_HOST_DEVICE __sndr_t<_Sndr, _Sch> operator()(_Sndr __sndr, _Sch __sch) const noexcept;
+  template <class _Sch>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE __closure_t<_Sch> operator()(_Sch __sch) const noexcept;
+};
+
+template <class _Sch>
+struct continue_on_t::__closure_t // what continue_on(sch) returns: the scheduler, awaiting a sender
+{
+  _Sch __sch;
+  // sndr | closure has the same effect as continue_on(sndr, sch); only rvalue closures are accepted.
+  template <class _Sndr>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE friend auto operator|(_Sndr __sndr, __closure_t&& __self)
+  {
+    return continue_on_t()(static_cast<_Sndr&&>(__sndr), static_cast<_Sch&&>(__self.__sch));
+  }
+};
+
+template <class _Sndr, class _Sch>
+struct continue_on_t::__sndr_t // the sender produced by continue_on
+{
+  using sender_concept = sender_t;
+  _CCCL_NO_UNIQUE_ADDRESS continue_on_t __tag;
+  _Sch __sch;
+  _Sndr __sndr;
+  // The sender's environment: a view onto the sender that answers queries.
+  struct __attrs_t
+  {
+    const __sndr_t* __sndr; // pointer to const: get_env() is a const member, so `this` is const there
+    // Every completion-scheduler query is answered with the scheduler that was given to continue_on.
+    template <class _SetTag>
+    _CCCL_HOST_DEVICE auto query(get_completion_scheduler_t<_SetTag>) const noexcept
+    {
+      return __sndr->__sch;
+    }
+    // Any other query is forwarded to the predecessor sender's environment through its `query` member.
+    template <class _Query>
+    _CCCL_HOST_DEVICE auto query(_Query) const //
+      -> __query_result_t<_Query, env_of_t<_Sndr>>
+    {
+      return __async::get_env(__sndr->__sndr).query(_Query{});
+    }
+  };
+  // Consuming connect: the predecessor is moved into the operation state.
+  template <class _Rcvr>
+  _CCCL_HOST_DEVICE __opstate_t<_Rcvr, _Sndr, _Sch> connect(_Rcvr __rcvr) &&
+  {
+    return {static_cast<_Sndr&&>(__sndr), __sch, static_cast<_Rcvr&&>(__rcvr)};
+  }
+  // Non-consuming connect: the predecessor is connected by const reference.
+  template <class _Rcvr>
+  _CCCL_HOST_DEVICE __opstate_t<_Rcvr, const _Sndr&, _Sch> connect(_Rcvr __rcvr) const&
+  {
+    return {__sndr, __sch, static_cast<_Rcvr&&>(__rcvr)};
+  }
+  // The returned object points at this sender and must not outlive it.
+  _CCCL_HOST_DEVICE __attrs_t get_env() const noexcept
+  {
+    return __attrs_t{this};
+  }
+};
+
+template <class _Sndr, class _Sch>
+_CCCL_HOST_DEVICE auto
+continue_on_t::operator()(_Sndr __sndr, _Sch __sch) const noexcept -> continue_on_t::__sndr_t<_Sndr, _Sch>
+{
+  return {{}, __sch, static_cast<_Sndr&&>(__sndr)}; // members in order: tag, scheduler, predecessor
+}
+
+template <class _Sch>
+_CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE continue_on_t::__closure_t<_Sch>
+continue_on_t::operator()(_Sch __sch) const noexcept
+{
+  return {__sch}; // a closure holding the scheduler, to be combined with a sender via operator|
+}
+
+_CCCL_GLOBAL_CONSTANT continue_on_t continue_on{}; // the function object: continue_on(sndr, sch) or continue_on(sch)
+} // namespace cuda::experimental::__async
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/cpos.cuh b/cudax/include/cuda/experimental/__async/cpos.cuh
new file mode 100644
index 0000000000..a20f96e25b
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/cpos.cuh
@@ -0,0 +1,220 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_CPOS
+#define __CUDAX_ASYNC_DETAIL_CPOS
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/experimental/__async/config.cuh>
+#include <cuda/experimental/__async/env.cuh>
+#include <cuda/experimental/__async/meta.cuh>
+#include <cuda/experimental/__async/type_traits.cuh>
+#include <cuda/experimental/__async/utility.cuh>
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+struct receiver_t // tag type; receivers in this library name it through a nested `receiver_concept` alias
+{};
+
+struct operation_state_t // tag type named by an operation state's `operation_state_concept` alias
+{};
+
+struct sender_t // tag type named by a sender's `sender_concept` alias
+{};
+
+struct scheduler_t // tag type; presumably named by a scheduler's `scheduler_concept` alias - confirm
+{};
+
+template <class _Ty> // each alias below: the nested *_concept type of _Ty, after removing any reference
+using __sender_concept_t = typename __remove_ref_t<_Ty>::sender_concept;
+
+template <class _Ty>
+using __receiver_concept_t = typename __remove_ref_t<_Ty>::receiver_concept;
+
+template <class _Ty>
+using __scheduler_concept_t = typename __remove_ref_t<_Ty>::scheduler_concept;
+// Each trait below asks __mvalid_q whether the alias is well-formed; which type the alias names is not checked.
+template <class _Ty>
+_CCCL_INLINE_VAR constexpr bool __is_sender = __mvalid_q<__sender_concept_t, _Ty>;
+
+template <class _Ty>
+_CCCL_INLINE_VAR constexpr bool __is_receiver = __mvalid_q<__receiver_concept_t, _Ty>;
+
+template <class _Ty>
+_CCCL_INLINE_VAR constexpr bool __is_scheduler = __mvalid_q<__scheduler_concept_t, _Ty>;
+
+_CCCL_GLOBAL_CONSTANT struct set_value_t // set_value(rcvr, vals...) invokes rcvr.set_value(vals...)
+{
+  template <class _Rcvr, class... _Ts>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE auto operator()(_Rcvr&& __rcvr, _Ts&&... __ts) const noexcept
+    -> decltype(static_cast<_Rcvr&&>(__rcvr).set_value(static_cast<_Ts&&>(__ts)...))
+  { // the member call must return void and must itself be noexcept
+    static_assert(
+      _CUDA_VSTD::is_same_v<decltype(static_cast<_Rcvr&&>(__rcvr).set_value(static_cast<_Ts&&>(__ts)...)), void>);
+    static_assert(noexcept(static_cast<_Rcvr&&>(__rcvr).set_value(static_cast<_Ts&&>(__ts)...)));
+    static_cast<_Rcvr&&>(__rcvr).set_value(static_cast<_Ts&&>(__ts)...);
+  }
+  // Overload for a receiver passed by pointer: it is dereferenced, then cast with static_cast<_Rcvr&&>.
+  template <class _Rcvr, class... _Ts>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE auto operator()(_Rcvr* __rcvr, _Ts&&... __ts) const noexcept
+    -> decltype(static_cast<_Rcvr&&>(*__rcvr).set_value(static_cast<_Ts&&>(__ts)...))
+  {
+    static_assert(
+      _CUDA_VSTD::is_same_v<decltype(static_cast<_Rcvr&&>(*__rcvr).set_value(static_cast<_Ts&&>(__ts)...)), void>);
+    static_assert(noexcept(static_cast<_Rcvr&&>(*__rcvr).set_value(static_cast<_Ts&&>(__ts)...)));
+    static_cast<_Rcvr&&>(*__rcvr).set_value(static_cast<_Ts&&>(__ts)...);
+  }
+} set_value{};
+
+_CCCL_GLOBAL_CONSTANT struct set_error_t // set_error(rcvr, err) invokes rcvr.set_error(err)
+{
+  template <class _Rcvr, class _Ey>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE auto operator()(_Rcvr&& __rcvr, _Ey&& __e) const noexcept
+    -> decltype(static_cast<_Rcvr&&>(__rcvr).set_error(static_cast<_Ey&&>(__e)))
+  { // the member call must return void and must itself be noexcept
+    static_assert(
+      _CUDA_VSTD::is_same_v<decltype(static_cast<_Rcvr&&>(__rcvr).set_error(static_cast<_Ey&&>(__e))), void>);
+    static_assert(noexcept(static_cast<_Rcvr&&>(__rcvr).set_error(static_cast<_Ey&&>(__e))));
+    static_cast<_Rcvr&&>(__rcvr).set_error(static_cast<_Ey&&>(__e));
+  }
+  // Overload for a receiver passed by pointer: it is dereferenced, then cast with static_cast<_Rcvr&&>.
+  template <class _Rcvr, class _Ey>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE auto operator()(_Rcvr* __rcvr, _Ey&& __e) const noexcept
+    -> decltype(static_cast<_Rcvr&&>(*__rcvr).set_error(static_cast<_Ey&&>(__e)))
+  {
+    static_assert(
+      _CUDA_VSTD::is_same_v<decltype(static_cast<_Rcvr&&>(*__rcvr).set_error(static_cast<_Ey&&>(__e))), void>);
+    static_assert(noexcept(static_cast<_Rcvr&&>(*__rcvr).set_error(static_cast<_Ey&&>(__e))));
+    static_cast<_Rcvr&&>(*__rcvr).set_error(static_cast<_Ey&&>(__e));
+  }
+} set_error{};
+
+_CCCL_GLOBAL_CONSTANT struct set_stopped_t // set_stopped(rcvr) invokes rcvr.set_stopped()
+{
+  template <class _Rcvr>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE auto
+  operator()(_Rcvr&& __rcvr) const noexcept -> decltype(static_cast<_Rcvr&&>(__rcvr).set_stopped())
+  { // the member call must return void and must itself be noexcept
+    static_assert(_CUDA_VSTD::is_same_v<decltype(static_cast<_Rcvr&&>(__rcvr).set_stopped()), void>);
+    static_assert(noexcept(static_cast<_Rcvr&&>(__rcvr).set_stopped()));
+    static_cast<_Rcvr&&>(__rcvr).set_stopped();
+  }
+  // Overload for a receiver passed by pointer: it is dereferenced, then cast with static_cast<_Rcvr&&>.
+  template <class _Rcvr>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE auto
+  operator()(_Rcvr* __rcvr) const noexcept -> decltype(static_cast<_Rcvr&&>(*__rcvr).set_stopped())
+  {
+    static_assert(_CUDA_VSTD::is_same_v<decltype(static_cast<_Rcvr&&>(*__rcvr).set_stopped()), void>);
+    static_assert(noexcept(static_cast<_Rcvr&&>(*__rcvr).set_stopped()));
+    static_cast<_Rcvr&&>(*__rcvr).set_stopped();
+  }
+} set_stopped{};
+
+_CCCL_GLOBAL_CONSTANT struct start_t // start(op) invokes op.start() on an lvalue operation state
+{
+  template <class _OpState>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE auto
+  operator()(_OpState& __opstate) const noexcept -> decltype(__opstate.start())
+  {
+    static_assert(!__is_error<typename _OpState::completion_signatures>); // rejects error-typed completions
+    static_assert(_CUDA_VSTD::is_same_v<decltype(__opstate.start()), void>); // start() must return void
+    static_assert(noexcept(__opstate.start())); // start() must be noexcept
+    __opstate.start();
+  }
+} start{};
+
+_CCCL_GLOBAL_CONSTANT struct connect_t // connect(sndr, rcvr) forwards to sndr.connect(rcvr)
+{
+  template <class _Sndr, class _Rcvr>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE auto operator()(_Sndr&& __sndr, _Rcvr&& __rcvr) const
+    noexcept(noexcept(static_cast<_Sndr&&>(__sndr).connect(static_cast<_Rcvr&&>(__rcvr))))
+      -> decltype(static_cast<_Sndr&&>(__sndr).connect(static_cast<_Rcvr&&>(__rcvr)))
+  { // both the noexcept-ness and the result type are exactly those of the member call
+    // using __opstate_t     = decltype(static_cast<_Sndr&&>(__sndr).connect(static_cast<_Rcvr&&>(__rcvr)));
+    // using completions_t = typename __opstate_t::completion_signatures;
+    // static_assert(__is_completion_signatures<completions_t>);
+    // (The three lines above are a disabled check on the operation state's completion_signatures.)
+    return static_cast<_Sndr&&>(__sndr).connect(static_cast<_Rcvr&&>(__rcvr));
+  }
+} connect{};
+
+_CCCL_GLOBAL_CONSTANT struct schedule_t // schedule(sch) forwards to sch.schedule() and returns its result
+{
+  template <class _Sch>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE auto
+  operator()(_Sch&& __sch) const noexcept -> decltype(static_cast<_Sch&&>(__sch).schedule())
+  { // the member call is required to be noexcept
+    static_assert(noexcept(static_cast<_Sch&&>(__sch).schedule()));
+    return static_cast<_Sch&&>(__sch).schedule();
+  }
+} schedule{};
+
+struct receiver_archetype // the default _Rcvr of completion_signatures_of_t
+{
+  using receiver_concept = receiver_t;
+  // The members are declared but not defined here: any values, any error, and stopped are all accepted.
+  template <class... _Ts>
+  void set_value(_Ts&&...) noexcept;
+
+  template <class _Error>
+  void set_error(_Error&&) noexcept;
+
+  void set_stopped() noexcept;
+  // The environment type is env<> with no template arguments.
+  env<> get_env() const noexcept;
+};
+
+template <class _Sndr, class _Rcvr> // the type that connect(sndr, rcvr) returns
+using connect_result_t = decltype(connect(__declval<_Sndr>(), __declval<_Rcvr>()));
+
+template <class _Sndr, class _Rcvr = receiver_archetype> // the nested completion_signatures of that type
+using completion_signatures_of_t = typename connect_result_t<_Sndr, _Rcvr>::completion_signatures;
+
+template <class _Sch> // the type that schedule(sch) returns
+using schedule_result_t = decltype(schedule(__declval<_Sch>()));
+
+template <class _Sndr, class _Rcvr> // whether connect(sndr, rcvr) is noexcept
+_CCCL_INLINE_VAR constexpr bool __nothrow_connectable = noexcept(connect(__declval<_Sndr>(), __declval<_Rcvr>()));
+
+// handy enumerations for keeping type names readable
+enum __disposition_t
+{
+  __value, // paired with set_value_t by __detail::__set_tag
+  __error, // paired with set_error_t
+  __stopped // paired with set_stopped_t
+};
+
+namespace __detail // declaration-only variable templates that pair a __disposition_t with a completion tag type
+{
+template <__disposition_t, class _Void = void>
+extern __undefined<_Void> __set_tag; // primary template: no pairing
+template <class _Void>
+extern __fn_t<set_value_t>* __set_tag<__value, _Void>; // __value -> set_value_t
+template <class _Void>
+extern __fn_t<set_error_t>* __set_tag<__error, _Void>; // __error -> set_error_t
+template <class _Void>
+extern __fn_t<set_stopped_t>* __set_tag<__stopped, _Void>; // __stopped -> set_stopped_t
+} // namespace __detail
+} // namespace cuda::experimental::__async
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/env.cuh b/cudax/include/cuda/experimental/__async/env.cuh
new file mode 100644
index 0000000000..8b5479e72a
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/env.cuh
@@ -0,0 +1,192 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_ENV
+#define __CUDAX_ASYNC_DETAIL_ENV
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__functional/reference_wrapper.h>
+
+#include <cuda/experimental/__async/meta.cuh>
+#include <cuda/experimental/__async/queries.cuh>
+#include <cuda/experimental/__async/tuple.cuh>
+#include <cuda/experimental/__async/type_traits.cuh>
+#include <cuda/experimental/__async/utility.cuh>
+
+#include <functional>
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+// warning #20012-D: __device__ annotation is ignored on a
+// function("inplace_stop_source") that is explicitly defaulted on its first
+// declaration
+_CCCL_NV_DIAG_SUPPRESS(20012)
+
+namespace cuda::experimental::__async
+{
+// Declaration-only variable templates used purely for type computation.
+// The primary template has type `_Ty`.
+template <class _Ty>
+extern _Ty __unwrap_ref;
+
+// For `::std::reference_wrapper<_Ty>` the variable has type `_Ty&`.
+template <class _Ty>
+extern _Ty& __unwrap_ref<::std::reference_wrapper<_Ty>>;
+
+// For `cuda::std::reference_wrapper<_Ty>` the variable has type `_Ty&`.
+template <class _Ty>
+extern _Ty& __unwrap_ref<_CUDA_VSTD::reference_wrapper<_Ty>>;
+
+/// `_Ty` itself, unless `_Ty` is a `reference_wrapper<U>` from `::std` or
+/// `cuda::std`, in which case `U&`.
+template <class _Ty>
+using __unwrap_reference_t = decltype(__unwrap_ref<_Ty>);
+
+/// A single-entry environment: it stores a query object together with a value
+/// and answers exactly that query type with the stored value.
+template <class _Query, class _Value>
+struct prop
+{
+  _CCCL_NO_UNIQUE_ADDRESS _Query __query;
+  _CCCL_NO_UNIQUE_ADDRESS _Value __value;
+
+  /// Returns a const reference to the stored value. The query argument is used
+  /// only for overload selection; its value is ignored.
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE constexpr auto query(_Query) const noexcept -> const _Value&
+  {
+    return __value;
+  }
+};
+
+/// An environment composed of zero or more child environments stored in a
+/// tuple. A query is answered by a child selected with `__get_1st`.
+template <class... _Envs>
+struct env
+{
+  __tuple<_Envs...> __envs_;
+
+  /// Selects the child environment that will answer `_Query`.
+  ///
+  /// One flag per child records whether `__queryable<_Env, _Query>` holds.
+  /// `__find_pos` (defined elsewhere) is expected to yield the index of the
+  /// first set flag, or `__npos` when there is none. When the index is
+  /// `__npos` no return statement is instantiated, so the deduced return type
+  /// is `void`.
+  template <class _Query>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE constexpr decltype(auto) __get_1st(_Query) const noexcept
+  {
+    // The trailing `false` keeps the array non-empty when `_Envs` is empty.
+    constexpr bool __flags[] = {__queryable<_Envs, _Query>..., false};
+    constexpr size_t __idx   = __async::__find_pos(__flags, __flags + sizeof...(_Envs));
+    if constexpr (__idx != __npos)
+    {
+      return __async::__cget<__idx>(__envs_);
+    }
+  }
+
+  /// The type returned by `__get_1st` for `_Query`. The defaulted `_Env`
+  /// parameter makes the expression dependent.
+  template <class _Query, class _Env = env>
+  using __1st_env_t = decltype(__declval<const _Env&>().__get_1st(_Query{}));
+
+  /// Forwards `__query` to the child environment selected by `__get_1st` and
+  /// returns that child's answer.
+  template <class _Query>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE constexpr auto query(_Query __query) const
+    noexcept(__nothrow_queryable<__1st_env_t<_Query>, _Query>) //
+    -> __query_result_t<__1st_env_t<_Query>, _Query>
+  {
+    // Environments expose their answers through a member named `query` (see
+    // `prop::query` and this function); `prop::__query` is a data member that
+    // merely stores the query object, so it must not be called here.
+    return __get_1st(__query).query(__query);
+  }
+};
+
+// partial specialization for two environments
+//
+// The two children are stored directly as members instead of in a tuple.
+// `_Env0` takes precedence over `_Env1` when both support a query.
+template <class _Env0, class _Env1>
+struct env<_Env0, _Env1>
+{
+  _CCCL_NO_UNIQUE_ADDRESS _Env0 __env0_;
+  _CCCL_NO_UNIQUE_ADDRESS _Env1 __env1_;
+
+  /// Returns a reference to the first child for which `__queryable` holds for
+  /// `_Query`. When neither child qualifies, no return statement is
+  /// instantiated and the deduced return type is `void`.
+  template <class _Query>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE constexpr decltype(auto) __get_1st(_Query) const noexcept
+  {
+    if constexpr (__queryable<_Env0, _Query>)
+    {
+      // The parentheses make decltype(auto) deduce a reference, not a copy.
+      return (__env0_);
+    }
+    else if constexpr (__queryable<_Env1, _Query>)
+    {
+      return (__env1_);
+    }
+  }
+
+  /// The type returned by `__get_1st` for `_Query`. The defaulted `_Env`
+  /// parameter makes the expression dependent.
+  template <class _Query, class _Env = env>
+  using __1st_env_t = decltype(__declval<const _Env&>().__get_1st(_Query{}));
+
+  /// Forwards `__query` to the child environment selected by `__get_1st` and
+  /// returns that child's answer.
+  template <class _Query>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE constexpr auto query(_Query __query) const
+    noexcept(__nothrow_queryable<__1st_env_t<_Query>, _Query>) //
+    -> __query_result_t<__1st_env_t<_Query>, _Query>
+  {
+    // Environments expose their answers through a member named `query` (see
+    // `prop::query` and this function); `prop::__query` is a data member that
+    // merely stores the query object, so it must not be called here.
+    return __get_1st(__query).query(__query);
+  }
+};
+
+// Deduction guide: each argument type is passed through __unwrap_reference_t,
+// so an argument of type reference_wrapper<T> yields a child of type T&.
+template <class... _Envs>
+_CCCL_HOST_DEVICE env(_Envs...) -> env<__unwrap_reference_t<_Envs>...>;
+
+// An environment with no child environments.
+using empty_env = env<>;
+
+namespace __adl
+{
+/// Free-function `get_env` for pointers: calls `__ty->get_env()`.
+/// Removed from overload resolution when that expression is ill-formed, and
+/// rejected at compile time when it is potentially throwing.
+template <class _Ty>
+_CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE auto get_env(_Ty* __ty) noexcept //
+  -> decltype(__ty->get_env())
+{
+  static_assert(noexcept(__ty->get_env()));
+  return __ty->get_env();
+}
+
+/// Function object that makes an unqualified call to `get_env(__ty)` from
+/// inside this namespace, so the overload above is a candidate along with any
+/// overloads found by argument-dependent lookup.
+struct __get_env_t
+{
+  template <class _Ty>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE auto operator()(_Ty* __ty) const noexcept //
+    -> decltype(get_env(__ty))
+  {
+    static_assert(noexcept(get_env(__ty)));
+    return get_env(__ty);
+  }
+};
+} // namespace __adl
+
+/// Function object type of the `get_env` customization point. It has three
+/// overloads: one for objects with a `get_env()` member, one for pointers, and
+/// a fallback that returns an empty environment.
+struct get_env_t
+{
+  /// Calls `__ty.get_env()`. Participates only when that expression is valid,
+  /// and requires it to be non-throwing.
+  template <class _Ty>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE auto operator()(_Ty&& __ty) const noexcept //
+    -> decltype(__ty.get_env())
+  {
+    static_assert(noexcept(__ty.get_env()));
+    return __ty.get_env();
+  }
+
+  /// Pointer overload: delegates to `__adl::__get_env_t`.
+  template <class _Ty>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE auto operator()(_Ty* __ty) const noexcept //
+    -> __call_result_t<__adl::__get_env_t, _Ty*>
+  {
+    return __adl::__get_env_t()(__ty);
+  }
+
+  /// Fallback: returns `empty_env`. `__ignore` is defined elsewhere; it is
+  /// presumably implicitly constructible from any argument, which would rank
+  /// this overload below the two templates above.
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE empty_env operator()(__ignore) const noexcept
+  {
+    return {};
+  }
+};
+
+// The `get_env` function object lives in its own namespace and is pulled into
+// the enclosing namespace with a using-directive.
+// NOTE(review): presumably this avoids a name clash with other `get_env`
+// declarations in the enclosing namespace -- confirm.
+namespace __region
+{
+_CCCL_GLOBAL_CONSTANT get_env_t get_env{};
+} // namespace __region
+
+using namespace __region;
+
+/// The type of the expression `get_env(t)` for a `t` of type `_Ty`.
+template <class _Ty>
+using env_of_t = decltype(get_env(__declval<_Ty>()));
+} // namespace cuda::experimental::__async
+
+_CCCL_NV_DIAG_DEFAULT(20012)
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/epilogue.cuh b/cudax/include/cuda/experimental/__async/epilogue.cuh
new file mode 100644
index 0000000000..16438ed81f
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/epilogue.cuh
@@ -0,0 +1,16 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+#if !defined(_CUDAX_ASYNC_PROLOGUE_INCLUDED)
+#  __error epilogue.cuh included without a prior inclusion of prologue.cuh
+#endif
+
+#undef _CUDAX_ASYNC_PROLOGUE_INCLUDED
+
+_CCCL_DIAG_POP
diff --git a/cudax/include/cuda/experimental/__async/exception.cuh b/cudax/include/cuda/experimental/__async/exception.cuh
new file mode 100644
index 0000000000..f1b86d2328
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/exception.cuh
@@ -0,0 +1,46 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_EXCEPTION
+#define __CUDAX_ASYNC_DETAIL_EXCEPTION
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/experimental/__async/config.cuh>
+
+#include <exception> // IWYU pragma: keep export
+
+#if defined(__CUDACC__)
+#  include <nv/target>
+// _CUDAX_CATCH(...) expands to nothing; whatever follows it at the use site
+// becomes the second argument of _CUDAX_TRY.
+#  define _CUDAX_CATCH(...)
+// _CUDAX_TRY(_TRY, _CATCH): when targeting the host, _TRY runs inside a try
+// block and _CATCH inside a catch (...) handler. For any other target only
+// _TRY is emitted and _CATCH is discarded.
+#  define _CUDAX_TRY(_TRY, _CATCH) \
+    NV_IF_TARGET(NV_IS_HOST, (try { _NV_EVAL _TRY } catch (...){_NV_EVAL _CATCH}), ({_NV_EVAL _TRY}))
+#else
+// Without __CUDACC__ the try/catch form is used unconditionally.
+// NOTE(review): _NV_EVAL is used here although <nv/target> is only included
+// in the branch above -- confirm it is provided by another included header.
+#  define _CUDAX_CATCH(...)
+#  define _CUDAX_TRY(_TRY, _CATCH) _NV_EVAL(try { _NV_EVAL _TRY } catch (...){_NV_EVAL _CATCH})
+#endif
+
+#if defined(__CUDA_ARCH__)
+// Treat everything as no-throw in device code
+#  define _CUDAX_NOEXCEPT_EXPR(...) true
+#else
+// This is the default behavior for host code, and for nvc++
+#  define _CUDAX_NOEXCEPT_EXPR(...) noexcept(__VA_ARGS__)
+#endif
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/fwd_rcvr.cuh b/cudax/include/cuda/experimental/__async/fwd_rcvr.cuh
new file mode 100644
index 0000000000..1c6c2f684d
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/fwd_rcvr.cuh
@@ -0,0 +1,71 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_FWD_RCVR
+#define __CUDAX_ASYNC_DETAIL_FWD_RCVR
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/experimental/__async/config.cuh>
+#include <cuda/experimental/__async/cpos.cuh>
+#include <cuda/experimental/__async/env.cuh>
+
+namespace cuda::experimental::__async
+{
+/// Receiver adaptor that derives from `_Rcvr` and therefore inherits its
+/// members; only `get_env` is declared here, hiding the base's version.
+template <class _Rcvr>
+struct __fwd_rcvr : _Rcvr
+{
+  /// Returns the environment of the base receiver.
+  _CCCL_HOST_DEVICE decltype(auto) get_env() const noexcept
+  {
+    // TODO: only forward the "forwarding" queries:
+    return __async::get_env(static_cast<_Rcvr const&>(*this));
+  }
+};
+
+/// Receiver adaptor for a receiver held by pointer. Every completion and the
+/// environment query are forwarded to the pointed-to receiver through the
+/// corresponding `__async` function object.
+template <class _Rcvr>
+struct __fwd_rcvr<_Rcvr*>
+{
+  using receiver_concept = receiver_t;
+  _Rcvr* __rcvr_;
+
+  /// Forwards a value completion, including all of its arguments.
+  template <class... _As>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void set_value(_As&&... __as) noexcept
+  {
+    // The values must be passed along; calling set_value with the receiver
+    // alone would silently discard them.
+    __async::set_value(__rcvr_, static_cast<_As&&>(__as)...);
+  }
+
+  /// Forwards an error completion.
+  template <class _Error>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void set_error(_Error&& __error) noexcept
+  {
+    __async::set_error(__rcvr_, static_cast<_Error&&>(__error));
+  }
+
+  /// Forwards a stopped completion.
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void set_stopped() noexcept
+  {
+    __async::set_stopped(__rcvr_);
+  }
+
+  /// Returns the environment obtained from the wrapped receiver pointer.
+  _CCCL_HOST_DEVICE decltype(auto) get_env() const noexcept
+  {
+    // TODO: only forward the "forwarding" queries:
+    return __async::get_env(__rcvr_);
+  }
+};
+} // namespace cuda::experimental::__async
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/just.cuh b/cudax/include/cuda/experimental/__async/just.cuh
new file mode 100644
index 0000000000..fe8be694db
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/just.cuh
@@ -0,0 +1,134 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_JUST
+#define __CUDAX_ASYNC_DETAIL_JUST
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/experimental/__async/completion_signatures.cuh>
+#include <cuda/experimental/__async/config.cuh>
+#include <cuda/experimental/__async/cpos.cuh>
+#include <cuda/experimental/__async/tuple.cuh>
+#include <cuda/experimental/__async/utility.cuh>
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+// Forward declarations of the just* tag types:
+struct just_t;
+struct just_error_t;
+struct just_stopped_t;
+
+// Map from a disposition to the corresponding tag types:
+namespace __detail
+{
+// Declared `extern` and not defined in this header; the mapping is read with
+// `decltype(__just_tag<_Disposition>())`. The primary template has type
+// __undefined<_Void>; the specializations cover the three dispositions.
+template <__disposition_t, class _Void = void>
+extern __undefined<_Void> __just_tag;
+template <class _Void>
+extern __fn_t<just_t>* __just_tag<__value, _Void>;
+template <class _Void>
+extern __fn_t<just_error_t>* __just_tag<__error, _Void>;
+template <class _Void>
+extern __fn_t<just_stopped_t>* __just_tag<__stopped, _Void>;
+} // namespace __detail
+
+/// Implementation shared by `just`, `just_error` and `just_stopped`.
+/// `_Disposition` selects both the algorithm tag (`_JustTag`) and the
+/// completion tag (`_SetTag`) used by the sender and operation state.
+template <__disposition_t _Disposition>
+struct __just
+{
+// The implementation details below are private, except when
+// _CCCL_CUDA_COMPILER_NVCC is defined.
+#if !defined(_CCCL_CUDA_COMPILER_NVCC)
+
+private:
+#endif // _CCCL_CUDA_COMPILER_NVCC
+
+  using _JustTag = decltype(__detail::__just_tag<_Disposition>());
+  using _SetTag  = decltype(__detail::__set_tag<_Disposition>());
+
+  /// Operation state: owns the receiver and the values to complete with.
+  template <class _Rcvr, class... _Ts>
+  struct __opstate_t
+  {
+    using operation_state_concept = operation_state_t;
+    using completion_signatures   = __async::completion_signatures<_SetTag(_Ts...)>;
+    _Rcvr __rcvr_;
+    __tuple<_Ts...> __values_;
+
+    /// Callable handed to the tuple's `__apply`; it receives the stored values
+    /// as lvalues and completes the receiver with them.
+    struct __complete_fn
+    {
+      __opstate_t* __self_;
+
+      _CCCL_HOST_DEVICE void operator()(_Ts&... __ts) const noexcept
+      {
+        // Both the receiver and the values are cast to rvalues before being
+        // passed to the completion function.
+        _SetTag()(static_cast<_Rcvr&&>(__self_->__rcvr_), static_cast<_Ts&&>(__ts)...);
+      }
+    };
+
+    /// Completes by invoking `__complete_fn` with the stored values.
+    _CCCL_HOST_DEVICE void start() & noexcept
+    {
+      __values_.__apply(__complete_fn{this}, __values_);
+    }
+  };
+
+  /// Sender: stores the algorithm tag and the values, and advertises a single
+  /// completion signature `_SetTag(_Ts...)`.
+  template <class... _Ts>
+  struct __sndr_t
+  {
+    using sender_concept        = sender_t;
+    using completion_signatures = __async::completion_signatures<_SetTag(_Ts...)>;
+
+    _CCCL_NO_UNIQUE_ADDRESS _JustTag __tag_;
+    __tuple<_Ts...> __values_;
+
+    /// Rvalue connect: moves the stored values into the operation state.
+    template <class _Rcvr>
+    _CCCL_HOST_DEVICE __opstate_t<_Rcvr, _Ts...> connect(_Rcvr __rcvr) && //
+      noexcept(__nothrow_decay_copyable<_Rcvr, _Ts...>)
+    {
+      return __opstate_t<_Rcvr, _Ts...>{static_cast<_Rcvr&&>(__rcvr), static_cast<__tuple<_Ts...>&&>(__values_)};
+    }
+
+    /// Const-lvalue connect: copies the stored values into the operation state.
+    template <class _Rcvr>
+    _CCCL_HOST_DEVICE __opstate_t<_Rcvr, _Ts...> connect(_Rcvr __rcvr) const& //
+      noexcept(__nothrow_decay_copyable<_Rcvr, _Ts const&...>)
+    {
+      return __opstate_t<_Rcvr, _Ts...>{static_cast<_Rcvr&&>(__rcvr), __values_};
+    }
+  };
+
+public:
+  /// Creates a sender holding the given values. Arguments are taken by value
+  /// and then moved into the sender's tuple.
+  template <class... _Ts>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE auto operator()(_Ts... __ts) const noexcept
+  {
+    return __sndr_t<_Ts...>{_JustTag{}, {{static_cast<_Ts&&>(__ts)}...}};
+  }
+};
+
+// `just`: sender factory whose senders complete with `set_value_t`.
+_CCCL_GLOBAL_CONSTANT struct just_t : __just<__value>
+{
+} just{};
+
+// `just_error`: sender factory whose senders complete with `set_error_t`.
+_CCCL_GLOBAL_CONSTANT struct just_error_t : __just<__error>
+{
+} just_error{};
+
+// `just_stopped`: sender factory whose senders complete with `set_stopped_t`.
+_CCCL_GLOBAL_CONSTANT struct just_stopped_t : __just<__stopped>
+{
+} just_stopped{};
+} // namespace cuda::experimental::__async
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/just_from.cuh b/cudax/include/cuda/experimental/__async/just_from.cuh
new file mode 100644
index 0000000000..2df102ffa3
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/just_from.cuh
@@ -0,0 +1,163 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_JUST_FROM
+#define __CUDAX_ASYNC_DETAIL_JUST_FROM
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__type_traits/conditional.h>
+
+#include <cuda/experimental/__async/completion_signatures.cuh>
+#include <cuda/experimental/__async/config.cuh>
+#include <cuda/experimental/__async/cpos.cuh>
+#include <cuda/experimental/__async/rcvr_ref.cuh>
+#include <cuda/experimental/__async/tuple.cuh>
+#include <cuda/experimental/__async/utility.cuh>
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+// Forward declarations of the just_*from tag types:
+struct just_from_t;
+struct just_error_from_t;
+struct just_stopped_from_t;
+
+// Map from a disposition to the corresponding tag types:
+namespace __detail
+{
+// Declared `extern` and not defined in this header; the mapping is read with
+// `decltype(__just_from_tag<_Disposition>())`.
+template <__disposition_t, class _Void = void>
+extern __undefined<_Void> __just_from_tag;
+template <class _Void>
+extern __fn_t<just_from_t>* __just_from_tag<__value, _Void>;
+template <class _Void>
+extern __fn_t<just_error_from_t>* __just_from_tag<__error, _Void>;
+template <class _Void>
+extern __fn_t<just_stopped_from_t>* __just_from_tag<__stopped, _Void>;
+} // namespace __detail
+
+// Incomplete types whose names serve as the diagnostic text when a completion
+// signature is invalid.
+struct _AN_ERROR_COMPLETION_MUST_HAVE_EXACTLY_ONE_ERROR_ARGUMENT;
+struct _A_STOPPED_COMPLETION_MUST_HAVE_NO_ARGUMENTS;
+
+/// Implementation shared by `just_from`, `just_error_from` and
+/// `just_stopped_from`. The sender stores a callable; when started, the
+/// callable is invoked with a completion function that it can call to complete
+/// the receiver through `_SetTag`.
+template <__disposition_t _Disposition>
+struct __just_from
+{
+// The implementation details below are private, except when
+// _CCCL_CUDA_COMPILER_NVCC is defined.
+#if !defined(_CCCL_CUDA_COMPILER_NVCC)
+
+private:
+#endif // _CCCL_CUDA_COMPILER_NVCC
+
+  using _JustTag = decltype(__detail::__just_from_tag<_Disposition>());
+  using _SetTag  = decltype(__detail::__set_tag<_Disposition>());
+
+  // Diagnostic to report for an invalid signature: the error-channel message
+  // when `_SetTag` is set_error_t, otherwise the stopped-channel message.
+  using __diag_t = _CUDA_VSTD::_If<_CUDA_VSTD::is_same_v<_SetTag, set_error_t>,
+                                   _AN_ERROR_COMPLETION_MUST_HAVE_EXACTLY_ONE_ERROR_ARGUMENT,
+                                   _A_STOPPED_COMPLETION_MUST_HAVE_NO_ARGUMENTS>;
+
+  template <class... _Ts>
+  using __error_t =
+    _ERROR<_WHERE(_IN_ALGORITHM, _JustTag), _WHAT(__diag_t), _WITH_COMPLETION_SIGNATURE<_SetTag(_Ts...)>>;
+
+  /// Declaration-only callable used to compute completion signatures: calling
+  /// it with `_Ts...` yields `completion_signatures<_SetTag(_Ts...)>` when that
+  /// signature is valid and an `_ERROR` type otherwise.
+  struct __probe_fn
+  {
+    template <class... _Ts>
+    auto operator()(_Ts&&... __ts) const noexcept
+      -> _CUDA_VSTD::
+        _If<__is_valid_signature<_SetTag(_Ts...)>, completion_signatures<_SetTag(_Ts...)>, __error_t<_Ts...>>;
+  };
+
+  /// Completion function passed to the user's callable. It holds a reference
+  /// to the receiver and completes it with the forwarded arguments.
+  template <class _Rcvr = receiver_archetype>
+  struct __complete_fn
+  {
+    _Rcvr& __rcvr_;
+
+    template <class... _Ts>
+    _CCCL_HOST_DEVICE auto operator()(_Ts&&... __ts) const noexcept
+    {
+      // The receiver is passed as an lvalue; the arguments are forwarded.
+      _SetTag()(static_cast<_Rcvr&>(__rcvr_), static_cast<_Ts&&>(__ts)...);
+    }
+  };
+
+  /// Operation state: owns the receiver and the callable.
+  template <class _Rcvr, class _Fn>
+  struct __opstate
+  {
+    using operation_state_concept = operation_state_t;
+    // The completion signatures are whatever the callable returns when invoked
+    // with the probe function.
+    using completion_signatures   = __call_result_t<_Fn, __probe_fn>;
+    static_assert(__is_completion_signatures<completion_signatures>);
+
+    _Rcvr __rcvr_;
+    _Fn __fn_;
+
+    /// Invokes the callable as an rvalue with a completion function bound to
+    /// the stored receiver.
+    _CCCL_HOST_DEVICE void start() & noexcept
+    {
+      static_cast<_Fn&&>(__fn_)(__complete_fn<_Rcvr>{__rcvr_});
+    }
+  };
+
+  /// Sender: stores the algorithm tag and the callable.
+  template <class _Fn>
+  struct __sndr_t
+  {
+    using sender_concept = sender_t;
+
+    _CCCL_NO_UNIQUE_ADDRESS _JustTag __tag_;
+    _Fn __fn_;
+
+    /// Rvalue connect: moves the callable into the operation state.
+    template <class _Rcvr>
+    _CCCL_HOST_DEVICE __opstate<_Rcvr, _Fn> connect(_Rcvr __rcvr) && //
+      noexcept(__nothrow_decay_copyable<_Rcvr, _Fn>)
+    {
+      return __opstate<_Rcvr, _Fn>{static_cast<_Rcvr&&>(__rcvr), static_cast<_Fn&&>(__fn_)};
+    }
+
+    /// Const-lvalue connect: copies the callable into the operation state.
+    template <class _Rcvr>
+    _CCCL_HOST_DEVICE __opstate<_Rcvr, _Fn> connect(_Rcvr __rcvr) const& //
+      noexcept(__nothrow_decay_copyable<_Rcvr, _Fn const&>)
+    {
+      return __opstate<_Rcvr, _Fn>{static_cast<_Rcvr&&>(__rcvr), __fn_};
+    }
+  };
+
+public:
+  /// Creates a sender from `__fn`. Statically requires that invoking `__fn`
+  /// with the probe function yields a specialization of completion_signatures.
+  template <class _Fn>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE auto operator()(_Fn __fn) const noexcept
+  {
+    using __completions = __call_result_t<_Fn, __probe_fn>;
+    static_assert(__is_completion_signatures<__completions>,
+                  "The function passed to just_from must return an instance of a specialization of "
+                  "completion_signatures<>.");
+    return __sndr_t<_Fn>{{}, static_cast<_Fn&&>(__fn)};
+  }
+};
+
+// `just_from`: the callable's completion function uses `set_value_t`.
+_CCCL_GLOBAL_CONSTANT struct just_from_t : __just_from<__value>
+{
+} just_from{};
+
+// `just_error_from`: the callable's completion function uses `set_error_t`.
+_CCCL_GLOBAL_CONSTANT struct just_error_from_t : __just_from<__error>
+{
+} just_error_from{};
+
+// `just_stopped_from`: the callable's completion function uses `set_stopped_t`.
+_CCCL_GLOBAL_CONSTANT struct just_stopped_from_t : __just_from<__stopped>
+{
+} just_stopped_from{};
+} // namespace cuda::experimental::__async
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/lazy.cuh b/cudax/include/cuda/experimental/__async/lazy.cuh
new file mode 100644
index 0000000000..0904dcdc50
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/lazy.cuh
@@ -0,0 +1,158 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_LAZY
+#define __CUDAX_ASYNC_DETAIL_LAZY
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__memory/addressof.h>
+#include <cuda/std/__memory/construct_at.h>
+#include <cuda/std/__new/launder.h>
+
+#include <cuda/experimental/__async/config.cuh>
+#include <cuda/experimental/__async/meta.cuh>
+#include <cuda/experimental/__async/type_traits.cuh>
+
+#include <new> // IWYU pragma: keep
+
+namespace cuda::experimental::__async
+{
+/// @brief A lazy type that can be used to delay the construction of a type.
+///
+/// The value lives in an anonymous union, so neither the constructor nor the
+/// destructor of `__lazy` touches it. The caller is responsible for calling
+/// `construct`/`construct_from` before using the value and `destroy` afterwards.
+template <class _Ty>
+struct __lazy
+{
+  // Intentionally empty: the union member is left unconstructed.
+  _CCCL_HOST_DEVICE __lazy() noexcept {}
+
+  // Intentionally empty: the union member is not destroyed automatically.
+  _CCCL_HOST_DEVICE ~__lazy() {}
+
+  /// Constructs the value in place from `__ts...` using list-initialization.
+  /// @return A reference to the newly constructed value.
+  template <class... _Ts>
+  _CCCL_HOST_DEVICE _Ty& construct(_Ts&&... __ts) noexcept(__nothrow_constructible<_Ty, _Ts...>)
+  {
+    // The local must not be named `__value_`: a declaration is already in
+    // scope inside its own initializer, so the placement address would be that
+    // of the local pointer rather than of the union member.
+    _Ty* __ptr = ::new (static_cast<void*>(_CUDA_VSTD::addressof(__value_))) _Ty{static_cast<_Ts&&>(__ts)...};
+    return *_CUDA_VSTD::launder(__ptr);
+  }
+
+  /// Constructs the value in place from the result of `__fn(__ts...)`.
+  /// @return A reference to the newly constructed value.
+  template <class _Fn, class... _Ts>
+  _CCCL_HOST_DEVICE _Ty& construct_from(_Fn&& __fn, _Ts&&... __ts) noexcept(__nothrow_callable<_Fn, _Ts...>)
+  {
+    // See construct() for why the local has a distinct name.
+    _Ty* __ptr = ::new (static_cast<void*>(_CUDA_VSTD::addressof(__value_)))
+      _Ty{static_cast<_Fn&&>(__fn)(static_cast<_Ts&&>(__ts)...)};
+    return *_CUDA_VSTD::launder(__ptr);
+  }
+
+  /// Destroys the value. The value must have been constructed beforehand.
+  _CCCL_HOST_DEVICE void destroy() noexcept
+  {
+    // addressof is used, as in construct(), so an overloaded operator& on
+    // `_Ty` cannot interfere.
+    _CUDA_VSTD::destroy_at(_CUDA_VSTD::addressof(__value_));
+  }
+
+  union
+  {
+    _Ty __value_;
+  };
+};
+
+namespace __detail
+{
+/// Raw, suitably aligned byte storage of `_Size` bytes. `_Idx` is not used in
+/// the body; it presumably exists so that boxes for equal-sized elements are
+/// distinct types and can all serve as base classes of one tuple.
+template <size_t _Idx, size_t _Size, size_t _Align>
+struct __lazy_box_
+{
+  static_assert(_Size != 0);
+  alignas(_Align) unsigned char __data_[_Size];
+};
+
+/// Storage box with the size and alignment of `_Ty`.
+template <size_t _Idx, class _Ty>
+using __lazy_box = __lazy_box_<_Idx, sizeof(_Ty), alignof(_Ty)>;
+} // namespace __detail
+
+// Primary template; specialized below on an index pack.
+template <class _Idx, class... _Ts>
+struct __lazy_tupl;
+
+/// Empty tuple: has no storage.
+template <>
+struct __lazy_tupl<__mindices<>>
+{
+  /// Invokes `__fn` with `__us...` only; the tuple argument is ignored because
+  /// it contributes no elements.
+  template <class _Fn, class _Self, class... _Us>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE static auto __apply(_Fn&& __fn, _Self&&, _Us&&... __us) //
+    noexcept(__nothrow_callable<_Fn, _Us...>) -> __call_result_t<_Fn, _Us...>
+  {
+    return static_cast<_Fn&&>(__fn)(static_cast<_Us&&>(__us)...);
+  }
+};
+
+/// Tuple whose elements are constructed on demand. Each element has its own
+/// raw storage box as a base class, and `__engaged_` records which elements
+/// have been constructed.
+template <size_t... _Idx, class... _Ts>
+struct __lazy_tupl<__mindices<_Idx...>, _Ts...> : __detail::__lazy_box<_Idx, _Ts>...
+{
+  // The element type at position `_Ny`.
+  template <size_t _Ny>
+  using __at = __m_at_c<_Ny, _Ts...>;
+
+  // Constructs no elements; all engaged flags start out false.
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE __lazy_tupl() noexcept {}
+
+  /// Destroys exactly those elements whose engaged flag is set.
+  _CCCL_HOST_DEVICE ~__lazy_tupl()
+  {
+    ((__engaged_[_Idx] ? _CUDA_VSTD::destroy_at(__get<_Idx, _Ts>()) : void(0)), ...);
+  }
+
+  /// Returns the storage of element `_Ny` viewed as a `_Ty*`. No check is made
+  /// that the element has been constructed.
+  template <size_t _Ny, class _Ty>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE _Ty* __get() noexcept
+  {
+    return reinterpret_cast<_Ty*>(this->__detail::__lazy_box<_Ny, _Ty>::__data_);
+  }
+
+  /// Constructs element `_Ny` in its storage from `__us...` and marks it
+  /// engaged.
+  /// @return A reference to the new element.
+  template <size_t _Ny, class... _Us>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE __at<_Ny>& __emplace(_Us&&... __us) //
+    noexcept(__nothrow_constructible<__at<_Ny>, _Us...>)
+  {
+    using _Ty       = __at<_Ny>;
+    _Ty* __value_   = ::new (static_cast<void*>(__get<_Ny, _Ty>())) _Ty{static_cast<_Us&&>(__us)...};
+    // The flag is set only after construction, so it stays false if the
+    // constructor throws.
+    __engaged_[_Ny] = true;
+    return *_CUDA_VSTD::launder(__value_);
+  }
+
+  /// Invokes `__fn` with `__us...` followed by every element of `__self`, each
+  /// cast according to the cv/ref qualification of `_Self`.
+  /// NOTE(review): the engaged flags are not consulted here, so callers
+  /// presumably must have emplaced every element first -- confirm.
+  template <class _Fn, class _Self, class... _Us>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE static auto __apply(_Fn&& __fn, _Self&& __self, _Us&&... __us) //
+    noexcept(__nothrow_callable<_Fn, _Us..., __copy_cvref_t<_Self, _Ts>...>)
+      -> __call_result_t<_Fn, _Us..., __copy_cvref_t<_Self, _Ts>...>
+  {
+    return static_cast<_Fn&&>(__fn)(
+      static_cast<_Us&&>(__us)..., static_cast<__copy_cvref_t<_Self, _Ts>&&>(*__self.template __get<_Idx, _Ts>())...);
+  }
+
+  // One flag per element; true once the element has been emplaced.
+  bool __engaged_[sizeof...(_Ts)] = {};
+};
+
+// `__lazy_tuple<_Ts...>` names the `__lazy_tupl` specialization for `_Ts...`
+// together with the index sequence 0 .. sizeof...(_Ts) - 1.
+#if defined(_CCCL_COMPILER_MSVC)
+// For MSVC the alias is computed through a helper struct.
+// NOTE(review): presumably a compiler workaround -- confirm.
+template <class... _Ts>
+struct __mk_lazy_tuple_
+{
+  using __indices_t = __mmake_indices<sizeof...(_Ts)>;
+  using type        = __lazy_tupl<__indices_t, _Ts...>;
+};
+
+template <class... _Ts>
+using __lazy_tuple = __t<__mk_lazy_tuple_<_Ts...>>;
+#else
+template <class... _Ts>
+using __lazy_tuple = __lazy_tupl<__mmake_indices<sizeof...(_Ts)>, _Ts...>;
+#endif
+
+// A lazy tuple whose element types are the decayed versions of `_Ts...`.
+template <class... _Ts>
+using __decayed_lazy_tuple = __lazy_tuple<__decay_t<_Ts>...>;
+
+} // namespace cuda::experimental::__async
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/let_value.cuh b/cudax/include/cuda/experimental/__async/let_value.cuh
new file mode 100644
index 0000000000..fb0c54cc4e
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/let_value.cuh
@@ -0,0 +1,326 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_LET_VALUE
+#define __CUDAX_ASYNC_DETAIL_LET_VALUE
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__type_traits/conditional.h>
+
+#include <cuda/experimental/__async/completion_signatures.cuh>
+#include <cuda/experimental/__async/cpos.cuh>
+#include <cuda/experimental/__async/exception.cuh>
+#include <cuda/experimental/__async/rcvr_ref.cuh>
+#include <cuda/experimental/__async/tuple.cuh>
+#include <cuda/experimental/__async/variant.cuh>
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+// Declare types to use for diagnostics:
+struct _FUNCTION_MUST_RETURN_A_SENDER;
+
+// Forward-declare the let_* algorithm tag types:
+struct let_value_t;
+struct let_error_t;
+struct let_stopped_t;
+
+// Map from a disposition to the corresponding tag types:
+namespace __detail
+{
+template <__disposition_t, class _Void = void>
+extern __undefined<_Void> __let_tag;
+template <class _Void>
+extern __fn_t<let_value_t>* __let_tag<__value, _Void>;
+template <class _Void>
+extern __fn_t<let_error_t>* __let_tag<__error, _Void>;
+template <class _Void>
+extern __fn_t<let_stopped_t>* __let_tag<__stopped, _Void>;
+} // namespace __detail
+
+template <__disposition_t _Disposition>
+struct __let
+{
+#if !defined(_CCCL_CUDA_COMPILER_NVCC)
+
+private:
+#endif // _CCCL_CUDA_COMPILER_NVCC
+  using _LetTag = decltype(__detail::__let_tag<_Disposition>());
+  using _SetTag = decltype(__detail::__set_tag<_Disposition>());
+
+  template <class...>
+  using __empty_tuple = __tuple<>;
+
+  /// @brief Computes the type of a variant of tuples to hold the results of
+  /// the predecessor sender.
+  template <class _CvSndr, class _Rcvr>
+  using __results =
+    __gather_completion_signatures<completion_signatures_of_t<_CvSndr, _Rcvr>,
+                                   _SetTag,
+                                   __decayed_tuple,
+                                   __empty_tuple,
+                                   __variant>;
+
+  template <class _Fn, class _Rcvr>
+  struct __opstate_fn
+  {
+    template <class... _As>
+    using __f = connect_result_t<__call_result_t<_Fn, __decay_t<_As>&...>, __rcvr_ref_t<_Rcvr&>>;
+  };
+
+  /// @brief Computes the type of a variant of operation states to hold
+  /// the second operation state.
+  template <class _CvSndr, class _Fn, class _Rcvr>
+  using __opstate2_t =
+    __gather_completion_signatures<completion_signatures_of_t<_CvSndr, _Rcvr>,
+                                   _SetTag,
+                                   __opstate_fn<_Fn, _Rcvr>::template __f,
+                                   __empty_tuple,
+                                   __variant>;
+
+  template <class _Fn, class _Rcvr>
+  struct __completions_fn
+  {
+    using __error_non_sender_return = //
+      _ERROR<_WHERE(_IN_ALGORITHM, _LetTag), _WHAT(_FUNCTION_MUST_RETURN_A_SENDER), _WITH_FUNCTION(_Fn)>;
+
+    template <class _Ty>
+    using __ensure_sender = //
+      _CUDA_VSTD::_If<__is_sender<_Ty> || __is_error<_Ty>, _Ty, __error_non_sender_return>;
+
+    template <class... _As>
+    using __error_not_callable_with = //
+      _ERROR<_WHERE(_IN_ALGORITHM, _LetTag),
+             _WHAT(_FUNCTION_IS_NOT_CALLABLE),
+             _WITH_FUNCTION(_Fn),
+             _WITH_ARGUMENTS(_As...)>;
+
+    // This computes the result of calling the function with the
+    // predecessor sender's results. If the function is not callable with
+    // the results, it returns an _ERROR.
+    template <class... _As>
+    using __call_result =
+      __minvoke<__mtry_quote<__call_result_t, __error_not_callable_with<_As...>>, _Fn, __decay_t<_As>&...>;
+
+    // This computes the completion signatures of the sender returned by the
+    // function when called with the given arguments. It returns an _ERROR if
+    // the function is not callable with the arguments or if the function
+    // returns a non-sender.
+    template <class... _As>
+    using __f =
+      __mtry_invoke_q<completion_signatures_of_t, __ensure_sender<__call_result<_As...>>, __rcvr_ref_t<_Rcvr&>>;
+  };
+
+  /// @brief Computes the completion signatures of the
+  /// `let_(value|error|stopped)` sender.
+  template <class _CvSndr, class _Fn, class _Rcvr>
+  using __completions =
+    __gather_completion_signatures<completion_signatures_of_t<_CvSndr, _Rcvr>,
+                                   _SetTag,
+                                   __completions_fn<_Fn, _Rcvr>::template __f,
+                                   __default_completions,
+                                   __mbind_front<__mtry_quote<__concat_completion_signatures>, __eptr_completion>::__f>;
+
+  /// @brief The `let_(value|error|stopped)` operation state.
+  /// @tparam _CvSndr The cvref-qualified predecessor sender type.
+  /// @tparam _Fn The function to be called when the predecessor sender
+  /// completes.
+  /// @tparam _Rcvr The receiver connected to the `let_(value|error|stopped)`
+  /// sender.
+  template <class _Rcvr, class _CvSndr, class _Fn>
+  struct __opstate_t
+  {
+    _CCCL_HOST_DEVICE friend env_of_t<_Rcvr> get_env(const __opstate_t* __self) noexcept
+    {
+      return __async::get_env(__self->__rcvr_);
+    }
+
+    using operation_state_concept = operation_state_t;
+    using completion_signatures   = __completions<_CvSndr, _Fn, _Rcvr>;
+
+    // Don't try to compute the type of the variant of operation states
+    // if the computation of the completion signatures failed.
+    using __deferred_opstate_fn = __mbind_back<__mtry_quote<__opstate2_t>, _CvSndr, _Fn, _Rcvr>;
+    using __opstate_variant_fn =
+      _CUDA_VSTD::_If<__is_error<completion_signatures>, __malways<__empty>, __deferred_opstate_fn>;
+    using __opstate_variant_t = __mtry_invoke<__opstate_variant_fn>;
+
+    _Rcvr __rcvr_;
+    _Fn __fn_;
+    __results<_CvSndr, __opstate_t*> __result_;
+    connect_result_t<_CvSndr, __opstate_t*> __opstate1_;
+    __opstate_variant_t __opstate2_;
+
+    _CCCL_HOST_DEVICE __opstate_t(_CvSndr&& __sndr, _Fn __fn, _Rcvr __rcvr) noexcept(
+      __nothrow_decay_copyable<_Fn, _Rcvr> && __nothrow_connectable<_CvSndr, __opstate_t*>)
+        : __rcvr_(static_cast<_Rcvr&&>(__rcvr))
+        , __fn_(static_cast<_Fn&&>(__fn))
+        , __opstate1_(__async::connect(static_cast<_CvSndr&&>(__sndr), this))
+    {}
+
+    _CCCL_HOST_DEVICE void start() noexcept
+    {
+      __async::start(__opstate1_);
+    }
+
+    template <class _Tag, class... _As> // Shared implementation behind set_value / set_error / set_stopped of this operation state.
+    _CCCL_HOST_DEVICE void __complete(_Tag, _As&&... __as) noexcept
+    {
+      if constexpr (_CUDA_VSTD::is_same_v<_Tag, _SetTag>) // the completion kind this let_* algorithm intercepts
+      {
+        _CUDAX_TRY( //
+          ({ //
+            // Store the results so the lvalue refs we pass to the function
+            // will be valid for the duration of the async op.
+            auto& __tupl = __result_.template __emplace<__decayed_tuple<_As...>>(static_cast<_As&&>(__as)...);
+            if constexpr (!__is_error<completion_signatures>) // skip the second stage when the signature computation produced an _ERROR
+            {
+              // Call the function with the results and connect the resulting
+              // sender, storing the operation state in __opstate2_.
+              auto& __nextop = __opstate2_.__emplace_from(
+                __async::connect, __tupl.__apply(static_cast<_Fn&&>(__fn_), __tupl), __async::__rcvr_ref(__rcvr_));
+              __async::start(__nextop);
+            }
+          }),
+          _CUDAX_CATCH(...)( //
+            { //
+              __async::set_error(static_cast<_Rcvr&&>(__rcvr_), ::std::current_exception()); // anything thrown in the block above is delivered on the error channel
+            }))
+      }
+      else
+      {
+        // Forward the completion to the receiver unchanged.
+        _Tag()(static_cast<_Rcvr&&>(__rcvr_), static_cast<_As&&>(__as)...);
+      }
+    }
+
+    template <class... _As>
+    _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void set_value(_As&&... __as) noexcept
+    {
+      __complete(set_value_t(), static_cast<_As&&>(__as)...);
+    }
+
+    template <class _Error>
+    _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void set_error(_Error&& __error) noexcept
+    {
+      __complete(set_error_t(), static_cast<_Error&&>(__error));
+    }
+
+    _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void set_stopped() noexcept
+    {
+      __complete(set_stopped_t());
+    }
+  };
+
+  /// @brief The `let_(value|error|stopped)` sender.
+  /// @tparam _Sndr The predecessor sender.
+  /// @tparam _Fn The function to be called when the predecessor sender
+  /// completes.
+  template <class _Sndr, class _Fn>
+  struct __sndr_t
+  {
+    using sender_concept = sender_t;
+    _CCCL_NO_UNIQUE_ADDRESS _LetTag __tag_;
+    _Fn __fn_;
+    _Sndr __sndr_;
+
+    template <class _Rcvr>
+    _CCCL_HOST_DEVICE auto connect(_Rcvr __rcvr) && noexcept(
+      __nothrow_constructible<__opstate_t<_Rcvr, _Sndr, _Fn>, _Sndr, _Fn, _Rcvr>) -> __opstate_t<_Rcvr, _Sndr, _Fn>
+    {
+      return __opstate_t<_Rcvr, _Sndr, _Fn>(
+        static_cast<_Sndr&&>(__sndr_), static_cast<_Fn&&>(__fn_), static_cast<_Rcvr&&>(__rcvr));
+    }
+
+    template <class _Rcvr>
+    _CCCL_HOST_DEVICE auto connect(_Rcvr __rcvr) const& noexcept( //
+      __nothrow_constructible<__opstate_t<_Rcvr, const _Sndr&, _Fn>,
+                              const _Sndr&,
+                              const _Fn&,
+                              _Rcvr>) //
+      -> __opstate_t<_Rcvr, const _Sndr&, _Fn>
+    {
+      return __opstate_t<_Rcvr, const _Sndr&, _Fn>(__sndr_, __fn_, static_cast<_Rcvr&&>(__rcvr));
+    }
+
+    _CCCL_HOST_DEVICE env_of_t<_Sndr> get_env() const noexcept
+    {
+      return __async::get_env(__sndr_);
+    }
+  };
+
+  template <class _Fn>
+  struct __closure_t
+  {
+    using _LetTag = decltype(__detail::__let_tag<_Disposition>());
+    _Fn __fn_;
+
+    template <class _Sndr>
+    _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto operator()(_Sndr __sndr) const //
+      -> __call_result_t<_LetTag, _Sndr, _Fn>
+    {
+      return _LetTag()(static_cast<_Sndr&&>(__sndr), __fn_);
+    }
+
+    template <class _Sndr>
+    _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE friend auto operator|(_Sndr __sndr, const __closure_t& __self) //
+      -> __call_result_t<_LetTag, _Sndr, _Fn>
+    {
+      return _LetTag()(static_cast<_Sndr&&>(__sndr), __self.__fn_);
+    }
+  };
+
+public:
+  template <class _Sndr, class _Fn>
+  _CCCL_HOST_DEVICE __sndr_t<_Sndr, _Fn> operator()(_Sndr __sndr, _Fn __fn) const
+  {
+    // If the incoming sender is non-dependent, we can check the completion
+    // signatures of the composed sender immediately.
+    if constexpr (__is_non_dependent_sender<_Sndr>)
+    {
+      using __completions = completion_signatures_of_t<__sndr_t<_Sndr, _Fn>>;
+      static_assert(__is_completion_signatures<__completions>);
+    }
+    return __sndr_t<_Sndr, _Fn>{{}, static_cast<_Fn&&>(__fn), static_cast<_Sndr&&>(__sndr)};
+  }
+
+  template <class _Fn>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE auto operator()(_Fn __fn) const noexcept
+  {
+    return __closure_t<_Fn>{static_cast<_Fn&&>(__fn)};
+  }
+};
+
+_CCCL_GLOBAL_CONSTANT struct let_value_t : __let<__value>
+{
+} let_value{};
+
+_CCCL_GLOBAL_CONSTANT struct let_error_t : __let<__error>
+{
+} let_error{};
+
+_CCCL_GLOBAL_CONSTANT struct let_stopped_t : __let<__stopped>
+{
+} let_stopped{};
+} // namespace cuda::experimental::__async
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/meta.cuh b/cudax/include/cuda/experimental/__async/meta.cuh
new file mode 100644
index 0000000000..78081cba74
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/meta.cuh
@@ -0,0 +1,753 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_META
+#define __CUDAX_ASYNC_DETAIL_META
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__type_traits/integral_constant.h>
+#include <cuda/std/__type_traits/is_base_of.h>
+#include <cuda/std/__utility/integer_sequence.h>
+
+#include <cuda/experimental/__async/config.cuh>
+
+#if __cpp_lib_three_way_comparison
+#  include <compare>
+#endif
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+_CCCL_DIAG_PUSH
+_CCCL_DIAG_SUPPRESS_CLANG("-Wgnu-string-literal-operator-template")
+_CCCL_DIAG_SUPPRESS_GCC("-Wnon-template-friend")
+
+namespace cuda::experimental::__async
+{
+template <class _Ret, class... _Args>
+using __fn_t = _Ret(_Args...);
+
+template <class _Ret, class... _Args>
+using __nothrow_fn_t = _Ret(_Args...) noexcept;
+
+template <class _Ty>
+_Ty&& __declval() noexcept;
+
+template <class...>
+using __mvoid = void;
+
+template <class _Ty>
+struct __mtype
+{
+  using type = _Ty;
+};
+
+template <class _Ty>
+using __t = typename _Ty::type;
+
+template <class... _Ts>
+struct __mlist;
+
+template <auto _Val>
+struct __mvalue
+{
+  static constexpr auto __value = _Val;
+};
+
+// A separate __mbool template is needed in addition to __mvalue
+// because of an EDG bug in the handling of auto template parameters.
+template <bool _Val>
+struct __mbool
+{
+  static constexpr auto __value = _Val;
+};
+
+using __mtrue  = __mbool<true>;
+using __mfalse = __mbool<false>;
+
+template <auto... _Vals>
+struct __mvalues;
+
+template <size_t... _Vals>
+struct __moffsets;
+
+template <class... _Bools>
+using __mand = __mbool<(_Bools::__value && ...)>;
+
+template <class... _Bools>
+using __mor = __mbool<(_Bools::__value || ...)>;
+
+template <size_t... _Idx>
+using __mindices = _CUDA_VSTD::index_sequence<_Idx...>*;
+
+template <size_t Count>
+using __mmake_indices = _CUDA_VSTD::make_index_sequence<Count>*;
+
+template <class... _Ts>
+using __mmake_indices_for = _CUDA_VSTD::make_index_sequence<sizeof...(_Ts)>*;
+
+// Rounds __size up to the nearest power of two (values that already are a
+// power of two are returned unchanged).
+constexpr size_t __mpow2(size_t __size) noexcept
+{
+  // Work on __size - 1 so that exact powers of two map to themselves.
+  size_t __bits = __size - 1;
+  // Smear the highest set bit into every lower position by OR-ing in shifted
+  // copies, doubling the shift distance until it spans the whole word
+  // (1, 2, 4, ... < number of bits in size_t; 8-bit bytes are assumed).
+  for (size_t __shift = 1; __shift < sizeof(size_t) * 8; __shift *= 2)
+  {
+    __bits |= __bits >> __shift;
+  }
+  // __bits now has the form 0...01...1; adding one yields the power of two.
+  // Unsigned wrap-around makes an input of 0 produce 0.
+  return __bits + 1;
+}
+
+template <class _Ty>
+constexpr _Ty __mmin(_Ty __lhs, _Ty __rhs) noexcept
+{
+  return __lhs < __rhs ? __lhs : __rhs;
+}
+
+template <class _Ty>
+constexpr int __mcompare(_Ty __lhs, _Ty __rhs) noexcept
+{
+  return __lhs < __rhs ? -1 : __lhs > __rhs ? 1 : 0;
+}
+
+template <size_t _Len>
+struct __mstring
+{
+  template <size_t _Ny, size_t... _Is>
+  constexpr __mstring(const char (&__str)[_Ny], __mindices<_Is...>) noexcept
+      : __len_{_Ny}
+      , __what_{(_Is < _Ny ? __str[_Is] : '\0')...}
+  {}
+
+  template <size_t _Ny>
+  constexpr __mstring(const char (&__str)[_Ny], int = 0) noexcept
+      : __mstring{__str, __mmake_indices<_Len>{}}
+  {}
+
+  constexpr auto length() const noexcept -> size_t
+  {
+    return __len_;
+  }
+
+  template <size_t _OtherLen> // Three-way comparison with __other: returns a negative value, zero, or a positive value.
+  constexpr int compare(const __mstring<_OtherLen>& __other) const noexcept
+  { // Locals use reserved `__` names like the rest of this header so that user-defined macros cannot clash with them.
+    size_t const __len = __mmin(__len_, __other.__len_); // only the common prefix is compared character by character
+    for (size_t __i = 0; __i < __len; ++__i)
+    {
+      if (auto const __cmp = __mcompare(__what_[__i], __other.__what_[__i]))
+      {
+        return __cmp; // the first differing character decides
+      }
+    }
+    return __mcompare(__len_, __other.__len_); // equal prefixes: the string with the smaller recorded length orders first
+  }
+
+  template <size_t _OtherLen>
+  constexpr auto operator==(const __mstring<_OtherLen>& __other) const noexcept -> bool
+  {
+    return __len_ == __other.__len_ && compare(__other) == 0;
+  }
+
+  template <size_t _OtherLen>
+  constexpr auto operator!=(const __mstring<_OtherLen>& __other) const noexcept -> bool
+  {
+    return !operator==(__other);
+  }
+
+  template <size_t _OtherLen>
+  constexpr auto operator<(const __mstring<_OtherLen>& __other) const noexcept -> bool
+  {
+    return compare(__other) < 0;
+  }
+
+  template <size_t _OtherLen>
+  constexpr auto operator>(const __mstring<_OtherLen>& __other) const noexcept -> bool
+  {
+    return compare(__other) > 0;
+  }
+
+  template <size_t _OtherLen>
+  constexpr auto operator<=(const __mstring<_OtherLen>& __other) const noexcept -> bool
+  {
+    return compare(__other) <= 0;
+  }
+
+  template <size_t _OtherLen>
+  constexpr auto operator>=(const __mstring<_OtherLen>& __other) const noexcept -> bool
+  {
+    return compare(__other) >= 0;
+  }
+
+  size_t __len_;
+  char __what_[_Len];
+};
+
+template <size_t _Len>
+__mstring(const char (&__str)[_Len]) -> __mstring<_Len>;
+
+template <size_t _Len>
+__mstring(const char (&__str)[_Len], int) -> __mstring<__mpow2(_Len)>;
+
+template <class _Ty>
+constexpr auto __mnameof() noexcept
+{
+#if defined(_CCCL_COMPILER_MSVC)
+  return __mstring{__FUNCSIG__, 0};
+#else
+  return __mstring{__PRETTY_FUNCTION__, 0};
+#endif
+}
+
+// The following must be left undefined
+template <class...>
+struct _DIAGNOSTIC;
+
+struct _WHERE;
+
+struct _IN_ALGORITHM;
+
+struct _WHAT;
+
+struct _WITH_FUNCTION;
+
+struct _WITH_SENDER;
+
+struct _WITH_ARGUMENTS;
+
+struct _WITH_QUERY;
+
+struct _WITH_ENVIRONMENT;
+
+template <class>
+struct _WITH_COMPLETION_SIGNATURE;
+
+struct _FUNCTION_IS_NOT_CALLABLE;
+
+struct _UNKNOWN;
+
+struct _SENDER_HAS_TOO_MANY_SUCCESS_COMPLETIONS;
+
+template <class... _Sigs>
+struct _WITH_COMPLETIONS
+{};
+
+struct __merror_base
+{
+  constexpr friend bool __ustdex_unhandled_error(void*) noexcept
+  {
+    return true;
+  }
+};
+
+template <class... _What>
+struct _ERROR : __merror_base
+{
+  template <class...>
+  using __f = _ERROR;
+
+  _ERROR operator+();
+
+  template <class _Ty>
+  _ERROR& operator,(_Ty&);
+
+  template <class... _With>
+  _ERROR<_What..., _With...>& with(_ERROR<_With...>&);
+};
+
+constexpr bool __ustdex_unhandled_error(...) noexcept
+{
+  return false;
+}
+
+template <class _Ty>
+_CCCL_INLINE_VAR constexpr bool __is_error = false;
+
+template <class... _What>
+_CCCL_INLINE_VAR constexpr bool __is_error<_ERROR<_What...>> = true;
+
+template <class... _What>
+_CCCL_INLINE_VAR constexpr bool __is_error<_ERROR<_What...>&> = true;
+
+// True if any of the types in _Ts... are errors; false otherwise.
+template <class... _Ts>
+_CCCL_INLINE_VAR constexpr bool __contains_error =
+#if defined(_CCCL_COMPILER_MSVC)
+  (__is_error<_Ts> || ...); // MSVC: plain fold over the __is_error trait
+#else
+  __ustdex_unhandled_error(static_cast<__mlist<_Ts...>*>(nullptr)); // picks the __merror_base friend (returns true) when ADL reaches it, e.g. through an _ERROR among _Ts...; otherwise the (...) overload (returns false)
+#endif
+
+template <class... _Ts> // Yields the first _ERROR among _Ts..., or _ERROR<_UNKNOWN> when none of them is an _ERROR.
+using __find_error = decltype(+(__declval<_Ts&>(), ..., __declval<_ERROR<_UNKNOWN>&>())); // comma fold: _ERROR::operator, keeps its left operand, the built-in comma keeps the right one; unary + maps _ERROR& to _ERROR
+
+template <template <class...> class _Fn, class... _Ts>
+using __minvoke_q = _Fn<_Ts...>;
+
+template <class _Fn, class... _Ts>
+using __minvoke = typename _Fn::template __f<_Ts...>;
+
+template <class _Fn, class _Ty>
+using __minvoke1 = typename _Fn::template __f<_Ty>;
+
+template <class _Fn, template <class...> class _Cy, class... _Ts, class _Ret = __minvoke<_Fn, _Ts...>>
+auto __apply_fn(_Cy<_Ts...>*) -> _Ret;
+
+template <template <class...> class _Fn, template <class...> class _Cy, class... _Ts, class _Ret = _Fn<_Ts...>>
+auto __apply_fn_q(_Cy<_Ts...>*) -> _Ret;
+
+template <class _Fn, class _List>
+using __mapply = decltype(__async::__apply_fn<_Fn>(static_cast<_List*>(nullptr)));
+
+template <template <class...> class _Fn, class _List>
+using __mapply_q = decltype(__async::__apply_fn_q<_Fn>(static_cast<_List*>(nullptr)));
+
+template <class _Ty, class...>
+using __mfront = _Ty;
+
+template <template <class...> class _Fn, class _List, class _Enable = void>
+_CCCL_INLINE_VAR constexpr bool __mvalid_ = false;
+
+template <template <class...> class _Fn, class... _Ts>
+_CCCL_INLINE_VAR constexpr bool __mvalid_<_Fn, __mlist<_Ts...>, __mvoid<_Fn<_Ts...>>> = true;
+
+template <template <class...> class _Fn, class... _Ts>
+_CCCL_INLINE_VAR constexpr bool __mvalid_q = __mvalid_<_Fn, __mlist<_Ts...>>;
+
+template <class _Fn, class... _Ts>
+_CCCL_INLINE_VAR constexpr bool __mvalid = __mvalid_<_Fn::template __f, __mlist<_Ts...>>;
+
+template <class _Tp>
+_CCCL_INLINE_VAR constexpr auto __v = _Tp::__value;
+
+template <auto _Value>
+_CCCL_INLINE_VAR constexpr auto __v<__mvalue<_Value>> = _Value;
+
+template <bool _Value>
+_CCCL_INLINE_VAR constexpr auto __v<__mbool<_Value>> = _Value;
+
+template <class _Tp, _Tp _Value>
+_CCCL_INLINE_VAR constexpr auto __v<_CUDA_VSTD::integral_constant<_Tp, _Value>> = _Value;
+
+struct __midentity
+{
+  template <class _Ty>
+  using __f = _Ty;
+};
+
+template <class _Ty>
+struct __malways
+{
+  template <class...>
+  using __f = _Ty;
+};
+
+template <class _Ty>
+struct __malways1
+{
+  template <class>
+  using __f = _Ty;
+};
+
+template <bool>
+struct __mif_
+{
+  template <class _Then, class...>
+  using __f = _Then;
+};
+
+template <>
+struct __mif_<false>
+{
+  template <class _Then, class _Else>
+  using __f = _Else;
+};
+
+template <bool If, class _Then = void, class... _Else>
+using __mif = typename __mif_<If>::template __f<_Then, _Else...>;
+
+template <class If, class _Then = void, class... _Else>
+using __mif_t = typename __mif_<__v<If>>::template __f<_Then, _Else...>;
+
+template <bool _Error>
+struct __midentity_or_error_with_
+{
+  template <class _Ty, class... _With>
+  using __f = _Ty;
+};
+
+template <>
+struct __midentity_or_error_with_<true>
+{
+  template <class _Ty, class... _With>
+  using __f = decltype(__declval<_Ty&>().with(__declval<_ERROR<_With...>&>()));
+};
+
+template <class _Ty, class... _With>
+using __midentity_or_error_with = __minvoke<__midentity_or_error_with_<__is_error<_Ty>>, _Ty, _With...>;
+
+template <bool>
+struct __mtry_;
+
+template <>
+struct __mtry_<false>
+{
+  template <template <class...> class _Fn, class... _Ts>
+  using __g = _Fn<_Ts...>;
+
+  template <class _Fn, class... _Ts>
+  using __f = typename _Fn::template __f<_Ts...>;
+};
+
+template <>
+struct __mtry_<true>
+{
+  template <template <class...> class _Fn, class... _Ts>
+  using __g = __find_error<_Ts...>;
+
+  template <class _Fn, class... _Ts>
+  using __f = __find_error<_Fn, _Ts...>;
+};
+
+template <class _Fn, class... _Ts>
+using __mtry_invoke = typename __mtry_<__contains_error<_Ts...>>::template __f<_Fn, _Ts...>;
+
+template <template <class...> class _Fn, class... _Ts>
+using __mtry_invoke_q = typename __mtry_<__contains_error<_Ts...>>::template __g<_Fn, _Ts...>;
+
+template <class _Fn>
+struct __mtry
+{
+  template <class... _Ts>
+  using __f = __mtry_invoke<_Fn, _Ts...>;
+};
+
+template <class _Fn> // Metafunction wrapper: __mpoly<_Fn>::__f<_Ts...> evaluates to _Fn::__f<_Ts...>.
+struct __mpoly
+{ // NOTE(review): presumably the pack-dependent condition below exists to delay the alias expansion until instantiation -- confirm.
+  template <class... _Ts> // `sizeof...(_Ts) == ~0ul` is false for any realistic pack, so __mtry_<false> is selected
+  using __f = typename __mtry_<(sizeof...(_Ts) == ~0ul)>::template __f<_Fn, _Ts...>; // __mtry_<false>::__f simply invokes _Fn::__f<_Ts...>
+};
+
+template <template <class...> class _Fn>
+struct __mpoly_q
+{
+  template <class... _Ts>
+  using __f = typename __mtry_<(sizeof...(_Ts) == ~0ul)>::template __g<_Fn, _Ts...>;
+};
+
+template <template <class...> class _Fn, class... _Default>
+struct __mquote;
+
+template <template <class...> class _Fn>
+struct __mquote<_Fn>
+{
+  template <class... _Ts>
+  using __f = _Fn<_Ts...>;
+};
+
+template <template <class...> class _Fn, class _Default>
+struct __mquote<_Fn, _Default>
+{
+  template <class... _Ts>
+  using __f = typename __mif<__mvalid_q<_Fn, _Ts...>, __mquote<_Fn>, __malways<_Default>>::template __f<_Ts...>;
+};
+
+template <template <class...> class _Fn, class... _Default>
+struct __mtry_quote;
+
+template <template <class...> class _Fn>
+struct __mtry_quote<_Fn>
+{
+  template <class... _Ts>
+  using __f = typename __mtry_<__contains_error<_Ts...>>::template __g<_Fn, _Ts...>;
+};
+
+template <template <class...> class _Fn, class _Default>
+struct __mtry_quote<_Fn, _Default>
+{
+  template <class... _Ts>
+  using __f = typename __mif<__mvalid_q<_Fn, _Ts...>, __mtry_quote<_Fn>, __malways<_Default>>::template __f<_Ts...>;
+};
+
+template <class _Fn, class... _Ts>
+struct __mbind_front
+{
+  template <class... _Us>
+  using __f = __minvoke<_Fn, _Ts..., _Us...>;
+};
+
+template <class _Fn, class _Ty>
+struct __mbind_front1
+{
+  template <class... _Us>
+  using __f = __minvoke<_Fn, _Ty, _Us...>;
+};
+
+template <template <class...> class _Fn, class... _Ts>
+struct __mbind_front_q
+{
+  template <class... _Us>
+  using __f = __minvoke_q<_Fn, _Ts..., _Us...>;
+};
+
+template <class _Fn, class... _Ts>
+struct __mbind_back
+{
+  template <class... _Us>
+  using __f = __minvoke<_Fn, _Us..., _Ts...>;
+};
+
+template <template <class...> class _Fn, class... _Ts>
+struct __mbind_back_q
+{
+  template <class... _Us>
+  using __f = __minvoke_q<_Fn, _Us..., _Ts...>;
+};
+
+#if defined(__cpp_pack_indexing)
+
+template <class _Np, class... _Ts>
+using __m_at = _Ts...[__v<_Np>];
+
+template <size_t _Np, class... _Ts>
+using __m_at_c = _Ts...[_Np];
+
+#elif __has_builtin(__type_pack_element)
+
+template <bool>
+struct __m_at_
+{
+  template <class _Np, class... _Ts>
+  using __f = __type_pack_element<__v<_Np>, _Ts...>;
+};
+
+template <class _Np, class... _Ts>
+using __m_at = __minvoke<__m_at_<__v<_Np> == ~0ul>, _Np, _Ts...>;
+
+template <size_t _Np, class... _Ts>
+using __m_at_c = __minvoke<__m_at_<_Np == ~0ul>, __mvalue<_Np>, _Ts...>;
+
+template <size_t _Idx>
+struct __mget
+{
+  template <class... _Ts>
+  using __f = __m_at<__mvalue<_Idx>, _Ts...>;
+};
+
+#else
+
+template <size_t _Idx>
+struct __mget
+{
+  template <class, class, class, class, class... _Ts>
+  using __f = __minvoke<__mtry_<sizeof...(_Ts) == ~0ull>, __mget<_Idx - 4>, _Ts...>;
+};
+
+template <>
+struct __mget<0>
+{
+  template <class _Ty, class...>
+  using __f = _Ty;
+};
+
+template <>
+struct __mget<1>
+{
+  template <class, class _Ty, class...>
+  using __f = _Ty;
+};
+
+template <>
+struct __mget<2>
+{
+  template <class, class, class _Ty, class...>
+  using __f = _Ty;
+};
+
+template <>
+struct __mget<3>
+{
+  template <class, class, class, class _Ty, class...>
+  using __f = _Ty;
+};
+
+template <class _Np, class... _Ts>
+using __m_at = __minvoke<__mget<__v<_Np>>, _Ts...>;
+
+template <size_t _Np, class... _Ts>
+using __m_at_c = __minvoke<__mget<_Np>, _Ts...>;
+
+#endif
+
+template <class _First, class _Second>
+struct __mpair
+{
+  using first  = _First;
+  using second = _Second;
+};
+
+template <class _Pair>
+using __mfirst = typename _Pair::first;
+
+template <class _Pair>
+using __msecond = typename _Pair::second;
+
+template <template <class...> class _Second, template <class...> class _First>
+struct __mcompose_q
+{
+  template <class... _Ts>
+  using __f = _Second<_First<_Ts...>>;
+};
+
+struct __mcount
+{
+  template <class... _Ts>
+  using __f = __mvalue<sizeof...(_Ts)>;
+};
+
+template <bool> // Recursive step of type-list concatenation; the <true> specialization below ends the recursion.
+struct __mconcat_
+{
+  template <class... _Ts, // elements accumulated so far
+            template <class...> class _Ap = __mlist,
+            class... _As,
+            template <class...> class _Bp = __mlist,
+            class... _Bs,
+            template <class...> class _Cp = __mlist,
+            class... _Cs,
+            template <class...> class _Dp = __mlist,
+            class... _Ds,
+            class... _Tail> // lists still to be processed after this step
+  static auto // declaration only: used inside decltype, never called
+  __f(__mlist<_Ts...>*, // accumulator
+      _Ap<_As...>*, // one to four lists are consumed per step; the defaulted ones deduce as empty __mlist<> when absent
+      _Bp<_Bs...>* = nullptr,
+      _Cp<_Cs...>* = nullptr,
+      _Dp<_Ds...>* = nullptr,
+      _Tail*... __tail)
+    -> decltype(__mconcat_<(sizeof...(_Tail) == 0)>::__f( // recurse with the enlarged accumulator; switch to the terminal case once no lists remain
+      static_cast<__mlist<_Ts..., _As..., _Bs..., _Cs..., _Ds...>*>(nullptr), __tail...));
+};
+
+template <> // Terminal case: only the accumulator is left.
+struct __mconcat_<true>
+{
+  template <class... _As>
+  static auto __f(__mlist<_As...>*) -> __mlist<_As...>; // the accumulated list is the result
+};
+
+template <class _Continuation = __mquote<__mlist>>
+struct __mconcat_into
+{
+  template <class... _Args>
+  using __f =
+    __mapply<_Continuation, decltype(__mconcat_<(sizeof...(_Args) == 0)>::__f({}, static_cast<_Args*>(nullptr)...))>;
+};
+
+template <template <class...> class _Continuation = __mlist>
+struct __mconcat_into_q
+{
+  template <class... _Args>
+  using __f =
+    __mapply_q<_Continuation, decltype(__mconcat_<(sizeof...(_Args) == 0)>::__f({}, static_cast<_Args*>(nullptr)...))>;
+};
+
+// The following must be super-fast to compile, so use an intrinsic directly if it is available
+#if defined(_LIBCUDACXX_IS_BASE_OF) && !defined(_LIBCUDACXX_USE_IS_BASE_OF_FALLBACK)
+
+template <class _Set, class... _Ty>
+_CCCL_INLINE_VAR constexpr bool __mset_contains = (_LIBCUDACXX_IS_BASE_OF(__mtype<_Ty>, _Set) && ...);
+
+#else
+
+template <class _Set, class... _Ty>
+_CCCL_INLINE_VAR constexpr bool __mset_contains = (_CUDA_VSTD::is_base_of_v<__mtype<_Ty>, _Set> && ...);
+
+#endif
+
+namespace __set
+{
+template <class... _Ts>
+struct __inherit
+{};
+
+template <class _Ty, class... _Ts>
+struct __inherit<_Ty, _Ts...>
+    : __mtype<_Ty>
+    , __inherit<_Ts...>
+{};
+
+template <class... _Set>
+auto operator+(__inherit<_Set...>&) -> __inherit<_Set...>;
+
+template <class... _Set, class _Ty>
+auto operator%(__inherit<_Set...>&, __mtype<_Ty>&) //
+  -> __mif< //
+    __mset_contains<__inherit<_Set...>, _Ty>,
+    __inherit<_Set...>,
+    __inherit<_Ty, _Set...>>&;
+
+template <class _ExpectedSet>
+struct __eq
+{
+  static constexpr size_t __count = __v<__mapply<__mcount, _ExpectedSet>>;
+
+  template <class... _Ts>
+  using __f = __mbool<sizeof...(_Ts) == __count && __mset_contains<_ExpectedSet, _Ts...>>;
+};
+} // namespace __set
+
+template <class... _Ts>
+using __mset = __set::__inherit<_Ts...>;
+
+template <class _Set, class... _Ts>
+using __mset_insert = decltype(+(__declval<_Set&>() % ... % __declval<__mtype<_Ts>&>()));
+
+template <class... _Ts>
+using __mmake_set = __mset_insert<__mset<>, _Ts...>;
+
+template <class _Set1, class _Set2>
+_CCCL_INLINE_VAR constexpr bool __mset_eq = __v<__mapply<__set::__eq<_Set1>, _Set2>>;
+
+template <class _Fn> // Metafunction: removes duplicate types from _Ts... and passes the remaining ones to _Fn.
+struct __munique
+{
+  template <class... _Ts> // __mmake_set yields a __set::__inherit<...>, which has no nested __f, so it must be unpacked with __mapply rather than invoked
+  using __f = __mapply<_Fn, __mmake_set<_Ts...>>; // element order is whatever order __mset_insert leaves in the set
+};
+
+template <class _Ty>
+struct __msingle_or
+{
+  template <class _Uy = _Ty>
+  using __f = _Uy;
+};
+} // namespace cuda::experimental::__async
+
+_CCCL_DIAG_POP
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/prologue.cuh b/cudax/include/cuda/experimental/__async/prologue.cuh
new file mode 100644
index 0000000000..179d4416d7
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/prologue.cuh
@@ -0,0 +1,20 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+#include <cuda/std/detail/__config>
+
+#if defined(_CUDAX_ASYNC_PROLOGUE_INCLUDED)
+#  __error multiple inclusion of prologue.cuh
+#endif
+
+#define _CUDAX_ASYNC_PROLOGUE_INCLUDED
+
+_CCCL_DIAG_PUSH
+_CCCL_DIAG_SUPPRESS_GCC("-Wsubobject-linkage")
+_CCCL_DIAG_SUPPRESS_MSVC(4848) // [[no_unique_address]] prior to C++20 as a vendor extension
diff --git a/cudax/include/cuda/experimental/__async/queries.cuh b/cudax/include/cuda/experimental/__async/queries.cuh
new file mode 100644
index 0000000000..557c8d5b59
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/queries.cuh
@@ -0,0 +1,167 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_QUERIES
+#define __CUDAX_ASYNC_DETAIL_QUERIES
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__memory/allocator.h>
+
+#include <cuda/experimental/__async/meta.cuh>
+#include <cuda/experimental/__async/stop_token.cuh>
+#include <cuda/experimental/__async/type_traits.cuh>
+#include <cuda/experimental/__async/utility.cuh>
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+template <class _Ty, class _Query> // Declared but not defined here; only ever named inside decltype below.
+auto __query_result_() -> decltype(__declval<_Ty>().__query(_Query()));
+
+template <class _Ty, class _Query> // Type of <_Ty object>.__query(_Query()); substitution fails if that call is invalid.
+using __query_result_t = decltype(__query_result_<_Ty, _Query>());
+
+template <class _Ty, class _Query> // Whether __query_result_t<_Ty, _Query> is well-formed, as reported by __mvalid_q.
+_CCCL_INLINE_VAR constexpr bool __queryable = __mvalid_q<__query_result_t, _Ty, _Query>;
+
+#if defined(__CUDA_ARCH__)
+template <class _Ty, class _Query> // Device compilation pass: reported as true unconditionally.
+_CCCL_INLINE_VAR constexpr bool __nothrow_queryable = true;
+#else
+template <class _Ty, class _Query> // Names __mif<B>, where B is whether the .__query(_Query()) call is noexcept.
+using __nothrow_queryable_ = __mif<noexcept(__declval<_Ty>().__query(_Query()))>;
+
+template <class _Ty, class _Query> // Host compilation pass: result of __mvalid_q over the alias above.
+_CCCL_INLINE_VAR constexpr bool __nothrow_queryable = __mvalid_q<__nothrow_queryable_, _Ty, _Query>;
+#endif
+
+_CCCL_GLOBAL_CONSTANT struct get_allocator_t // Query tag and callable; `get_allocator` is its global instance.
+{
+  template <class _Env>
+  _CCCL_HOST_DEVICE auto operator()(const _Env& __env) const noexcept // viable only if __env.__query(*this) compiles
+    -> decltype(__env.__query(*this))
+  {
+    static_assert(noexcept(__env.__query(*this))); // the environment's handler must itself be noexcept
+    return __env.__query(*this);
+  }
+
+  _CCCL_HOST_DEVICE auto operator()(__ignore) const noexcept -> _CUDA_VSTD::allocator<void> // the remaining overload
+  {
+    return {}; // value-initialized _CUDA_VSTD::allocator<void>
+  }
+} get_allocator{};
+
+_CCCL_GLOBAL_CONSTANT struct get_stop_token_t // Query tag and callable; `get_stop_token` is its global instance.
+{
+  template <class _Env>
+  _CCCL_HOST_DEVICE auto operator()(const _Env& __env) const noexcept // viable only if __env.__query(*this) compiles
+    -> decltype(__env.__query(*this))
+  {
+    static_assert(noexcept(__env.__query(*this))); // the environment's handler must itself be noexcept
+    return __env.__query(*this);
+  }
+
+  _CCCL_HOST_DEVICE auto operator()(__ignore) const noexcept -> never_stop_token // the remaining overload
+  {
+    return {}; // value-initialized never_stop_token
+  }
+} get_stop_token{};
+
+template <class _Ty> // Decayed result type of invoking get_stop_token_t with a _Ty.
+using stop_token_of_t = __decay_t<__call_result_t<get_stop_token_t, _Ty>>;
+
+template <class _Tag> // Query tag parameterized on _Tag; unlike get_stop_token_t it has no __ignore overload.
+struct get_completion_scheduler_t
+{
+  template <class _Env>
+  _CCCL_HOST_DEVICE auto operator()(const _Env& __env) const noexcept // viable only if __env.__query(*this) compiles
+    -> decltype(__env.__query(*this))
+  {
+    static_assert(noexcept(__env.__query(*this))); // the environment's handler must itself be noexcept
+    return __env.__query(*this);
+  }
+};
+
+template <class _Tag> // Variable template: one get_completion_scheduler_t<_Tag> object per _Tag.
+_CCCL_GLOBAL_CONSTANT get_completion_scheduler_t<_Tag> get_completion_scheduler{};
+
+_CCCL_GLOBAL_CONSTANT struct get_scheduler_t // Query tag and callable; it has no overload besides the forwarding one.
+{
+  template <class _Env>
+  _CCCL_HOST_DEVICE auto operator()(const _Env& __env) const noexcept // viable only if __env.__query(*this) compiles
+    -> decltype(__env.__query(*this))
+  {
+    static_assert(noexcept(__env.__query(*this))); // the environment's handler must itself be noexcept
+    return __env.__query(*this);
+  }
+} get_scheduler{};
+
+_CCCL_GLOBAL_CONSTANT struct get_delegatee_scheduler_t // Query tag and callable; only the forwarding overload exists.
+{
+  template <class _Env>
+  _CCCL_HOST_DEVICE auto operator()(const _Env& __env) const noexcept // viable only if __env.__query(*this) compiles
+    -> decltype(__env.__query(*this))
+  {
+    static_assert(noexcept(__env.__query(*this))); // the environment's handler must itself be noexcept
+    return __env.__query(*this);
+  }
+} get_delegatee_scheduler{};
+
+enum class forward_progress_guarantee // Result type of the get_forward_progress_guarantee query.
+{
+  concurrent,
+  parallel,
+  weakly_parallel // the value get_forward_progress_guarantee_t's __ignore overload returns
+};
+
+_CCCL_GLOBAL_CONSTANT struct get_forward_progress_guarantee_t // Query tag and callable with a built-in default.
+{
+  template <class _Sch>
+  _CCCL_HOST_DEVICE auto operator()(const _Sch& __sch) const noexcept // viable only if __sch.__query(*this) compiles
+    -> decltype(__async::__decay_copy(__sch.__query(*this)))
+  {
+    static_assert(noexcept(__sch.__query(*this))); // the scheduler's handler must itself be noexcept
+    return __sch.__query(*this);
+  }
+
+  _CCCL_HOST_DEVICE auto operator()(__ignore) const noexcept -> forward_progress_guarantee // the remaining overload
+  {
+    return forward_progress_guarantee::weakly_parallel;
+  }
+} get_forward_progress_guarantee{};
+
+_CCCL_GLOBAL_CONSTANT struct get_domain_t // Query tag and callable; `get_domain` is its global instance.
+{
+  template <class _Sch>
+  _CCCL_HOST_DEVICE constexpr auto operator()(const _Sch& __sch) const noexcept // __sch is used only in the decltype
+    -> decltype(__async::__decay_copy(__sch.__query(*this)))
+  {
+    return {}; // value-initializes the result type; the query itself is never evaluated at run time
+  }
+} get_domain{};
+
+template <class _Sch> // Result type of invoking get_domain_t with a _Sch.
+using domain_of_t = __call_result_t<get_domain_t, _Sch>;
+
+} // namespace cuda::experimental::__async
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/rcvr_ref.cuh b/cudax/include/cuda/experimental/__async/rcvr_ref.cuh
new file mode 100644
index 0000000000..d380db189b
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/rcvr_ref.cuh
@@ -0,0 +1,51 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_RCVR_REF
+#define __CUDAX_ASYNC_DETAIL_RCVR_REF
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/experimental/__async/cpos.cuh>
+#include <cuda/experimental/__async/meta.cuh>
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+
+template <class _Rcvr> // Receiver passed as an lvalue: return its address.
+_CCCL_HOST_DEVICE constexpr _Rcvr* __rcvr_ref(_Rcvr& __rcvr) noexcept
+{
+  return &__rcvr;
+}
+
+template <class _Rcvr> // Receiver already passed as a pointer: return it unchanged.
+_CCCL_HOST_DEVICE constexpr _Rcvr* __rcvr_ref(_Rcvr* __rcvr) noexcept
+{
+  return __rcvr;
+}
+
+template <class _Rcvr> // Pointer type produced by calling __rcvr_ref with a _Rcvr.
+using __rcvr_ref_t = decltype(__async::__rcvr_ref(__declval<_Rcvr>()));
+
+} // namespace cuda::experimental::__async
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/rcvr_with_env.cuh b/cudax/include/cuda/experimental/__async/rcvr_with_env.cuh
new file mode 100644
index 0000000000..308d1f8d9b
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/rcvr_with_env.cuh
@@ -0,0 +1,141 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_RCVR_WITH_ENV
+#define __CUDAX_ASYNC_DETAIL_RCVR_WITH_ENV
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/experimental/__async/cpos.cuh>
+#include <cuda/experimental/__async/env.cuh>
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+template <class _Rcvr, class _Env> // Receiver adaptor: derives from _Rcvr and carries an additional _Env.
+struct __rcvr_with_env_t : _Rcvr
+{
+  using __env_t = __rcvr_with_env_t const&; // get_env() hands out a reference to the adaptor itself
+
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE auto __rcvr() noexcept -> _Rcvr& // the _Rcvr base subobject
+  {
+    return *this;
+  }
+
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE auto __rcvr() const noexcept -> const _Rcvr&
+  {
+    return *this;
+  }
+
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE auto get_env() const noexcept -> __env_t
+  {
+    return __env_t{*this}; // a reference, so the result must not outlive this receiver
+  }
+
+  template <class _Query>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE constexpr decltype(auto) __get_1st(_Query) const noexcept
+  {
+    if constexpr (__queryable<_Env, _Query>) // the attached environment is consulted first
+    {
+      return (__env_); // parenthesized so decltype(auto) deduces a reference to the member
+    }
+    else if constexpr (__queryable<env_of_t<_Rcvr>, _Query>)
+    {
+      return __async::get_env(static_cast<const _Rcvr&>(*this)); // otherwise use the base receiver's environment
+    }
+  } // when neither branch applies the deduced return type is void
+
+  template <class _Query, class _Self = __rcvr_with_env_t> // type of whichever environment __get_1st selects
+  using _1st_env_t = decltype(__declval<const _Self&>().__get_1st(_Query{}));
+
+  template <class _Query> // NOTE(review): named `query`, yet this file's CPOs call `.__query()` — confirm the protocol
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE constexpr auto query(_Query __query) const
+    noexcept(__nothrow_queryable<_1st_env_t<_Query>, _Query>) //
+    -> __query_result_t<_1st_env_t<_Query>, _Query>
+  {
+    return __get_1st(__query).__query(__query); // forward to the environment picked by __get_1st
+  }
+
+  _Env __env_; // the additional environment
+};
+
+template <class _Rcvr, class _Env> // Specialization for a receiver held by pointer: stores _Rcvr* instead of deriving.
+struct __rcvr_with_env_t<_Rcvr*, _Env>
+{
+  using __env_t = __rcvr_with_env_t const&; // get_env() hands out a reference to the adaptor itself
+
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE auto __rcvr() const noexcept -> _Rcvr* // the stored pointer
+  {
+    return __rcvr_;
+  }
+
+  template <class... _As>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE void set_value(_As&&... __as) && noexcept
+  {
+    __async::set_value(__rcvr_, static_cast<_As&&>(__as)...); // forwards to the pointed-to receiver via the CPO
+  }
+
+  template <class _Error>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE void set_error(_Error&& __error) && noexcept
+  {
+    __async::set_error(__rcvr_, static_cast<_Error&&>(__error));
+  }
+
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE void set_stopped() && noexcept
+  {
+    __async::set_stopped(__rcvr_);
+  }
+
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE auto get_env() const noexcept -> __env_t
+  {
+    return __env_t{*this}; // a reference, so the result must not outlive this receiver
+  }
+
+  template <class _Query>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE constexpr decltype(auto) __get_1st(_Query) const noexcept
+  {
+    if constexpr (__queryable<_Env, _Query>) // the attached environment is consulted first
+    {
+      return (__env_); // parenthesized so decltype(auto) deduces a reference to the member
+    }
+    else if constexpr (__queryable<env_of_t<_Rcvr>, _Query>)
+    {
+      return __async::get_env(__rcvr_); // otherwise use the environment of the pointed-to receiver
+    }
+  } // when neither branch applies the deduced return type is void
+
+  template <class _Query, class _Self = __rcvr_with_env_t> // type of whichever environment __get_1st selects
+  using _1st_env_t = decltype(__declval<const _Self&>().__get_1st(_Query{}));
+
+  template <class _Query> // NOTE(review): named `query`, yet this file's CPOs call `.__query()` — confirm the protocol
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE constexpr auto query(_Query __query) const
+    noexcept(__nothrow_queryable<_1st_env_t<_Query>, _Query>) //
+    -> __query_result_t<_1st_env_t<_Query>, _Query>
+  {
+    return __get_1st(__query).__query(__query); // forward to the environment picked by __get_1st
+  }
+
+  _Rcvr* __rcvr_; // non-owning pointer to the wrapped receiver
+  _Env __env_; // the additional environment
+};
+} // namespace cuda::experimental::__async
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/read_env.cuh b/cudax/include/cuda/experimental/__async/read_env.cuh
new file mode 100644
index 0000000000..4f7848611c
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/read_env.cuh
@@ -0,0 +1,155 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_READ_ENV
+#define __CUDAX_ASYNC_DETAIL_READ_ENV
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__type_traits/conditional.h>
+
+#include <cuda/experimental/__async/completion_signatures.cuh>
+#include <cuda/experimental/__async/config.cuh>
+#include <cuda/experimental/__async/cpos.cuh>
+#include <cuda/experimental/__async/env.cuh>
+#include <cuda/experimental/__async/exception.cuh>
+#include <cuda/experimental/__async/queries.cuh>
+#include <cuda/experimental/__async/utility.cuh>
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+struct THE_CURRENT_ENVIRONMENT_LACKS_THIS_QUERY;
+
+struct read_env_t // Algorithm object behind `read_env`; see the operator() declaration at the bottom.
+{
+#if !defined(_CCCL_CUDA_COMPILER_NVCC) // the helpers below are private except when compiling with nvcc
+
+private:
+#endif // _CCCL_CUDA_COMPILER_NVCC
+  template <class _Query, class _Env>
+  using __error_env_lacks_query = // diagnostic type used when _Query cannot be called with _Env
+    _ERROR<_WHERE(_IN_ALGORITHM, read_env_t),
+           _WHAT(THE_CURRENT_ENVIRONMENT_LACKS_THIS_QUERY),
+           _WITH_QUERY(_Query),
+           _WITH_ENVIRONMENT(_Env)>;
+
+  struct __completions_fn // computes the completion signatures for a callable query
+  {
+    template <class _Query, class _Env>
+    using __f = _CUDA_VSTD::_If<
+      __nothrow_callable<_Query, _Env>, // a non-throwing query only completes with set_value
+      completion_signatures<set_value_t(__call_result_t<_Query, _Env>)>,
+      completion_signatures<set_value_t(__call_result_t<_Query, _Env>), set_error_t(::std::exception_ptr)>>;
+  };
+
+  template <class _Rcvr, class _Query>
+  struct __opstate_t
+  {
+    using operation_state_concept = operation_state_t;
+    using completion_signatures   = // __completions_fn if the query is callable, else the diagnostic type
+      __minvoke<
+        _CUDA_VSTD::
+          _If<__callable<_Query, env_of_t<_Rcvr>>, __completions_fn, __error_env_lacks_query<_Query, env_of_t<_Rcvr>>>,
+        _Query,
+        env_of_t<_Rcvr>>;
+
+    _Rcvr __rcvr_; // receiver whose environment is queried and which gets the result
+
+    _CCCL_HOST_DEVICE explicit __opstate_t(_Rcvr __rcvr)
+        : __rcvr_(static_cast<_Rcvr&&>(__rcvr))
+    {}
+
+    _CUDAX_IMMOVABLE(__opstate_t);
+
+    _CCCL_HOST_DEVICE void start() noexcept
+    {
+      // If the query invocation is noexcept, call it directly. Otherwise,
+      // wrap it in a try-catch block and forward the exception to the
+      // receiver.
+      if constexpr (__nothrow_callable<_Query, env_of_t<_Rcvr>>)
+      {
+        // This looks like a use after move, but `set_value` takes its
+        // arguments by forwarding reference, so it's safe.
+        __async::set_value(static_cast<_Rcvr&&>(__rcvr_), _Query()(__async::get_env(__rcvr_)));
+      }
+      else
+      {
+        _CUDAX_TRY( //
+          ( //
+            { // query a default-constructed _Query against the receiver's environment
+              __async::set_value(static_cast<_Rcvr&&>(__rcvr_), _Query()(__async::get_env(__rcvr_)));
+            }),
+          _CUDAX_CATCH(...)( //
+            { // anything thrown in the block above is delivered through set_error
+              __async::set_error(static_cast<_Rcvr&&>(__rcvr_), ::std::current_exception());
+            }))
+      }
+    }
+  };
+
+  // This makes read_env a dependent sender:
+  template <class _Query>
+  struct __opstate_t<receiver_archetype, _Query>
+  {
+    using operation_state_concept = operation_state_t;
+    using completion_signatures   = dependent_completions;
+    _CCCL_HOST_DEVICE explicit __opstate_t(receiver_archetype); // declared only; no definition in this header
+    _CCCL_HOST_DEVICE void start() noexcept; // declared only; no definition in this header
+  };
+
+  template <class _Query>
+  struct __sndr_t; // defined after the class
+
+public:
+  /// @brief Returns a sender that, when connected to a receiver and started,
+  /// invokes the query with the receiver's environment and forwards the result
+  /// to the receiver's `set_value` member.
+  template <class _Query>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE constexpr __sndr_t<_Query> operator()(_Query) const noexcept;
+};
+
+template <class _Query> // Sender produced by read_env(query); an aggregate of the tag and the query object.
+struct read_env_t::__sndr_t
+{
+  using sender_concept = sender_t;
+  _CCCL_NO_UNIQUE_ADDRESS read_env_t __tag;
+  _CCCL_NO_UNIQUE_ADDRESS _Query __query; // stored copy; the opstate default-constructs its own _Query() instead
+
+  template <class _Rcvr>
+  _CCCL_HOST_DEVICE auto connect(_Rcvr __rcvr) const noexcept(__nothrow_movable<_Rcvr>) -> __opstate_t<_Rcvr, _Query>
+  {
+    return __opstate_t<_Rcvr, _Query>{static_cast<_Rcvr&&>(__rcvr)}; // the opstate takes ownership of the receiver
+  }
+};
+
+template <class _Query> // Wraps the query in a __sndr_t; no work is done until the sender is connected and started.
+_CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE constexpr read_env_t::__sndr_t<_Query>
+read_env_t::operator()(_Query __query) const noexcept
+{
+  return {{}, __query}; // aggregate-initializes the declared return type: {__tag, __query}
+}
+
+_CCCL_GLOBAL_CONSTANT read_env_t read_env{}; // global instance of the read_env_t algorithm object
+
+} // namespace cuda::experimental::__async
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/run_loop.cuh b/cudax/include/cuda/experimental/__async/run_loop.cuh
new file mode 100644
index 0000000000..4b2f61e9cd
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/run_loop.cuh
@@ -0,0 +1,274 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_RUN_LOOP
+#define __CUDAX_ASYNC_DETAIL_RUN_LOOP
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/experimental/__async/config.cuh>
+
+// libcu++ does not have <cuda/std/mutex> or <cuda/std/condition_variable>
+#if !defined(__CUDA_ARCH__)
+
+#  include <cuda/experimental/__async/completion_signatures.cuh>
+#  include <cuda/experimental/__async/env.cuh>
+#  include <cuda/experimental/__async/exception.cuh>
+#  include <cuda/experimental/__async/queries.cuh>
+#  include <cuda/experimental/__async/utility.cuh>
+
+#  include <condition_variable>
+#  include <mutex>
+
+#  include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+class run_loop;
+
+struct __task : __immovable // Intrusive list node of run_loop's queue: either the list head or an executable task.
+{
+  using __execute_fn_t = void(__task*) noexcept;
+
+  __task() = default;
+
+  _CCCL_HOST_DEVICE explicit __task(__task* __next, __task* __tail) noexcept // activates the __tail_ union member
+      : __next_{__next}
+      , __tail_{__tail}
+  {}
+
+  _CCCL_HOST_DEVICE explicit __task(__task* __next, __execute_fn_t* __execute) noexcept // activates __execute_fn_
+      : __next_{__next}
+      , __execute_fn_{__execute}
+  {}
+
+  __task* __next_ = this; // a default-constructed node links to itself
+
+  union // run_loop uses __tail_ on its head node; __operation nodes use __execute_fn_
+  {
+    __task* __tail_ = nullptr;
+    __execute_fn_t* __execute_fn_;
+  };
+
+  _CCCL_HOST_DEVICE void __execute() noexcept
+  {
+    (*__execute_fn_)(this); // only meaningful on nodes built with the __execute_fn_t constructor
+  }
+};
+
+template <class _Rcvr> // Operation state returned by connecting run_loop's schedule sender; doubles as a queue node.
+struct __operation : __task
+{
+  run_loop* __loop_; // loop this operation enqueues itself on in start()
+  _CCCL_NO_UNIQUE_ADDRESS _Rcvr __rcvr_;
+
+  using completion_signatures = //
+    __async::completion_signatures<set_value_t(), set_error_t(::std::exception_ptr), set_stopped_t()>;
+
+  _CCCL_HOST_DEVICE static void __execute_impl(__task* __p) noexcept // installed as the node's __execute_fn_
+  {
+    auto& __rcvr = static_cast<__operation*>(__p)->__rcvr_; // recover the concrete operation from the node pointer
+    _CUDAX_TRY( //
+      ({ // complete with set_stopped if the receiver's stop token reports a stop request, else with set_value
+        if (get_stop_token(get_env(__rcvr)).stop_requested())
+        {
+          set_stopped(static_cast<_Rcvr&&>(__rcvr));
+        }
+        else
+        {
+          set_value(static_cast<_Rcvr&&>(__rcvr));
+        }
+      }),
+      _CUDAX_CATCH(...)( //
+        { // anything thrown in the block above is delivered through set_error
+          set_error(static_cast<_Rcvr&&>(__rcvr), ::std::current_exception());
+        }))
+  }
+
+  _CCCL_HOST_DEVICE explicit __operation(__task* __tail_) noexcept // leaves __loop_ and __rcvr_ without initializers
+      : __task{this, __tail_}
+  {}
+
+  _CCCL_HOST_DEVICE __operation(__task* __next_, run_loop* __loop, _Rcvr __rcvr)
+      : __task{__next_, &__execute_impl}
+      , __loop_{__loop}
+      , __rcvr_{static_cast<_Rcvr&&>(__rcvr)}
+  {}
+
+  _CCCL_HOST_DEVICE void start() & noexcept; // defined after run_loop, whose __push_back it calls
+};
+
+class run_loop // Mutex/condition-variable protected queue of __task nodes plus a scheduler that enqueues onto it.
+{
+  template <class... _Ts>
+  using __completion_signatures = completion_signatures<_Ts...>;
+
+  template <class>
+  friend struct __operation; // __operation::start() calls the private __push_back()
+
+public:
+  run_loop() noexcept
+  {
+    __head.__next_ = __head.__tail_ = &__head; // empty queue: the head node is both front and tail
+  }
+
+  class __scheduler // handle to a run_loop; two handles compare equal when they name the same loop
+  {
+    struct __schedule_task // sender returned by schedule()
+    {
+      using __t            = __schedule_task;
+      using __id           = __schedule_task;
+      using sender_concept = sender_t;
+
+      template <class _Rcvr>
+      _CCCL_HOST_DEVICE auto connect(_Rcvr __rcvr) const noexcept -> __operation<_Rcvr>
+      {
+        return {&__loop_->__head, __loop_, static_cast<_Rcvr&&>(__rcvr)}; // (next node, loop, receiver)
+      }
+
+    private: // NOTE(review): get_env() below is private — confirm the get_env CPO can still reach it
+      friend __scheduler;
+
+      struct __env
+      {
+        run_loop* __loop_;
+
+        template <class _Tag> // NOTE(review): named `query`, but get_completion_scheduler_t calls `.__query()` — confirm
+        _CCCL_HOST_DEVICE auto query(get_completion_scheduler_t<_Tag>) const noexcept -> __scheduler
+        {
+          return __loop_->get_scheduler();
+        }
+      };
+
+      _CCCL_HOST_DEVICE auto get_env() const noexcept -> __env
+      {
+        return __env{__loop_};
+      }
+
+      _CCCL_HOST_DEVICE explicit __schedule_task(run_loop* __loop) noexcept
+          : __loop_(__loop)
+      {}
+
+      run_loop* const __loop_;
+    };
+
+    friend run_loop; // run_loop::get_scheduler() uses the private constructor below
+
+    _CCCL_HOST_DEVICE explicit __scheduler(run_loop* __loop) noexcept
+        : __loop_(__loop)
+    {}
+
+    // NOTE(review): private and named `query`; get_forward_progress_guarantee_t calls `.__query()` — confirm it is found
+    _CCCL_HOST_DEVICE auto query(get_forward_progress_guarantee_t) const noexcept -> forward_progress_guarantee
+    {
+      return forward_progress_guarantee::parallel;
+    }
+
+    run_loop* __loop_;
+
+  public:
+    using scheduler_concept = scheduler_t;
+
+    [[nodiscard]] _CCCL_HOST_DEVICE auto schedule() const noexcept -> __schedule_task
+    {
+      return __schedule_task{__loop_};
+    }
+
+    _CCCL_HOST_DEVICE friend bool operator==(const __scheduler& __a, const __scheduler& __b) noexcept
+    {
+      return __a.__loop_ == __b.__loop_;
+    }
+
+    _CCCL_HOST_DEVICE friend bool operator!=(const __scheduler& __a, const __scheduler& __b) noexcept
+    {
+      return __a.__loop_ != __b.__loop_;
+    }
+  };
+
+  _CCCL_HOST_DEVICE auto get_scheduler() noexcept -> __scheduler
+  {
+    return __scheduler{this};
+  }
+
+  _CCCL_HOST_DEVICE void run(); // executes queued tasks until __pop_front() returns the head node
+
+  _CCCL_HOST_DEVICE void finish(); // sets __stop and wakes all waiters
+
+private:
+  _CCCL_HOST_DEVICE void __push_back(__task* __tsk);
+  _CCCL_HOST_DEVICE auto __pop_front() -> __task*;
+
+  ::std::mutex __mutex{}; // taken by finish(), __push_back() and __pop_front()
+  ::std::condition_variable __cv{}; // signalled on push and on finish
+  __task __head{}; // list head: __next_ is the front of the queue, __tail_ the last node
+  bool __stop = false; // set by finish()
+};
+
+template <class _Rcvr> // Starts the operation by appending it to the loop's queue.
+_CCCL_HOST_DEVICE inline void __operation<_Rcvr>::start() & noexcept {
+  _CUDAX_TRY( //
+    ({ //
+      __loop_->__push_back(this); // enqueue this node; __push_back takes the loop's mutex
+    }), //
+    _CUDAX_CATCH(...)( //
+      { //
+        set_error(static_cast<_Rcvr&&>(__rcvr_), ::std::current_exception()); // report a failed enqueue to the receiver
+      })) //
+}
+
+_CCCL_HOST_DEVICE inline void run_loop::run() // Runs tasks in queue order on the calling thread.
+{
+  for (__task* __tsk = __pop_front(); __tsk != &__head; __tsk = __pop_front()) // &__head is the exit sentinel
+  {
+    __tsk->__execute();
+  }
+}
+
+_CCCL_HOST_DEVICE inline void run_loop::finish() // Marks the loop as stopped and wakes every waiter.
+{
+  const ::std::lock_guard<::std::mutex> __guard{__mutex}; // held until return: covers the flag write and the notify
+  __stop = true;
+  __cv.notify_all(); // lets threads blocked in __pop_front() re-check their predicate
+}
+
+_CCCL_HOST_DEVICE inline void run_loop::__push_back(__task* __tsk) // Appends __tsk to the queue under the mutex.
+{
+  ::std::unique_lock __lock{__mutex};
+  __tsk->__next_ = &__head; // the new last node links back to the head
+  __head.__tail_ = __head.__tail_->__next_ = __tsk; // hook __tsk after the old tail, then make it the tail
+  __cv.notify_one(); // wake one thread waiting in __pop_front()
+}
+
+_CCCL_HOST_DEVICE inline auto run_loop::__pop_front() -> __task* // Blocks for a task; returns &__head if stopped and empty.
+{
+  ::std::unique_lock __lock{__mutex};
+  __cv.wait(__lock, [this] { // sleep until the queue is non-empty or finish() has set __stop
+    return __head.__next_ != &__head || __stop;
+  });
+  if (__head.__tail_ == __head.__next_) // the front node is also the tail (or the queue is empty): reset the tail
+  {
+    __head.__tail_ = &__head;
+  }
+  return __async::__exchange(__head.__next_, __head.__next_->__next_); // presumably like std::exchange: unlink the front
+}
+} // namespace cuda::experimental::__async
+
+#  include <cuda/experimental/__async/epilogue.cuh>
+
+#endif // !defined(__CUDA_ARCH__)
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/sequence.cuh b/cudax/include/cuda/experimental/__async/sequence.cuh
new file mode 100644
index 0000000000..b7a85a7487
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/sequence.cuh
@@ -0,0 +1,151 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_SEQUENCE
+#define __CUDAX_ASYNC_DETAIL_SEQUENCE
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/experimental/__async/completion_signatures.cuh>
+#include <cuda/experimental/__async/cpos.cuh>
+#include <cuda/experimental/__async/exception.cuh>
+#include <cuda/experimental/__async/lazy.cuh>
+#include <cuda/experimental/__async/rcvr_ref.cuh>
+#include <cuda/experimental/__async/variant.cuh>
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+struct __seq // Algorithm object behind `sequence`: start the second sender once the first completes with a value.
+{
+  template <class _Rcvr, class _Sndr1, class _Sndr2>
+  struct __args // bundle of the three types that parameterize __opstate
+  {
+    using __rcvr_t  = _Rcvr;
+    using __sndr1_t = _Sndr1;
+    using __sndr2_t = _Sndr2;
+  };
+
+  template <class _Zip>
+  struct __opstate // operation state; a pointer to it also serves as the receiver for the first sender
+  {
+    using operation_state_concept = operation_state_t;
+
+    using __args_t  = __unzip<_Zip>; // __unzip<_Zip> is __args<_Rcvr, _Sndr1, _Sndr2>
+    using __rcvr_t  = typename __args_t::__rcvr_t;
+    using __sndr1_t = typename __args_t::__sndr1_t;
+    using __sndr2_t = typename __args_t::__sndr2_t;
+
+    using completion_signatures = //
+      transform_completion_signatures_of< //
+        __sndr1_t,
+        __opstate*,
+        completion_signatures_of_t<__sndr2_t, __rcvr_ref_t<__rcvr_t&>>,
+        __malways<__async::completion_signatures<>>::__f>; // swallow the first sender's value completions
+
+    _CCCL_HOST_DEVICE friend env_of_t<__rcvr_t> get_env(const __opstate* __self) noexcept
+    {
+      return __async::get_env(__self->__rcvr_); // the first sender sees the final receiver's environment
+    }
+
+    __rcvr_t __rcvr_; // final receiver; declared first because __opstate2_ is connected to a reference to it
+    connect_result_t<__sndr1_t, __opstate*> __opstate1_;
+    connect_result_t<__sndr2_t, __rcvr_ref_t<__rcvr_t&>> __opstate2_;
+
+    _CCCL_HOST_DEVICE __opstate(__sndr1_t&& __sndr1, __sndr2_t&& __sndr2, __rcvr_t&& __rcvr)
+        : __rcvr_(static_cast<__rcvr_t&&>(__rcvr))
+        , __opstate1_(__async::connect(static_cast<__sndr1_t&&>(__sndr1), this)) // `this` is the first receiver
+        , __opstate2_(__async::connect(static_cast<__sndr2_t&&>(__sndr2), __rcvr_ref(__rcvr_)))
+    {}
+
+    _CCCL_HOST_DEVICE void start() noexcept
+    {
+      __async::start(__opstate1_); // only the first operation starts here; set_value() below starts the second
+    }
+
+    template <class... _Values>
+    _CCCL_HOST_DEVICE void set_value(_Values&&...) && noexcept // the first sender's values are discarded
+    {
+      __async::start(__opstate2_);
+    }
+
+    template <class _Error>
+    _CCCL_HOST_DEVICE void set_error(_Error&& __error) && noexcept // an error from the first sender skips the second
+    {
+      __async::set_error(static_cast<__rcvr_t&&>(__rcvr_), static_cast<_Error&&>(__error));
+    }
+
+    _CCCL_HOST_DEVICE void set_stopped() && noexcept // a stop from the first sender skips the second
+    {
+      __async::set_stopped(static_cast<__rcvr_t&&>(__rcvr_));
+    }
+  };
+
+  template <class _Sndr1, class _Sndr2>
+  struct __sndr_t; // defined after the class
+
+  template <class _Sndr1, class _Sndr2>
+  _CCCL_HOST_DEVICE auto operator()(_Sndr1 __sndr1, _Sndr2 __sndr2) const -> __sndr_t<_Sndr1, _Sndr2>;
+};
+
+template <class _Sndr1, class _Sndr2> // Sender returned by sequence(s1, s2); it stores both child senders by value.
+struct __seq::__sndr_t
+{
+  using sender_concept = sender_t;
+  using __sndr1_t      = _Sndr1;
+  using __sndr2_t      = _Sndr2;
+
+  template <class _Rcvr>
+  _CCCL_HOST_DEVICE auto connect(_Rcvr __rcvr) && // rvalue connect: both children are handed over as rvalues
+  {
+    using __opstate_t = __opstate<__zip<__args<_Rcvr, _Sndr1, _Sndr2>>>; // cast to _Sndr2&&, not _Sndr2: no copy
+    return __opstate_t{static_cast<_Sndr1&&>(__sndr1_), static_cast<_Sndr2&&>(__sndr2_), static_cast<_Rcvr&&>(__rcvr)};
+  }
+
+  template <class _Rcvr>
+  _CCCL_HOST_DEVICE auto connect(_Rcvr __rcvr) const& // lvalue connect: children are passed as const references
+  {
+    using __opstate_t = __opstate<__zip<__args<_Rcvr, const _Sndr1&, const _Sndr2&>>>;
+    return __opstate_t{__sndr1_, __sndr2_, static_cast<_Rcvr&&>(__rcvr)};
+  }
+
+  _CCCL_HOST_DEVICE env_of_t<_Sndr2> get_env() const noexcept
+  {
+    return __async::get_env(__sndr2_); // the sequence reports the second sender's environment
+  }
+
+  _CCCL_NO_UNIQUE_ADDRESS __seq __tag_;
+  _CCCL_NO_UNIQUE_ADDRESS __ignore __ign_;
+  __sndr1_t __sndr1_;
+  __sndr2_t __sndr2_;
+};
+
+template <class _Sndr1, class _Sndr2> // Packages both senders into a __sndr_t; nothing runs until it is connected.
+_CCCL_HOST_DEVICE auto __seq::operator()(_Sndr1 __sndr1, _Sndr2 __sndr2) const -> __sndr_t<_Sndr1, _Sndr2>
+{
+  return {{}, {}, static_cast<_Sndr1&&>(__sndr1), static_cast<_Sndr2&&>(__sndr2)}; // {__tag_, __ign_, s1, s2}
+}
+
+using sequence_t = __seq; // public name of the algorithm type
+_CCCL_GLOBAL_CONSTANT sequence_t sequence{}; // global instance: sequence(sndr1, sndr2)
+} // namespace cuda::experimental::__async
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/start_detached.cuh b/cudax/include/cuda/experimental/__async/start_detached.cuh
new file mode 100644
index 0000000000..e60c98d4a8
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/start_detached.cuh
@@ -0,0 +1,104 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_START_DETACHED
+#define __CUDAX_ASYNC_DETAIL_START_DETACHED
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__exception/terminate.h>
+
+#include <cuda/experimental/__async/config.cuh>
+#include <cuda/experimental/__async/cpos.cuh>
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+struct start_detached_t // Algorithm object behind `start_detached`.
+{
+#if !defined(_CCCL_CUDA_COMPILER_NVCC) // the helpers below are private except when compiling with nvcc
+
+private:
+#endif // _CCCL_CUDA_COMPILER_NVCC
+  struct __opstate_base_t : __immovable // untyped handle through which the receiver refers to the heap opstate
+  {};
+
+  struct __rcvr_t
+  {
+    using receiver_concept = receiver_t;
+
+    __opstate_base_t* __opstate_; // the heap-allocated operation state this receiver completes
+    void (*__destroy)(__opstate_base_t*) noexcept; // deleter that knows the concrete opstate type
+
+    template <class... _As>
+    _CCCL_HOST_DEVICE void set_value(_As&&...) && noexcept // results are discarded; only the opstate is freed
+    {
+      __destroy(__opstate_);
+    }
+
+    template <class _Error>
+    _CCCL_HOST_DEVICE void set_error(_Error&&) && noexcept // an error in detached work terminates
+    {
+      ::cuda::std::terminate();
+    }
+
+    _CCCL_HOST_DEVICE void set_stopped() && noexcept // a stopped completion also frees the opstate
+    {
+      __destroy(__opstate_);
+    }
+  };
+
+  template <class _Sndr>
+  struct __opstate_t : __opstate_base_t
+  {
+    using operation_state_concept = operation_state_t;
+    using completion_signatures   = __async::completion_signatures_of_t<_Sndr, __rcvr_t>;
+    connect_result_t<_Sndr, __rcvr_t> __opstate_; // the child operation, connected to __rcvr_t
+
+    _CCCL_HOST_DEVICE static void __destroy(__opstate_base_t* __ptr) noexcept // matches the `new` in operator()
+    {
+      delete static_cast<__opstate_t*>(__ptr);
+    }
+
+    _CCCL_HOST_DEVICE explicit __opstate_t(_Sndr&& __sndr)
+        : __opstate_(__async::connect(static_cast<_Sndr&&>(__sndr), __rcvr_t{this, &__destroy}))
+    {}
+
+    _CCCL_HOST_DEVICE void start() & noexcept
+    {
+      __async::start(__opstate_);
+    }
+  };
+
+public:
+  /// @brief Eagerly connects and starts a sender and lets it
+  /// run detached.
+  template <class _Sndr>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void operator()(_Sndr __sndr) const
+  {
+    __async::start(*new __opstate_t<_Sndr>{static_cast<_Sndr&&>(__sndr)}); // freed by __rcvr_t on value or stopped
+  }
+};
+
+_CCCL_GLOBAL_CONSTANT start_detached_t start_detached{}; // global instance of the start_detached_t algorithm object
+} // namespace cuda::experimental::__async
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/start_on.cuh b/cudax/include/cuda/experimental/__async/start_on.cuh
new file mode 100644
index 0000000000..d3245817a1
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/start_on.cuh
@@ -0,0 +1,150 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_START_ON
+#define __CUDAX_ASYNC_DETAIL_START_ON
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/experimental/__async/completion_signatures.cuh>
+#include <cuda/experimental/__async/cpos.cuh>
+#include <cuda/experimental/__async/queries.cuh>
+#include <cuda/experimental/__async/rcvr_with_env.cuh>
+#include <cuda/experimental/__async/tuple.cuh>
+#include <cuda/experimental/__async/utility.cuh>
+#include <cuda/experimental/__async/variant.cuh>
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+/// @brief Environment that answers the `get_scheduler` query with a stored
+/// scheduler.
+template <class _Sch>
+struct __sch_env_t
+{
+  _Sch __sch_;
+
+  /// Returns a copy of the stored scheduler. Spelled `query`, the same member
+  /// name that `sync_wait_t::__env_t` uses for this query.
+  _CCCL_HOST_DEVICE _Sch query(get_scheduler_t) const noexcept
+  {
+    return __sch_;
+  }
+
+  /// Original spelling, kept so that any existing caller of `__query` still
+  /// compiles. Returns the same value as `query`.
+  _CCCL_HOST_DEVICE _Sch __query(get_scheduler_t) const noexcept
+  {
+    return __sch_;
+  }
+};
+
+_CCCL_GLOBAL_CONSTANT struct start_on_t // Function object type of `start_on`; the object itself is declared at the closing brace.
+{
+#if !defined(_CCCL_CUDA_COMPILER_NVCC) // the nested types below stay public when compiling with nvcc
+
+private:
+#endif // _CCCL_CUDA_COMPILER_NVCC
+
+  template <class _Rcvr, class _Sch, class _CvSndr>
+  struct __opstate_t // Operation state; a pointer to it is also the receiver of the schedule operation.
+  {
+    _CCCL_HOST_DEVICE friend env_of_t<_Rcvr> get_env(const __opstate_t* __self) noexcept // env of the wrapped receiver
+    {
+      return __async::get_env(__self->__env_rcvr_.__rcvr());
+    }
+
+    using operation_state_concept = operation_state_t;
+
+    using completion_signatures = // child sender's completions, transformed together with those of the schedule sender
+      transform_completion_signatures<
+        completion_signatures_of_t<_CvSndr, __rcvr_with_env_t<_Rcvr, __sch_env_t<_Sch>>*>,
+        transform_completion_signatures<completion_signatures_of_t<schedule_result_t<_Sch>, __opstate_t*>,
+                                        __async::completion_signatures<>,
+                                        __malways<__async::completion_signatures<>>::__f>>;
+
+    __rcvr_with_env_t<_Rcvr, __sch_env_t<_Sch>> __env_rcvr_; // receiver paired with the scheduler env; declared first because both connects below use it
+    connect_result_t<schedule_result_t<_Sch>, __opstate_t*> __opstate1_; // schedule operation, completes into *this
+    connect_result_t<_CvSndr, __rcvr_with_env_t<_Rcvr, __sch_env_t<_Sch>>*> __opstate2_; // child operation, completes into __env_rcvr_
+
+    _CCCL_HOST_DEVICE __opstate_t(_Sch __sch, _Rcvr __rcvr, _CvSndr&& __sndr) // connects both operations up front
+        : __env_rcvr_{static_cast<_Rcvr&&>(__rcvr), {__sch}}
+        , __opstate1_{connect(schedule(__env_rcvr_.__env_.__sch_), this)}
+        , __opstate2_{connect(static_cast<_CvSndr&&>(__sndr), &__env_rcvr_)}
+    {}
+
+    _CUDAX_IMMOVABLE(__opstate_t); // the connected operations hold pointers into this object
+
+    _CCCL_HOST_DEVICE void start() noexcept // starts the schedule operation
+    {
+      __async::start(__opstate1_);
+    }
+
+    _CCCL_HOST_DEVICE void set_value() noexcept // schedule operation completed: start the child operation
+    {
+      __async::start(__opstate2_);
+    }
+
+    template <class _Error>
+    _CCCL_HOST_DEVICE void set_error(_Error&& __error) noexcept // schedule operation failed: forward to the wrapped receiver
+    {
+      __async::set_error(static_cast<_Rcvr&&>(__env_rcvr_.__rcvr()), static_cast<_Error&&>(__error));
+    }
+
+    _CCCL_HOST_DEVICE void set_stopped() noexcept // schedule operation was stopped: forward to the wrapped receiver
+    {
+      __async::set_stopped(static_cast<_Rcvr&&>(__env_rcvr_.__rcvr()));
+    }
+  };
+
+  template <class _Sch, class _Sndr>
+  struct __sndr_t; // defined after this class
+
+public:
+  template <class _Sch, class _Sndr>
+  _CCCL_HOST_DEVICE auto operator()(_Sch __sch, _Sndr __sndr) const noexcept // defined out of line, after __sndr_t
+    -> __sndr_t<_Sch, _Sndr>;
+} start_on{};
+
+/// @brief Sender produced by `start_on`. It stores the scheduler and the child
+/// sender; connecting it yields a `start_on_t::__opstate_t`.
+template <class _Sch, class _Sndr>
+struct start_on_t::__sndr_t
+{
+  using sender_concept = sender_t;
+  _CCCL_NO_UNIQUE_ADDRESS start_on_t __tag_;
+  _Sch __sch_;
+  _Sndr __sndr_;
+
+  // Rvalue connect: the child sender is handed to the operation state as an rvalue.
+  template <class _Rcvr>
+  _CCCL_HOST_DEVICE auto connect(_Rcvr __rcvr) && -> __opstate_t<_Rcvr, _Sch, _Sndr>
+  {
+    using __result_t = __opstate_t<_Rcvr, _Sch, _Sndr>;
+    return __result_t{__sch_, static_cast<_Rcvr&&>(__rcvr), static_cast<_Sndr&&>(__sndr_)};
+  }
+
+  // Const lvalue connect: the operation state refers to the child sender by const reference.
+  template <class _Rcvr>
+  _CCCL_HOST_DEVICE auto connect(_Rcvr __rcvr) const& -> __opstate_t<_Rcvr, _Sch, const _Sndr&>
+  {
+    using __result_t = __opstate_t<_Rcvr, _Sch, const _Sndr&>;
+    return __result_t{__sch_, static_cast<_Rcvr&&>(__rcvr), __sndr_};
+  }
+
+  // The environment of this sender is the environment of the child sender.
+  _CCCL_HOST_DEVICE env_of_t<_Sndr> get_env() const noexcept
+  {
+    const _Sndr& __child = __sndr_;
+    return __async::get_env(__child);
+  }
+};
+
+/// @brief Builds the `start_on` sender from a scheduler and a child sender.
+///
+/// Both arguments are taken by value and are moved into the returned sender,
+/// so move-only schedulers and senders are accepted and no extra copy is made
+/// (the same way `__upon_t::operator()` forwards its by-value arguments).
+///
+/// @param __sch  Scheduler stored in the returned sender.
+/// @param __sndr Child sender stored in the returned sender.
+/// @return A `start_on_t::__sndr_t<_Sch, _Sndr>` holding both arguments.
+template <class _Sch, class _Sndr>
+_CCCL_HOST_DEVICE auto
+start_on_t::operator()(_Sch __sch, _Sndr __sndr) const noexcept -> start_on_t::__sndr_t<_Sch, _Sndr>
+{
+  return __sndr_t<_Sch, _Sndr>{{}, static_cast<_Sch&&>(__sch), static_cast<_Sndr&&>(__sndr)};
+}
+} // namespace cuda::experimental::__async
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/stop_token.cuh b/cudax/include/cuda/experimental/__async/stop_token.cuh
new file mode 100644
index 0000000000..2a3e93a8d7
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/stop_token.cuh
@@ -0,0 +1,488 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_STOP_TOKEN
+#define __CUDAX_ASYNC_DETAIL_STOP_TOKEN
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__type_traits/is_nothrow_constructible.h>
+#include <cuda/std/atomic>
+#include <cuda/std/detail/libcxx/include/__threading_support>
+
+#include <cuda/experimental/__async/config.cuh>
+#include <cuda/experimental/__async/thread.cuh>
+#include <cuda/experimental/__async/utility.cuh>
+
+#if __has_include(<stop_token>) && __cpp_lib_jthread >= 201911
+#  include <stop_token>
+#endif
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+// warning #20012-D: __device__ annotation is ignored on a
+// function("inplace_stop_source") that is explicitly defaulted on its first
+// declaration
+_CCCL_NV_DIAG_SUPPRESS(20012)
+
+namespace cuda::experimental::__async
+{
+// [stoptoken.inplace], class inplace_stop_token
+class inplace_stop_token;
+
+// [stopsource.inplace], class inplace_stop_source
+class inplace_stop_source;
+
+// [stopcallback.inplace], class template inplace_stop_callback
+template <class _Callback>
+class inplace_stop_callback;
+
+namespace __stok
+{
+struct __inplace_stop_callback_base // Type-erased list node for a callback registered with an inplace_stop_source.
+{
+  _CCCL_HOST_DEVICE void __execute() noexcept // invokes the stored function pointer on this node
+  {
+    this->__execute_fn_(this);
+  }
+
+protected:
+  using __execute_fn_t = void(__inplace_stop_callback_base*) noexcept;
+
+  _CCCL_HOST_DEVICE explicit __inplace_stop_callback_base( // stores its arguments; does not register the node
+    const inplace_stop_source* __source, //
+    __execute_fn_t* __execute) noexcept
+      : __source_(__source)
+      , __execute_fn_(__execute)
+  {}
+
+  _CCCL_HOST_DEVICE void __register_callback() noexcept; // defined after inplace_stop_callback
+
+  friend inplace_stop_source;
+
+  const inplace_stop_source* __source_; // nullptr when the node is not registered with a source
+  __execute_fn_t* __execute_fn_;
+  __inplace_stop_callback_base* __next_      = nullptr; // next node in the source's callback list
+  __inplace_stop_callback_base** __prev_ptr_ = nullptr; // address of the pointer that points at this node; nullptr once dequeued
+  bool* __removed_during_callback_           = nullptr; // set by request_stop() while the callback runs
+  _CUDA_VSTD::atomic<bool> __callback_completed_{false}; // set by request_stop() after the callback returned
+};
+
+// Back-off helper used by the spin loops in this file. The first
+// `__yield_threshold` calls to `__wait()` call
+// `__libcpp_thread_yield_processor()`; every later call calls
+// `__this_thread_yield()`.
+struct __spin_wait
+{
+  __spin_wait() noexcept = default;
+
+  _CCCL_HOST_DEVICE void __wait() noexcept
+  {
+    if (__count_ != 0)
+    {
+      // Spin budget left: spend one unit of it.
+      --__count_;
+      _CUDA_VSTD::__libcpp_thread_yield_processor();
+      return;
+    }
+
+    // Spin budget exhausted.
+    __async::__this_thread_yield();
+  }
+
+private:
+  static constexpr uint32_t __yield_threshold = 20;
+  uint32_t __count_                           = __yield_threshold;
+};
+
+template <template <class> class> // takes a unary alias/class template as its argument
+struct __check_type_alias_exists; // declaration only; NOTE(review): no use is visible in this part of the file
+} // namespace __stok
+
+// [stoptoken.never], class never_stop_token
+// Stop token that never reports a stop request and for which a stop request
+// is never possible. All tokens of this type compare equal.
+struct never_stop_token
+{
+private:
+  // Callback type for this token: construction ignores both arguments.
+  struct __callback_type
+  {
+    _CCCL_HOST_DEVICE explicit __callback_type(never_stop_token, __ignore) noexcept {}
+  };
+
+public:
+  template <class>
+  using callback_type = __callback_type;
+
+  // Always false.
+  _CCCL_HOST_DEVICE static constexpr bool stop_requested() noexcept
+  {
+    return false;
+  }
+
+  // Always false.
+  _CCCL_HOST_DEVICE static constexpr bool stop_possible() noexcept
+  {
+    return false;
+  }
+
+  _CCCL_HOST_DEVICE friend constexpr bool operator==(const never_stop_token&, const never_stop_token&) noexcept
+  {
+    return true;
+  }
+
+  _CCCL_HOST_DEVICE friend constexpr bool
+  operator!=(const never_stop_token& __lhs, const never_stop_token& __rhs) noexcept
+  {
+    return !(__lhs == __rhs);
+  }
+};
+
+template <class _Callback>
+class inplace_stop_callback;
+
+// [stopsource.inplace], class inplace_stop_source
+class inplace_stop_source // Owns the stop state and the list of registered callbacks; tokens and callbacks point at it.
+{
+public:
+  _CCCL_HOST_DEVICE inplace_stop_source() noexcept = default;
+  _CCCL_HOST_DEVICE ~inplace_stop_source(); // asserts that the source is unlocked and has no callbacks left
+  _CUDAX_IMMOVABLE(inplace_stop_source);
+
+  _CCCL_HOST_DEVICE auto get_token() const noexcept -> inplace_stop_token; // token that points at this source
+
+  _CCCL_HOST_DEVICE auto request_stop() noexcept -> bool; // sets the stop flag and runs the registered callbacks
+
+  _CCCL_HOST_DEVICE auto stop_requested() const noexcept -> bool // true once the stop flag is set
+  {
+    return (__state_.load(_CUDA_VSTD::memory_order_acquire) & __stop_requested_flag) != 0;
+  }
+
+private:
+  friend inplace_stop_token;
+  friend __stok::__inplace_stop_callback_base;
+  template <class>
+  friend class inplace_stop_callback;
+
+  _CCCL_HOST_DEVICE auto __lock() const noexcept -> uint8_t; // spins until the lock bit is acquired; returns the prior state
+  _CCCL_HOST_DEVICE void __unlock(uint8_t) const noexcept; // stores the given state, which releases the lock
+
+  _CCCL_HOST_DEVICE auto __try_lock_unless_stop_requested(bool) const noexcept -> bool; // false if stop was already requested
+
+  _CCCL_HOST_DEVICE auto __try_add_callback(__stok::__inplace_stop_callback_base*) const noexcept -> bool; // false if stop was already requested
+
+  _CCCL_HOST_DEVICE void __remove_callback(__stok::__inplace_stop_callback_base*) const noexcept; // unlinks, or waits for a running callback
+
+  static constexpr uint8_t __stop_requested_flag = 1; // bit 0 of __state_
+  static constexpr uint8_t __locked_flag         = 2; // bit 1 of __state_
+
+  mutable _CUDA_VSTD::atomic<uint8_t> __state_{0}; // combination of the two flags above
+  mutable __stok::__inplace_stop_callback_base* __callbacks_ = nullptr; // head of the callback list
+  __async::__thread_id __notifying_thread_; // assigned in request_stop() before callbacks are run
+};
+
+// [stoptoken.inplace], class inplace_stop_token
+// Token that refers to an `inplace_stop_source` through a raw pointer. A
+// default-constructed or moved-from token refers to no source.
+class inplace_stop_token
+{
+public:
+  template <class _Fun>
+  using callback_type = inplace_stop_callback<_Fun>;
+
+  // Creates a token without an associated source.
+  _CCCL_HOST_DEVICE inplace_stop_token() noexcept
+      : __source_(nullptr)
+  {}
+
+  inplace_stop_token(const inplace_stop_token& __other) noexcept = default;
+
+  // Takes over the source pointer and leaves `__other` without a source.
+  _CCCL_HOST_DEVICE inplace_stop_token(inplace_stop_token&& __other) noexcept
+      : __source_(__async::__exchange(__other.__source_, {}))
+  {}
+
+  auto operator=(const inplace_stop_token& __other) noexcept -> inplace_stop_token& = default;
+
+  // Takes over the source pointer and leaves `__other` without a source.
+  _CCCL_HOST_DEVICE auto operator=(inplace_stop_token&& __other) noexcept -> inplace_stop_token&
+  {
+    __source_ = __async::__exchange(__other.__source_, nullptr);
+    return *this;
+  }
+
+  // True when there is a source and that source reports a stop request.
+  [[nodiscard]] _CCCL_HOST_DEVICE auto stop_requested() const noexcept -> bool
+  {
+    if (__source_ == nullptr)
+    {
+      return false;
+    }
+    return __source_->stop_requested();
+  }
+
+  // True when the token refers to a source.
+  [[nodiscard]] _CCCL_HOST_DEVICE auto stop_possible() const noexcept -> bool
+  {
+    return !(__source_ == nullptr);
+  }
+
+  _CCCL_HOST_DEVICE void swap(inplace_stop_token& __other) noexcept
+  {
+    __async::__swap(__source_, __other.__source_);
+  }
+
+  // Tokens compare equal when they refer to the same source (or both to none).
+  _CCCL_HOST_DEVICE friend bool operator==(const inplace_stop_token& __a, const inplace_stop_token& __b) noexcept
+  {
+    return __a.__source_ == __b.__source_;
+  }
+
+  _CCCL_HOST_DEVICE friend bool operator!=(const inplace_stop_token& __a, const inplace_stop_token& __b) noexcept
+  {
+    return !(__a.__source_ == __b.__source_);
+  }
+
+private:
+  friend inplace_stop_source;
+  template <class>
+  friend class inplace_stop_callback;
+
+  // Used by `inplace_stop_source::get_token()`.
+  _CCCL_HOST_DEVICE explicit inplace_stop_token(const inplace_stop_source* __source) noexcept
+      : __source_(__source)
+  {}
+
+  const inplace_stop_source* __source_;
+};
+
+// Returns a token whose source pointer is this object.
+_CCCL_HOST_DEVICE inline auto inplace_stop_source::get_token() const noexcept -> inplace_stop_token
+{
+  const inplace_stop_source* __self = this;
+  return inplace_stop_token{__self};
+}
+
+// [stopcallback.inplace], class template inplace_stop_callback
+template <class _Fun>
+class inplace_stop_callback : __stok::__inplace_stop_callback_base
+{
+public:
+  template <class _Fun2>
+  _CCCL_HOST_DEVICE explicit inplace_stop_callback(inplace_stop_token __token, _Fun2&& __fun) noexcept(
+    _CUDA_VSTD::is_nothrow_constructible_v<_Fun, _Fun2>)
+      : __stok::__inplace_stop_callback_base(__token.__source_, &inplace_stop_callback::__execute_impl)
+      , __fun(static_cast<_Fun2&&>(__fun))
+  {
+    __register_callback();
+  }
+
+  _CCCL_HOST_DEVICE ~inplace_stop_callback()
+  {
+    if (__source_ != nullptr)
+    {
+      __source_->__remove_callback(this);
+    }
+  }
+
+private:
+  _CCCL_HOST_DEVICE static void __execute_impl(__stok::__inplace_stop_callback_base* __cb) noexcept
+  {
+    static_cast<_Fun&&>(static_cast<inplace_stop_callback*>(__cb)->__fun)();
+  }
+
+  _CCCL_NO_UNIQUE_ADDRESS _Fun __fun;
+};
+
+namespace __stok
+{
+// Adds this node to its source's callback list. When the source refuses the
+// node because a stop request was already made, the node forgets the source
+// and the callback is executed right here.
+_CCCL_HOST_DEVICE inline void __inplace_stop_callback_base::__register_callback() noexcept
+{
+  if (__source_ == nullptr)
+  {
+    return;
+  }
+
+  if (__source_->__try_add_callback(this))
+  {
+    return;
+  }
+
+  // Callback not registered because stop_requested() was true.
+  // Execute inline here.
+  __source_ = nullptr;
+  __execute();
+}
+} // namespace __stok
+
+_CCCL_HOST_DEVICE inline inplace_stop_source::~inplace_stop_source() // Only checks invariants; releases nothing.
+{
+  _LIBCUDACXX_ASSERT((__state_.load(_CUDA_VSTD::memory_order_relaxed) & __locked_flag) == 0, ""); // must not be locked
+  _LIBCUDACXX_ASSERT(__callbacks_ == nullptr, ""); // every callback must have been removed already
+}
+
+_CCCL_HOST_DEVICE inline auto inplace_stop_source::request_stop() noexcept -> bool // Sets the stop flag and runs all registered callbacks.
+{
+  if (!__try_lock_unless_stop_requested(true)) // fails only when the stop flag was already set
+  {
+    return true; // NOTE(review): true means "already requested" here, the reverse of std::stop_source::request_stop — confirm this is intended
+  }
+
+  __notifying_thread_ = __async::__this_thread_id(); // lets __remove_callback() recognise a removal made from inside a callback
+
+  // We are responsible for executing callbacks. The lock is held at the top of each iteration.
+  while (__callbacks_ != nullptr)
+  {
+    auto* __callbk        = __callbacks_; // pop the head of the list
+    __callbk->__prev_ptr_ = nullptr; // marks the node as dequeued for __remove_callback()
+    __callbacks_          = __callbk->__next_;
+    if (__callbacks_ != nullptr)
+    {
+      __callbacks_->__prev_ptr_ = &__callbacks_;
+    }
+
+    __state_.store(__stop_requested_flag, _CUDA_VSTD::memory_order_release); // drops the lock bit while the callback runs
+
+    bool __removed_during_callback_      = false; // set by __remove_callback() when called from this thread
+    __callbk->__removed_during_callback_ = &__removed_during_callback_;
+
+    __callbk->__execute();
+
+    if (!__removed_during_callback_) // the node is only touched again if it was not removed while executing
+    {
+      __callbk->__removed_during_callback_ = nullptr;
+      __callbk->__callback_completed_.store(true, _CUDA_VSTD::memory_order_release); // releases a thread spinning in __remove_callback()
+    }
+
+    __lock(); // re-acquire the lock before looking at the list again
+  }
+
+  __state_.store(__stop_requested_flag, _CUDA_VSTD::memory_order_release); // final unlock; the stop flag stays set
+  return false; // this call made the stop request (see the note on the early return above)
+}
+
+_CCCL_HOST_DEVICE inline auto inplace_stop_source::__lock() const noexcept -> uint8_t // Spins until the lock bit is acquired.
+{
+  __stok::__spin_wait __spin;
+  auto __old_state = __state_.load(_CUDA_VSTD::memory_order_relaxed);
+  do
+  {
+    while ((__old_state & __locked_flag) != 0) // wait while somebody else holds the lock
+    {
+      __spin.__wait();
+      __old_state = __state_.load(_CUDA_VSTD::memory_order_relaxed);
+    }
+  } while (!__state_.compare_exchange_weak( // try to add the lock bit to the state that was observed unlocked
+    __old_state, __old_state | __locked_flag, _CUDA_VSTD::memory_order_acquire, _CUDA_VSTD::memory_order_relaxed));
+
+  return __old_state; // state before locking; callers hand it back to __unlock()
+}
+
+// Publishes `__old_state` as the new state with release ordering. Callers pass
+// a state without the lock bit, which is what releases the lock.
+_CCCL_HOST_DEVICE inline void inplace_stop_source::__unlock(uint8_t __old_state) const noexcept
+{
+  __state_.store(__old_state, _CUDA_VSTD::memory_order_release);
+}
+
+_CCCL_HOST_DEVICE inline auto // Acquires the lock unless the stop flag is set; optionally sets the stop flag in the same step.
+inplace_stop_source::__try_lock_unless_stop_requested(bool __set_stop_requested) const noexcept -> bool
+{
+  __stok::__spin_wait __spin;
+  auto __old_state = __state_.load(_CUDA_VSTD::memory_order_relaxed);
+  do
+  {
+    while (true)
+    {
+      if ((__old_state & __stop_requested_flag) != 0)
+      {
+        // Stop already requested.
+        return false;
+      }
+      else if (__old_state == 0) // neither flag set: the lock is free
+      {
+        break;
+      }
+      else // only the lock bit is set: wait for the holder
+      {
+        __spin.__wait();
+        __old_state = __state_.load(_CUDA_VSTD::memory_order_relaxed);
+      }
+    }
+  } while (!__state_.compare_exchange_weak( // 0 -> locked, or 0 -> locked + stop requested
+    __old_state,
+    __set_stop_requested ? (__locked_flag | __stop_requested_flag) : __locked_flag,
+    _CUDA_VSTD::memory_order_acq_rel,
+    _CUDA_VSTD::memory_order_relaxed));
+
+  // Lock acquired successfully
+  return true;
+}
+
+_CCCL_HOST_DEVICE inline auto // Links the node at the head of the callback list; fails if stop was already requested.
+inplace_stop_source::__try_add_callback(__stok::__inplace_stop_callback_base* __callbk) const noexcept -> bool
+{
+  if (!__try_lock_unless_stop_requested(false)) // takes the lock without setting the stop flag
+  {
+    return false; // the caller runs the callback itself in this case
+  }
+
+  __callbk->__next_     = __callbacks_; // new node goes in front of the current head
+  __callbk->__prev_ptr_ = &__callbacks_;
+  if (__callbacks_ != nullptr)
+  {
+    __callbacks_->__prev_ptr_ = &__callbk->__next_; // old head is now reached through the new node
+  }
+  __callbacks_ = __callbk;
+
+  __unlock(0); // the state was 0 when the lock was taken, so 0 is restored
+
+  return true;
+}
+
+_CCCL_HOST_DEVICE inline void // Unlinks the node, or synchronises with request_stop() if the node was already dequeued.
+inplace_stop_source::__remove_callback(__stok::__inplace_stop_callback_base* __callbk) const noexcept
+{
+  auto __old_state = __lock();
+
+  if (__callbk->__prev_ptr_ != nullptr)
+  {
+    // Callback has not been executed yet.
+    // Remove from the list.
+    *__callbk->__prev_ptr_ = __callbk->__next_;
+    if (__callbk->__next_ != nullptr)
+    {
+      __callbk->__next_->__prev_ptr_ = __callbk->__prev_ptr_;
+    }
+    __unlock(__old_state);
+  }
+  else
+  {
+    auto __notifying_thread_ = this->__notifying_thread_; // local copy (shadows the member), read while the lock is held
+    __unlock(__old_state);
+
+    // Callback has either already been executed or is
+    // currently executing on another thread.
+    if (__async::__this_thread_id() == __notifying_thread_) // this thread is the one that ran request_stop()
+    {
+      if (__callbk->__removed_during_callback_ != nullptr) // non-null only while request_stop() is running this callback
+      {
+        *__callbk->__removed_during_callback_ = true; // tells request_stop() not to touch the node afterwards
+      }
+    }
+    else
+    {
+      // Concurrently executing on another thread.
+      // Wait until the other thread finishes executing the callback.
+      __stok::__spin_wait __spin;
+      while (!__callbk->__callback_completed_.load(_CUDA_VSTD::memory_order_acquire))
+      {
+        __spin.__wait();
+      }
+    }
+  }
+}
+
+// Callable that forwards a stop request to the referenced stop source. The
+// result of `request_stop()` is discarded.
+struct __on_stop_request
+{
+  inplace_stop_source& __source_;
+
+  _CCCL_HOST_DEVICE void operator()() const noexcept
+  {
+    (void) __source_.request_stop();
+  }
+};
+
+template <class _Token, class _Callback> // Callback type that `_Token` names for `_Callback` through its nested `callback_type` template.
+using stop_callback_for_t = typename _Token::template callback_type<_Callback>;
+} // namespace cuda::experimental::__async
+
+_CCCL_NV_DIAG_DEFAULT(20012)
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/sync_wait.cuh b/cudax/include/cuda/experimental/__async/sync_wait.cuh
new file mode 100644
index 0000000000..f7191d792b
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/sync_wait.cuh
@@ -0,0 +1,207 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_SYNC_WAIT
+#define __CUDAX_ASYNC_DETAIL_SYNC_WAIT
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/experimental/__async/config.cuh>
+
+// run_loop isn't supported on-device yet, so neither can sync_wait be.
+#if !defined(__CUDA_ARCH__)
+
+#  include <cuda/std/optional>
+#  include <cuda/std/tuple>
+
+#  include <cuda/experimental/__async/exception.cuh>
+#  include <cuda/experimental/__async/meta.cuh>
+#  include <cuda/experimental/__async/run_loop.cuh>
+#  include <cuda/experimental/__async/utility.cuh>
+
+#  include <system_error>
+
+#  include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+/// @brief Function object type for synchronously waiting for the result of a
+/// sender.
+struct sync_wait_t
+{
+#  if !defined(_CCCL_CUDA_COMPILER_NVCC)
+
+private:
+#  endif // _CCCL_CUDA_COMPILER_NVCC
+  struct __env_t // Environment of the receiver: both scheduler queries answer with the run_loop's scheduler.
+  {
+    run_loop* __loop_;
+
+    _CCCL_HOST_DEVICE auto query(get_scheduler_t) const noexcept
+    {
+      return __loop_->get_scheduler();
+    }
+
+    _CCCL_HOST_DEVICE auto query(get_delegatee_scheduler_t) const noexcept
+    {
+      return __loop_->get_scheduler();
+    }
+  };
+
+  template <class _Sndr>
+  struct __state_t // Shared state between operator() and the receiver.
+  {
+    struct __rcvr_t // Records the outcome in the state, then tells the loop to finish.
+    {
+      using receiver_concept = receiver_t;
+      __state_t* __state_;
+
+      template <class... _As>
+      _CCCL_HOST_DEVICE void set_value(_As&&... __as) noexcept // stores the values; a throwing emplace is recorded as an error
+      {
+        _CUDAX_TRY( //
+          ({ //
+            __state_->__values_->emplace(static_cast<_As&&>(__as)...);
+          }), //
+          _CUDAX_CATCH(...)( //
+            { //
+              __state_->__eptr_ = ::std::current_exception();
+            }))
+        __state_->__loop_.finish();
+      }
+
+      template <class _Error>
+      _CCCL_HOST_DEVICE void set_error(_Error __err) noexcept // converts the error to an exception_ptr that operator() rethrows
+      {
+        if constexpr (_CUDA_VSTD::is_same_v<_Error, ::std::exception_ptr>)
+        {
+          __state_->__eptr_ = static_cast<_Error&&>(__err);
+        }
+        else if constexpr (_CUDA_VSTD::is_same_v<_Error, ::std::error_code>)
+        {
+          __state_->__eptr_ = ::std::make_exception_ptr(::std::system_error(__err));
+        }
+        else
+        {
+          __state_->__eptr_ = ::std::make_exception_ptr(static_cast<_Error&&>(__err));
+        }
+        __state_->__loop_.finish();
+      }
+
+      _CCCL_HOST_DEVICE void set_stopped() noexcept // records nothing: the result optional stays empty
+      {
+        __state_->__loop_.finish();
+      }
+
+      __env_t get_env() const noexcept
+      {
+        return __env_t{&__state_->__loop_};
+      }
+    };
+
+    using __values_t = value_types_of_t<_Sndr, __rcvr_t, _CUDA_VSTD::tuple, __midentity::__f>;
+
+    _CUDA_VSTD::optional<__values_t>* __values_; // points at the optional that operator() returns
+    ::std::exception_ptr __eptr_; // non-null when the sender completed with an error
+    run_loop __loop_; // driven by operator() until the receiver calls finish()
+  };
+
+  struct __invalid_sync_wait // Return type of the branch taken when the completion signatures are not valid.
+  {
+    const __invalid_sync_wait& value() const
+    {
+      return *this;
+    }
+
+    const __invalid_sync_wait& operator*() const
+    {
+      return *this;
+    }
+
+    int __i_;
+  };
+
+public:
+  // clang-format off
+    /// @brief Synchronously wait for the result of a sender, blocking the
+    ///         current thread.
+    ///
+    /// `sync_wait` connects and starts the given sender, and then drives a
+    ///         `run_loop` instance until the sender completes. Additional work
+    ///         can be delegated to the `run_loop` by scheduling work on the
+    ///         scheduler returned by calling `get_delegatee_scheduler` on the
+    ///         receiver's environment.
+    ///
+    /// @pre The sender must have exactly one value completion signature. That
+    ///         is, it can only complete successfully in one way, with a single
+    ///         set of values.
+    ///
+    /// @retval success Returns an engaged `_CUDA_VSTD::optional` containing the result
+    ///         values in a `_CUDA_VSTD::tuple`.
+    /// @retval canceled Returns an empty `_CUDA_VSTD::optional`.
+    /// @retval error Throws the error.
+    ///
+    /// @throws ::std::rethrow_exception(error) if the error has type
+    ///         `::std::exception_ptr`.
+    /// @throws ::std::system_error(error) if the error has type
+    ///         `::std::error_code`.
+    /// @throws error otherwise
+  // clang-format on
+  template <class _Sndr>
+  auto operator()(_Sndr&& __sndr) const
+  {
+    using __rcvr_t      = typename __state_t<_Sndr>::__rcvr_t;
+    using __values_t    = typename __state_t<_Sndr>::__values_t;
+    using __completions = completion_signatures_of_t<_Sndr, __rcvr_t>;
+    static_assert(__is_completion_signatures<__completions>);
+
+    if constexpr (!__is_completion_signatures<__completions>)
+    {
+      return __invalid_sync_wait{0}; // taken only when the static_assert above has already fired
+    }
+    else
+    {
+      _CUDA_VSTD::optional<__values_t> __result{};
+      __state_t<_Sndr> __state{&__result};
+
+      // Launch the sender with a continuation that will fill in the optional
+      auto __opstate = __async::connect(static_cast<_Sndr&&>(__sndr), __rcvr_t{&__state});
+      __async::start(__opstate);
+
+      // Wait for the result to be filled in, and process any work that
+      // may be delegated to this thread.
+      __state.__loop_.run();
+
+      if (__state.__eptr_)
+      {
+        ::std::rethrow_exception(__state.__eptr_);
+      }
+
+      return __result; // uses NRVO to "return" the result
+    }
+  }
+};
+
+_CCCL_GLOBAL_CONSTANT sync_wait_t sync_wait{};
+} // namespace cuda::experimental::__async
+
+#  include <cuda/experimental/__async/epilogue.cuh>
+
+#endif // !defined(__CUDA_ARCH__)
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/then.cuh b/cudax/include/cuda/experimental/__async/then.cuh
new file mode 100644
index 0000000000..059f24e8ab
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/then.cuh
@@ -0,0 +1,303 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_THEN
+#define __CUDAX_ASYNC_DETAIL_THEN
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__type_traits/is_same.h>
+
+#include <cuda/experimental/__async/completion_signatures.cuh>
+#include <cuda/experimental/__async/cpos.cuh>
+#include <cuda/experimental/__async/exception.cuh>
+#include <cuda/experimental/__async/meta.cuh>
+#include <cuda/experimental/__async/tuple.cuh>
+#include <cuda/experimental/__async/utility.cuh>
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+// Forward-declare the then and upon_* algorithm tag types:
+struct then_t;
+struct upon_error_t;
+struct upon_stopped_t;
+
+// Map from a disposition to the corresponding tag types:
+namespace __detail
+{
+template <__disposition_t, class _Void = void> // primary template: no tag type for an unknown disposition
+extern __undefined<_Void> __upon_tag;
+template <class _Void>
+extern __fn_t<then_t>* __upon_tag<__value, _Void>; // __value -> then_t
+template <class _Void>
+extern __fn_t<upon_error_t>* __upon_tag<__error, _Void>; // __error -> upon_error_t
+template <class _Void>
+extern __fn_t<upon_stopped_t>* __upon_tag<__stopped, _Void>; // __stopped -> upon_stopped_t
+} // namespace __detail; these are declarations only and are used inside decltype(...) by __upon_t
+
+namespace __upon
+{
+// Maps the result type of the user function to the completion signatures it
+// contributes. Selected on two flags: whether the result is `void`, and
+// whether the call is known not to throw.
+
+// Result is not void, call may throw: value completion plus exception_ptr error.
+template <bool _IsVoid, bool _Nothrow>
+struct __completion_fn
+{
+  template <class _Result>
+  using __f = completion_signatures<set_value_t(_Result), set_error_t(::std::exception_ptr)>;
+};
+
+// Result is void, call may throw: empty value completion plus exception_ptr error.
+template <>
+struct __completion_fn<true, false>
+{
+  template <class>
+  using __f = completion_signatures<set_value_t(), set_error_t(::std::exception_ptr)>;
+};
+
+// Result is not void, call does not throw: value completion only.
+template <>
+struct __completion_fn<false, true>
+{
+  template <class _Result>
+  using __f = completion_signatures<set_value_t(_Result)>;
+};
+
+// Result is void, call does not throw: empty value completion only.
+template <>
+struct __completion_fn<true, true>
+{
+  template <class>
+  using __f = completion_signatures<set_value_t()>;
+};
+
+// Picks the specialization above for a given result type and nothrow flag.
+template <class _Result, bool _Nothrow>
+using __completion_ = __minvoke1<__completion_fn<_CUDA_VSTD::is_same_v<_Result, void>, _Nothrow>, _Result>;
+
+// Completion signatures contributed by calling `_Fn` with `_Ts...`.
+template <class _Fn, class... _Ts>
+using __completion = __completion_<__call_result_t<_Fn, _Ts...>, __nothrow_callable<_Fn, _Ts...>>;
+} // namespace __upon
+
+template <__disposition_t _Disposition> // Shared implementation of then / upon_error / upon_stopped, selected by disposition.
+struct __upon_t
+{
+#if !defined(_CCCL_CUDA_COMPILER_NVCC)
+
+private:
+#endif // _CCCL_CUDA_COMPILER_NVCC
+  using _UponTag = decltype(__detail::__upon_tag<_Disposition>()); // algorithm tag type for this disposition
+  using _SetTag  = decltype(__detail::__set_tag<_Disposition>()); // completion tag whose results are passed to the function
+
+  template <class _Fn, class... _Ts>
+  using __error_not_callable = // diagnostic type produced when _Fn cannot be called with _Ts...
+    _ERROR< //
+      _WHERE(_IN_ALGORITHM, _UponTag),
+      _WHAT(_FUNCTION_IS_NOT_CALLABLE),
+      _WITH_FUNCTION(_Fn),
+      _WITH_ARGUMENTS(_Ts...)>;
+
+  template <class _Fn>
+  struct __transform_completion // maps one set of completion arguments to the completions of calling _Fn with them
+  {
+    template <class... _Ts>
+    using __f = __minvoke<__mtry_quote<__upon::__completion, __error_not_callable<_Fn, _Ts...>>, _Fn, _Ts...>;
+  };
+
+  template <class _CvSndr, class _Fn, class _Rcvr>
+  using __completions = // completions of the child: those tagged _SetTag are transformed, the rest use __default_completions
+    __gather_completion_signatures<completion_signatures_of_t<_CvSndr, _Rcvr>,
+                                   _SetTag,
+                                   __transform_completion<_Fn>::template __f,
+                                   __default_completions,
+                                   __mtry_quote<__concat_completion_signatures>::__f>;
+
+  template <class _Rcvr, class _CvSndr, class _Fn>
+  struct __opstate_t // Operation state; a pointer to it is also the receiver connected to the child sender.
+  {
+    _CCCL_HOST_DEVICE friend env_of_t<_Rcvr> get_env(const __opstate_t* __self) noexcept // env of the downstream receiver
+    {
+      return __async::get_env(__self->__rcvr_);
+    }
+
+    using operation_state_concept = operation_state_t;
+    using completion_signatures   = __completions<_CvSndr, _Fn, __opstate_t*>;
+
+    _Rcvr __rcvr_;
+    _Fn __fn_;
+    connect_result_t<_CvSndr, __opstate_t*> __opstate_; // child operation, connected with `this` as its receiver
+
+    _CCCL_HOST_DEVICE __opstate_t(_CvSndr&& __sndr, _Rcvr __rcvr, _Fn __fn)
+        : __rcvr_{static_cast<_Rcvr&&>(__rcvr)}
+        , __fn_{static_cast<_Fn&&>(__fn)}
+        , __opstate_{__async::connect(static_cast<_CvSndr&&>(__sndr), this)}
+    {}
+
+    _CUDAX_IMMOVABLE(__opstate_t); // the child operation holds a pointer to this object
+
+    _CCCL_HOST_DEVICE void start() & noexcept
+    {
+      __async::start(__opstate_);
+    }
+
+    template <bool _CanThrow = false, class... _Ts>
+    _CCCL_HOST_DEVICE void __set(_Ts&&... __ts) noexcept(!_CanThrow) // calls __fn_ and completes __rcvr_ with its result
+    {
+      if constexpr (_CanThrow || __nothrow_callable<_Fn, _Ts...>) // direct path: no try/catch at this level
+      {
+        if constexpr (_CUDA_VSTD::is_same_v<void, __call_result_t<_Fn, _Ts...>>)
+        {
+          static_cast<_Fn&&>(__fn_)(static_cast<_Ts&&>(__ts)...);
+          __async::set_value(static_cast<_Rcvr&&>(__rcvr_));
+        }
+        else
+        {
+          __async::set_value(static_cast<_Rcvr&&>(__rcvr_), static_cast<_Fn&&>(__fn_)(static_cast<_Ts&&>(__ts)...));
+        }
+      }
+      else // guarded path: re-enter with _CanThrow = true and turn an exception into set_error
+      {
+        _CUDAX_TRY( //
+          ({ //
+            __set<true>(static_cast<_Ts&&>(__ts)...); //
+          }), //
+          _CUDAX_CATCH(...)( //
+            { //
+              __async::set_error(static_cast<_Rcvr&&>(__rcvr_), ::std::current_exception());
+            }))
+      }
+    }
+
+    template <class _Tag, class... _Ts>
+    _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void __complete(_Tag, _Ts&&... __ts) noexcept // common entry for all three completions
+    {
+      if constexpr (_CUDA_VSTD::is_same_v<_Tag, _SetTag>) // the completion this algorithm handles: run the function
+      {
+        __set(static_cast<_Ts&&>(__ts)...);
+      }
+      else // any other completion is passed through to the downstream receiver unchanged
+      {
+        _Tag()(static_cast<_Rcvr&&>(__rcvr_), static_cast<_Ts&&>(__ts)...);
+      }
+    }
+
+    template <class... _Ts>
+    _CCCL_HOST_DEVICE void set_value(_Ts&&... __ts) noexcept
+    {
+      __complete(set_value_t(), static_cast<_Ts&&>(__ts)...);
+    }
+
+    template <class _Error>
+    _CCCL_HOST_DEVICE void set_error(_Error&& __error) noexcept
+    {
+      __complete(set_error_t(), static_cast<_Error&&>(__error));
+    }
+
+    _CCCL_HOST_DEVICE void set_stopped() noexcept
+    {
+      __complete(set_stopped_t());
+    }
+  };
+
+  template <class _Fn, class _Sndr>
+  struct __sndr_t // Sender returned by the two-argument operator(): stores the function and the child sender.
+  {
+    using sender_concept = sender_t;
+    _CCCL_NO_UNIQUE_ADDRESS _UponTag __tag_;
+    _Fn __fn_;
+    _Sndr __sndr_;
+
+    template <class _Rcvr>
+    _CCCL_HOST_DEVICE auto connect(_Rcvr __rcvr) && // rvalue connect: child sender and function are passed on as rvalues
+      noexcept(__nothrow_constructible<__opstate_t<_Rcvr, _Sndr, _Fn>, _Sndr, _Rcvr, _Fn>) //
+      -> __opstate_t<_Rcvr, _Sndr, _Fn>
+    {
+      return __opstate_t<_Rcvr, _Sndr, _Fn>{
+        static_cast<_Sndr&&>(__sndr_), static_cast<_Rcvr&&>(__rcvr), static_cast<_Fn&&>(__fn_)};
+    }
+
+    template <class _Rcvr>
+    _CCCL_HOST_DEVICE auto connect(_Rcvr __rcvr) const& // const lvalue connect: child sender by const reference, function copied
+      noexcept(__nothrow_constructible<__opstate_t<_Rcvr, const _Sndr&, _Fn>,
+                                       const _Sndr&,
+                                       _Rcvr,
+                                       const _Fn&>) //
+      -> __opstate_t<_Rcvr, const _Sndr&, _Fn>
+    {
+      return __opstate_t<_Rcvr, const _Sndr&, _Fn>{__sndr_, static_cast<_Rcvr&&>(__rcvr), __fn_};
+    }
+
+    _CCCL_HOST_DEVICE env_of_t<_Sndr> get_env() const noexcept // env of the child sender
+    {
+      return __async::get_env(__sndr_);
+    }
+  };
+
+  template <class _Fn>
+  struct __closure_t // Result of the one-argument operator(): holds the function until a sender is supplied.
+  {
+    using _UponTag = decltype(__detail::__upon_tag<_Disposition>());
+    _Fn __fn_;
+
+    template <class _Sndr>
+    _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto operator()(_Sndr __sndr) -> __call_result_t<_UponTag, _Sndr, _Fn> // moves __fn_ out of the closure
+    {
+      return _UponTag()(static_cast<_Sndr&&>(__sndr), static_cast<_Fn&&>(__fn_));
+    }
+
+    template <class _Sndr>
+    _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE friend auto operator|(_Sndr __sndr, __closure_t&& __self) // pipe form: sndr | closure
+      -> __call_result_t<_UponTag, _Sndr, _Fn>
+    {
+      return _UponTag()(static_cast<_Sndr&&>(__sndr), static_cast<_Fn&&>(__self.__fn_));
+    }
+  };
+
+public:
+  template <class _Sndr, class _Fn>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE auto operator()(_Sndr __sndr, _Fn __fn) const noexcept // builds the composed sender
+    -> __sndr_t<_Fn, _Sndr>
+  {
+    // If the incoming sender is non-dependent, we can check the completion
+    // signatures of the composed sender immediately.
+    if constexpr (__is_non_dependent_sender<_Sndr>)
+    {
+      using __completions = completion_signatures_of_t<__sndr_t<_Fn, _Sndr>>;
+      static_assert(__is_completion_signatures<__completions>);
+    }
+    return __sndr_t<_Fn, _Sndr>{{}, static_cast<_Fn&&>(__fn), static_cast<_Sndr&&>(__sndr)};
+  }
+
+  template <class _Fn>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE auto operator()(_Fn __fn) const noexcept // returns a closure usable with operator|
+  {
+    return __closure_t<_Fn>{static_cast<_Fn&&>(__fn)};
+  }
+};
+
+_CCCL_GLOBAL_CONSTANT struct then_t : __upon_t<__value> // `then`: the function receives the child's value completion
+{
+} then{};
+
+_CCCL_GLOBAL_CONSTANT struct upon_error_t : __upon_t<__error> // `upon_error`: the function receives the child's error completion
+{
+} upon_error{};
+
+_CCCL_GLOBAL_CONSTANT struct upon_stopped_t : __upon_t<__stopped> // `upon_stopped`: the function runs on the child's stopped completion
+{
+} upon_stopped{};
+} // namespace cuda::experimental::__async
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/thread.cuh b/cudax/include/cuda/experimental/__async/thread.cuh
new file mode 100644
index 0000000000..d048bdc34d
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/thread.cuh
@@ -0,0 +1,85 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_THREAD
+#define __CUDAX_ASYNC_DETAIL_THREAD
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/experimental/__async/config.cuh>
+
+#include <thread>
+
+#if defined(__CUDACC__)
+#  include <nv/target>
+#  define _CUDAX_FOR_HOST_OR_DEVICE(_FOR_HOST, _FOR_DEVICE) NV_IF_TARGET(NV_IS_HOST, _FOR_HOST, _FOR_DEVICE)
+#else
+#  define _CUDAX_FOR_HOST_OR_DEVICE(_FOR_HOST, _FOR_DEVICE) {_NV_EVAL _FOR_HOST}
+#endif
+
+namespace cuda::experimental::__async
+{
+#if defined(__CUDA_ARCH__)
+using __thread_id = int;
+#elif defined(_CCCL_COMPILER_NVHPC)
+struct __thread_id
+{
+  union
+  {
+    ::std::thread::id __host_;
+    int __device_;
+  };
+
+  _CCCL_HOST_DEVICE __thread_id() noexcept
+      : __host_()
+  {}
+  _CCCL_HOST_DEVICE __thread_id(::std::thread::id __host) noexcept
+      : __host_(__host)
+  {}
+  _CCCL_HOST_DEVICE __thread_id(int __device) noexcept
+      : __device_(__device)
+  {}
+
+  _CCCL_HOST_DEVICE friend bool operator==(const __thread_id& __self, const __thread_id& __other) noexcept
+  {
+    _CUDAX_FOR_HOST_OR_DEVICE((return __self.__host_ == __other.__host_;),
+                              (return __self.__device_ == __other.__device_;))
+  }
+
+  _CCCL_HOST_DEVICE friend bool operator!=(const __thread_id& __self, const __thread_id& __other) noexcept
+  {
+    return !(__self == __other);
+  }
+};
+#else
+using __thread_id = ::std::thread::id;
+#endif
+
+inline _CCCL_HOST_DEVICE __thread_id __this_thread_id() noexcept
+{ // Host: ::std::this_thread::get_id(). Device: threadIdx.x + blockIdx.x * blockDim.x as int (x dimension only).
+  _CUDAX_FOR_HOST_OR_DEVICE((return ::std::this_thread::get_id();),
+                            (return static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);))
+}
+
+inline _CCCL_HOST_DEVICE void __this_thread_yield() noexcept
+{ // Host: forwards to ::std::this_thread::yield(). Device: the `void();` expression, i.e. nothing happens.
+  _CUDAX_FOR_HOST_OR_DEVICE((::std::this_thread::yield();), (void();))
+}
+} // namespace cuda::experimental::__async
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/thread_context.cuh b/cudax/include/cuda/experimental/__async/thread_context.cuh
new file mode 100644
index 0000000000..1ca98a0417
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/thread_context.cuh
@@ -0,0 +1,73 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_THREAD_CONTEXT
+#define __CUDAX_ASYNC_DETAIL_THREAD_CONTEXT
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/experimental/__async/config.cuh>
+
+#if !defined(__CUDA_ARCH__)
+
+#  include <cuda/experimental/__async/run_loop.cuh>
+
+#  include <thread>
+
+#  include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+struct thread_context // Owns a run_loop together with a ::std::thread that runs it.
+{
+  thread_context() noexcept // Starts the worker thread; the thread body calls __loop_.run().
+      : __thrd_{[this] {
+        __loop_.run();
+      }}
+  {}
+  // Destructor: delegates to join().
+  ~thread_context() noexcept
+  {
+    join();
+  }
+  // While the worker is joinable: calls __loop_.finish(), then joins the thread. Otherwise does nothing.
+  void join() noexcept
+  {
+    if (__thrd_.joinable())
+    {
+      __loop_.finish(); // NOTE(review): presumably what lets run() return on the worker -- confirm in run_loop.cuh
+      __thrd_.join();
+    }
+  }
+  // Returns whatever __loop_.get_scheduler() returns.
+  auto get_scheduler()
+  {
+    return __loop_.get_scheduler();
+  }
+
+private:
+  run_loop __loop_; // declared before __thrd_, so it is constructed before the thread that uses it is started
+  ::std::thread __thrd_;
+};
+} // namespace cuda::experimental::__async
+
+#  include <cuda/experimental/__async/epilogue.cuh>
+
+#endif // !defined(__CUDA_ARCH__)
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/tuple.cuh b/cudax/include/cuda/experimental/__async/tuple.cuh
new file mode 100644
index 0000000000..7704728781
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/tuple.cuh
@@ -0,0 +1,104 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_TUPLE
+#define __CUDAX_ASYNC_DETAIL_TUPLE
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/experimental/__async/config.cuh>
+#include <cuda/experimental/__async/meta.cuh>
+#include <cuda/experimental/__async/type_traits.cuh>
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+template <size_t _Idx, class _Ty>
+struct __box
+{
+  // Too many compiler bugs with [[no_unique_address]] to use it here.
+  // E.g., https://github.com/llvm/llvm-project/issues/88077
+  // _CCCL_NO_UNIQUE_ADDRESS
+  _Ty __value_;
+};
+
+template <size_t _Idx, class _Ty>
+_CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE constexpr auto __cget(__box<_Idx, _Ty> const& __box) noexcept -> _Ty const&
+{
+  return __box.__value_;
+}
+
+template <class _Idx, class... _Ts>
+struct __tupl;
+
+template <size_t... _Idx, class... _Ts>
+struct __tupl<__mindices<_Idx...>, _Ts...> : __box<_Idx, _Ts>...
+{
+  template <class _Fn, class _Self, class... _Us>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE static auto __apply(_Fn&& __fn, _Self&& __self, _Us&&... __us) //
+    noexcept(noexcept(static_cast<_Fn&&>(__fn)(static_cast<_Us&&>(__us)...,
+                                               static_cast<_Self&&>(__self).__box<_Idx, _Ts>::__value_...)))
+      -> decltype(static_cast<_Fn&&>(__fn)(static_cast<_Us&&>(__us)...,
+                                           static_cast<_Self&&>(__self).__box<_Idx, _Ts>::__value_...))
+  {
+    return static_cast<_Fn&&>(
+      __fn)(static_cast<_Us&&>(__us)..., static_cast<_Self&&>(__self).__box<_Idx, _Ts>::__value_...);
+  }
+
+  template <class _Fn, class _Self, class... _Us>
+  _CUDAX_ALWAYS_INLINE _CCCL_HOST_DEVICE static auto __for_each(_Fn&& __fn, _Self&& __self, _Us&&... __us) //
+    noexcept((__nothrow_callable<_Fn, _Us..., __copy_cvref_t<_Self, _Ts>>
+              && ...)) -> __mif<(__callable<_Fn, _Us..., __copy_cvref_t<_Self, _Ts>> && ...)>
+  {
+    return (
+      static_cast<_Fn&&>(__fn)(static_cast<_Us&&>(__us)..., static_cast<_Self&&>(__self).__box<_Idx, _Ts>::__value_),
+      ...);
+  }
+};
+
+template <class... _Ts>
+_CCCL_HOST_DEVICE __tupl(_Ts...) //
+  -> __tupl<__mmake_indices<sizeof...(_Ts)>, _Ts...>;
+
+template <class _Fn, class _Tupl, class... _Us>
+using __apply_result_t =
+  decltype(__declval<_Tupl>().__apply(__declval<_Fn>(), __declval<_Tupl>(), __declval<_Us>()...));
+
+#if defined(_CCCL_COMPILER_MSVC)
+template <class... _Ts>
+struct __mk_tuple_
+{
+  using __indices_t = __mmake_indices<sizeof...(_Ts)>;
+  using type        = __tupl<__indices_t, _Ts...>;
+};
+
+template <class... _Ts>
+using __tuple = __t<__mk_tuple_<_Ts...>>;
+#else
+template <class... _Ts>
+using __tuple = __tupl<__mmake_indices<sizeof...(_Ts)>, _Ts...>;
+#endif
+
+template <class... _Ts>
+using __decayed_tuple = __tuple<__decay_t<_Ts>...>;
+} // namespace cuda::experimental::__async
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/type_traits.cuh b/cudax/include/cuda/experimental/__async/type_traits.cuh
new file mode 100644
index 0000000000..2fcdb78fde
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/type_traits.cuh
@@ -0,0 +1,258 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_TYPE_TRAITS
+#define __CUDAX_ASYNC_DETAIL_TYPE_TRAITS
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__type_traits/remove_reference.h>
+
+#include <cuda/experimental/__async/config.cuh>
+#include <cuda/experimental/__async/meta.cuh>
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+#if __has_builtin(__remove_reference)
+
+template <class _Ty>
+using __remove_ref_t = __remove_reference(_Ty);
+
+#elif __has_builtin(__remove_reference_t)
+
+template <class _Ty>
+using __remove_ref_t = __remove_reference_t(_Ty);
+
+#else
+
+template <class _Ty>
+using __remove_ref_t = _CUDA_VSTD::remove_reference_t<_Ty>;
+
+#endif
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// __decay_t: An efficient implementation for ::std::decay
+#if __has_builtin(__decay)
+
+template <class _Ty>
+using __decay_t = __decay(_Ty);
+
+// #elif defined(_CCCL_COMPILER_NVHPC)
+
+//   template <class _Ty>
+//   using __decay_t = _CUDA_VSTD::decay_t<_Ty>;
+
+#else
+
+struct __decay_object
+{
+  template <class _Ty>
+  static _Ty __g(_Ty const&);
+  template <class _Ty>
+  using __f = decltype(__g(__declval<_Ty>()));
+};
+
+struct __decay_default
+{
+  template <class _Ty>
+  static _Ty __g(_Ty);
+  template <class _Ty>
+  using __f = decltype(__g(__declval<_Ty>()));
+};
+
+// I don't care to support abominable function types,
+// but if that's needed, this is the way to do it:
+// struct __decay_abominable {
+//   template <class _Ty>
+//   using __f = _Ty;
+// };
+
+struct __decay_void
+{
+  template <class _Ty>
+  using __f = void;
+};
+
+template <class _Ty>
+extern __decay_object __mdecay;
+
+template <class _Ty, class... _Us>
+extern __decay_default __mdecay<_Ty(_Us...)>;
+
+template <class _Ty, class... _Us>
+extern __decay_default __mdecay<_Ty(_Us...) noexcept>;
+
+template <class _Ty, class... _Us>
+extern __decay_default __mdecay<_Ty (&)(_Us...)>;
+
+template <class _Ty, class... _Us>
+extern __decay_default __mdecay<_Ty (&)(_Us...) noexcept>;
+
+// template <class _Ty, class... _Us>
+// extern __decay_abominable __mdecay<_Ty(_Us...) const>;
+
+// template <class _Ty, class... _Us>
+// extern __decay_abominable __mdecay<_Ty(_Us...) const noexcept>;
+
+// template <class _Ty, class... _Us>
+// extern __decay_abominable __mdecay<_Ty(_Us...) const &>;
+
+// template <class _Ty, class... _Us>
+// extern __decay_abominable __mdecay<_Ty(_Us...) const & noexcept>;
+
+// template <class _Ty, class... _Us>
+// extern __decay_abominable __mdecay<_Ty(_Us...) const &&>;
+
+// template <class _Ty, class... _Us>
+// extern __decay_abominable __mdecay<_Ty(_Us...) const && noexcept>;
+
+template <class _Ty>
+extern __decay_default __mdecay<_Ty[]>;
+
+template <class _Ty, size_t _Ny>
+extern __decay_default __mdecay<_Ty[_Ny]>;
+
+template <class _Ty, size_t _Ny>
+extern __decay_default __mdecay<_Ty (&)[_Ny]>;
+
+template <>
+inline __decay_void __mdecay<void>;
+
+template <>
+inline __decay_void __mdecay<void const>;
+
+template <class _Ty>
+using __decay_t = typename decltype(__mdecay<_Ty>)::template __f<_Ty>;
+
+#endif
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// __copy_cvref_t: For copying cvref from one type to another
+struct __cp
+{
+  template <class _Tp>
+  using __f = _Tp;
+};
+
+struct __cpc
+{
+  template <class _Tp>
+  using __f = const _Tp;
+};
+
+struct __cplr
+{
+  template <class _Tp>
+  using __f = _Tp&;
+};
+
+struct __cprr
+{
+  template <class _Tp>
+  using __f = _Tp&&;
+};
+
+struct __cpclr
+{
+  template <class _Tp>
+  using __f = const _Tp&;
+};
+
+struct __cpcrr
+{
+  template <class _Tp>
+  using __f = const _Tp&&;
+};
+
+template <class>
+extern __cp __cpcvr;
+template <class _Tp>
+extern __cpc __cpcvr<const _Tp>;
+template <class _Tp>
+extern __cplr __cpcvr<_Tp&>;
+template <class _Tp>
+extern __cprr __cpcvr<_Tp&&>;
+template <class _Tp>
+extern __cpclr __cpcvr<const _Tp&>;
+template <class _Tp>
+extern __cpcrr __cpcvr<const _Tp&&>;
+template <class _Tp>
+using __copy_cvref_fn = decltype(__cpcvr<_Tp>);
+
+template <class _From, class _To>
+using __copy_cvref_t = typename __copy_cvref_fn<_From>::template __f<_To>;
+
+template <class _Fn, class... _As>
+using __call_result_t = decltype(__declval<_Fn>()(__declval<_As>()...));
+
+template <class _Fn, class... _As>
+_CCCL_INLINE_VAR constexpr bool __callable = __mvalid_q<__call_result_t, _Fn, _As...>;
+
+#if defined(__CUDA_ARCH__)
+template <class _Fn, class... _As>
+_CCCL_INLINE_VAR constexpr bool __nothrow_callable = true;
+
+template <class _Ty, class... _As>
+_CCCL_INLINE_VAR constexpr bool __nothrow_constructible = true;
+
+template <class... _As>
+_CCCL_INLINE_VAR constexpr bool __nothrow_decay_copyable = true;
+
+template <class... _As>
+_CCCL_INLINE_VAR constexpr bool __nothrow_movable = true;
+
+template <class... _As>
+_CCCL_INLINE_VAR constexpr bool __nothrow_copyable = true;
+#else
+template <class _Fn, class... _As>
+using __nothrow_callable_ = __mif<noexcept(__declval<_Fn>()(__declval<_As>()...))>;
+
+template <class _Fn, class... _As>
+_CCCL_INLINE_VAR constexpr bool __nothrow_callable = __mvalid_q<__nothrow_callable_, _Fn, _As...>;
+
+template <class _Ty, class... _As>
+using __nothrow_constructible_ = __mif<noexcept(_Ty{__declval<_As>()...})>;
+
+template <class _Ty, class... _As>
+_CCCL_INLINE_VAR constexpr bool __nothrow_constructible = __mvalid_q<__nothrow_constructible_, _Ty, _As...>;
+
+template <class _Ty>
+using __nothrow_decay_copyable_ = __mif<noexcept(__decay_t<_Ty>(__declval<_Ty>()))>;
+
+template <class... _As>
+_CCCL_INLINE_VAR constexpr bool __nothrow_decay_copyable = (__mvalid_q<__nothrow_decay_copyable_, _As> && ...);
+
+template <class _Ty>
+using __nothrow_movable_ = __mif<noexcept(_Ty(__declval<_Ty>()))>;
+
+template <class... _As>
+_CCCL_INLINE_VAR constexpr bool __nothrow_movable = (__mvalid_q<__nothrow_movable_, _As> && ...);
+
+template <class _Ty>
+using __nothrow_copyable_ = __mif<noexcept(_Ty(__declval<const _Ty&>()))>;
+
+template <class... _As>
+_CCCL_INLINE_VAR constexpr bool __nothrow_copyable = (__mvalid_q<__nothrow_copyable_, _As> && ...);
+#endif
+} // namespace cuda::experimental::__async
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/utility.cuh b/cudax/include/cuda/experimental/__async/utility.cuh
new file mode 100644
index 0000000000..4018610da9
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/utility.cuh
@@ -0,0 +1,208 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_UTILITY
+#define __CUDAX_ASYNC_DETAIL_UTILITY
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__type_traits/is_same.h>
+#include <cuda/std/initializer_list>
+
+#include <cuda/experimental/__async/config.cuh>
+#include <cuda/experimental/__async/meta.cuh>
+#include <cuda/experimental/__async/type_traits.cuh>
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+_CCCL_GLOBAL_CONSTANT size_t __npos = static_cast<size_t>(-1); // "no position" sentinel: every bit of size_t set
+
+struct __ignore // Constructible from any argument list; the arguments are discarded.
+{
+  template <class... _As>
+  _CCCL_HOST_DEVICE constexpr __ignore(_As&&...) noexcept {}
+};
+
+template <class...>
+struct __undefined;
+
+struct __empty
+{};
+
+struct [[deprecated]] __deprecated
+{};
+
+struct __nil
+{};
+
+struct __immovable
+{
+  __immovable() = default;
+  _CUDAX_IMMOVABLE(__immovable);
+};
+
+_CCCL_HOST_DEVICE constexpr size_t __maximum(_CUDA_VSTD::initializer_list<size_t> __il) noexcept
+{
+  // Returns the largest element of __il. The running maximum starts at 0, so
+  // an empty list yields 0.
+  size_t __largest = 0;
+  for (const size_t __candidate : __il)
+  {
+    // Strict comparison: the current maximum is kept on ties.
+    __largest = __candidate > __largest ? __candidate : __largest;
+  }
+  return __largest;
+}
+
+_CCCL_HOST_DEVICE constexpr size_t __find_pos(bool const* const __begin, bool const* const __end) noexcept
+{
+  // Returns the offset from __begin of the first `true` in [__begin, __end),
+  // or __npos when every element in the range is `false`.
+  bool const* __cursor = __begin;
+  while (__cursor != __end && !*__cursor)
+  {
+    ++__cursor;
+  }
+  return __cursor == __end ? __npos : static_cast<size_t>(__cursor - __begin);
+}
+
+template <class _Ty, class... _Ts> // index of the first _Ts that is the same type as _Ty, or __npos if none is
+_CCCL_HOST_DEVICE constexpr size_t __index_of() noexcept
+{
+  constexpr bool __same[] = {_CUDA_VSTD::is_same_v<_Ty, _Ts>..., false}; // trailing entry: no zero-length array
+  return __async::__find_pos(__same, __same + sizeof...(_Ts)); // the trailing entry is outside the searched range
+}
+
+template <class _Ty, class _Uy = _Ty>
+_CCCL_HOST_DEVICE constexpr _Ty __exchange(_Ty& __obj, _Uy&& __new_value) noexcept
+{
+  // Replaces the value of __obj with __new_value and returns the old value.
+  // The function is unconditionally noexcept, so both the move construction
+  // and the assignment below are required to be non-throwing.
+  static_assert(noexcept(_Ty(static_cast<_Ty&&>(__obj))) //
+                && noexcept(__obj = static_cast<_Uy&&>(__new_value)));
+  _Ty __previous = static_cast<_Ty&&>(__obj);
+  __obj          = static_cast<_Uy&&>(__new_value);
+  return __previous;
+}
+
+template <class _Ty>
+_CCCL_HOST_DEVICE constexpr void __swap(_Ty& __left, _Ty& __right) noexcept
+{
+  // Exchanges the values of __left and __right with three moves through a
+  // temporary. Move construction and move assignment of _Ty must not throw.
+  static_assert(noexcept(_Ty(static_cast<_Ty&&>(__left))) //
+                && noexcept(__left = static_cast<_Ty&&>(__right)));
+
+  _Ty __saved = static_cast<_Ty&&>(__left);
+  __left      = static_cast<_Ty&&>(__right);
+  __right     = static_cast<_Ty&&>(__saved);
+}
+
+template <class _Ty> // returns a decayed copy of __ty: always an object by value, never a reference
+_CCCL_HOST_DEVICE constexpr __decay_t<_Ty> __decay_copy(_Ty&& __ty) noexcept(__nothrow_decay_copyable<_Ty>)
+{
+  return static_cast<_Ty&&>(__ty); // forwarded into the by-value result: lvalues are copied, rvalues are moved
+}
+
+_CCCL_DIAG_PUSH
+_CCCL_DIAG_SUPPRESS_GCC("-Wnon-template-friend")
+_CCCL_NV_DIAG_SUPPRESS(probable_guiding_friend)
+
+// __zip/__unzip is for keeping type names short. It has the unfortunate side
+// effect of obfuscating the types.
+namespace
+{
+template <size_t _Ny>
+struct __slot
+{
+  friend constexpr auto __slot_allocated(__slot<_Ny>);
+};
+
+template <class _Type, size_t _Ny>
+struct __allocate_slot
+{
+  static constexpr size_t __value = _Ny;
+
+  friend constexpr auto __slot_allocated(__slot<_Ny>)
+  {
+    return static_cast<_Type (*)()>(nullptr);
+  }
+};
+
+template <class _Type, size_t _Id = 0, size_t _Pow2 = 0>
+constexpr size_t __next(long);
+
+// If __slot_allocated(__slot<_Id>) has NOT been defined, then SFINAE will keep
+// this function out of the overload set...
+template <class _Type, //
+          size_t _Id   = 0,
+          size_t _Pow2 = 0,
+          bool         = !__slot_allocated(__slot<_Id + (1 << _Pow2) - 1>())>
+constexpr size_t __next(int)
+{
+  return __async::__next<_Type, _Id, _Pow2 + 1>(0);
+}
+
+template <class _Type, size_t _Id, size_t _Pow2>
+constexpr size_t __next(long)
+{
+  if constexpr (_Pow2 == 0)
+  {
+    return __allocate_slot<_Type, _Id>::__value;
+  }
+  else
+  {
+    return __async::__next<_Type, _Id + (1 << (_Pow2 - 1)), 0>(0);
+  }
+}
+
+// Prior to Clang 12, we can't use the __slot trick to erase long type names
+// because of a compiler bug. We'll just use the original type name in that case.
+#if defined(_CCCL_COMPILER_CLANG) && _CCCL_CLANG_VERSION < 120000
+
+template <class _Type>
+using __zip = _Type;
+
+template <class _Id>
+using __unzip = _Id;
+
+#else
+
+template <class _Type, size_t _Val = __async::__next<_Type>(0)>
+using __zip = __slot<_Val>;
+
+template <class _Id>
+using __unzip = decltype(__slot_allocated(_Id())());
+
+#endif
+
+// burn the first slot
+using __ignore_this_typedef [[maybe_unused]] = __zip<void>;
+} // namespace
+
+_CCCL_NV_DIAG_DEFAULT(probable_guiding_friend)
+_CCCL_DIAG_POP
+
+} // namespace cuda::experimental::__async
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/variant.cuh b/cudax/include/cuda/experimental/__async/variant.cuh
new file mode 100644
index 0000000000..4f38e914cd
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/variant.cuh
@@ -0,0 +1,192 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_VARIANT
+#define __CUDAX_ASYNC_DETAIL_VARIANT
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__memory/construct_at.h>
+#include <cuda/std/__new/launder.h>
+
+#include <cuda/experimental/__async/meta.cuh>
+#include <cuda/experimental/__async/type_traits.cuh>
+#include <cuda/experimental/__async/utility.cuh>
+
+#include <new> // IWYU pragma: keep
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+/********************************************************************************/
+/* NB: The variant type implemented here default-constructs into the valueless  */
+/* state. This is different from std::variant which default-constructs into the */
+/* first alternative. This is done to simplify the implementation and to avoid  */
+/* the need for a default constructor for each alternative type.                */
+/********************************************************************************/
+
+template <class _Idx, class... _Ts>
+class __variant_impl;
+
+template <>
+class __variant_impl<__mindices<>>
+{
+public:
+  template <class _Fn, class... _Us>
+  _CCCL_HOST_DEVICE void __visit(_Fn&&, _Us&&...) const noexcept
+  {}
+};
+
+template <size_t... _Idx, class... _Ts>
+class __variant_impl<__mindices<_Idx...>, _Ts...>
+{
+  static constexpr size_t __max_size = __maximum({sizeof(_Ts)...});
+  static_assert(__max_size != 0);
+  size_t __index_{__npos};
+  alignas(_Ts...) unsigned char __storage_[__max_size];
+
+  template <size_t _Ny>
+  using __at = __m_at_c<_Ny, _Ts...>;
+
+  _CCCL_HOST_DEVICE void __destroy() noexcept
+  {
+    if (__index_ != __npos)
+    {
+      // make this local in case destroying the sub-object destroys *this
+      const auto index = __async::__exchange(__index_, __npos);
+      ((_Idx == index ? _CUDA_VSTD::destroy_at(static_cast<__at<_Idx>*>(__ptr())) : void(0)), ...);
+    }
+  }
+
+public:
+  _CUDAX_IMMOVABLE(__variant_impl);
+
+  _CCCL_HOST_DEVICE __variant_impl() noexcept {}
+
+  _CCCL_HOST_DEVICE ~__variant_impl()
+  {
+    __destroy();
+  }
+
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void* __ptr() noexcept
+  {
+    return __storage_;
+  }
+
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE size_t __index() const noexcept
+  {
+    return __index_;
+  }
+
+  template <class _Ty, class... _As>
+  _CCCL_HOST_DEVICE _Ty& __emplace(_As&&... __as) //
+    noexcept(__nothrow_constructible<_Ty, _As...>)
+  {
+    constexpr size_t __new_index = __async::__index_of<_Ty, _Ts...>();
+    static_assert(__new_index != __npos, "_Type not in variant");
+
+    __destroy();
+    _Ty* __value = ::new (__ptr()) _Ty{static_cast<_As&&>(__as)...};
+    __index_     = __new_index;
+    return *_CUDA_VSTD::launder(__value);
+  }
+
+  template <size_t _Ny, class... _As>
+  _CCCL_HOST_DEVICE __at<_Ny>& __emplace_at(_As&&... __as) //
+    noexcept(__nothrow_constructible<__at<_Ny>, _As...>)
+  {
+    static_assert(_Ny < sizeof...(_Ts), "variant index is too large");
+
+    __destroy();
+    __at<_Ny>* __value = ::new (__ptr()) __at<_Ny>{static_cast<_As&&>(__as)...};
+    __index_           = _Ny;
+    return *_CUDA_VSTD::launder(__value);
+  }
+
+  template <class _Fn, class... _As>
+  _CCCL_HOST_DEVICE auto __emplace_from(_Fn&& __fn, _As&&... __as) //
+    noexcept(__nothrow_callable<_Fn, _As...>) -> __call_result_t<_Fn, _As...>&
+  {
+    using __result_t             = __call_result_t<_Fn, _As...>;
+    constexpr size_t __new_index = __async::__index_of<__result_t, _Ts...>();
+    static_assert(__new_index != __npos, "_Type not in variant");
+
+    __destroy();
+    __result_t* __value = ::new (__ptr()) __result_t(static_cast<_Fn&&>(__fn)(static_cast<_As&&>(__as)...));
+    __index_            = __new_index;
+    return *_CUDA_VSTD::launder(__value);
+  }
+
+  template <class _Fn, class _Self, class... _As>
+  _CCCL_HOST_DEVICE static void __visit(_Fn&& __fn, _Self&& __self, _As&&... __as) //
+    noexcept((__nothrow_callable<_Fn, _As..., __copy_cvref_t<_Self, _Ts>> && ...))
+  {
+    // make this local in case destroying the sub-object destroys *this
+    const auto index = __self.__index_;
+    _LIBCUDACXX_ASSERT(index != __npos, "");
+    ((_Idx == index
+        ? static_cast<_Fn&&>(__fn)(static_cast<_As&&>(__as)..., static_cast<_Self&&>(__self).template __get<_Idx>())
+        : void()),
+     ...);
+  }
+
+  template <size_t _Ny>
+  _CCCL_HOST_DEVICE __at<_Ny>&& __get() && noexcept
+  {
+    _LIBCUDACXX_ASSERT(_Ny == __index_, "");
+    return static_cast<__at<_Ny>&&>(*static_cast<__at<_Ny>*>(__ptr()));
+  }
+
+  template <size_t _Ny>
+  _CCCL_HOST_DEVICE __at<_Ny>& __get() & noexcept
+  {
+    _LIBCUDACXX_ASSERT(_Ny == __index_, "");
+    return *static_cast<__at<_Ny>*>(__ptr());
+  }
+
+  template <size_t _Ny> // read-only access to the alternative at index _Ny; asserts that it is the active one
+  _CCCL_HOST_DEVICE const __at<_Ny>& __get() const& noexcept
+  {
+    _LIBCUDACXX_ASSERT(_Ny == __index_, "");
+    return *static_cast<const __at<_Ny>*>(static_cast<const void*>(__storage_)); // __ptr() is non-const
+  }
+};
+
+#if defined(_CCCL_COMPILER_MSVC)
+template <class... _Ts>
+struct __mk_variant_
+{
+  using __indices_t = __mmake_indices<sizeof...(_Ts)>;
+  using type        = __variant_impl<__indices_t, _Ts...>;
+};
+
+template <class... _Ts>
+using __variant = __t<__mk_variant_<_Ts...>>;
+#else
+template <class... _Ts>
+using __variant = __variant_impl<__mmake_indices<sizeof...(_Ts)>, _Ts...>;
+#endif
+
+template <class... _Ts>
+using __decayed_variant = __variant<__decay_t<_Ts>...>;
+} // namespace cuda::experimental::__async
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/when_all.cuh b/cudax/include/cuda/experimental/__async/when_all.cuh
new file mode 100644
index 0000000000..8cb8d621d0
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/when_all.cuh
@@ -0,0 +1,650 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_WHEN_ALL
+#define __CUDAX_ASYNC_DETAIL_WHEN_ALL
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/atomic>
+
+#include <cuda/experimental/__async/completion_signatures.cuh>
+#include <cuda/experimental/__async/config.cuh>
+#include <cuda/experimental/__async/cpos.cuh>
+#include <cuda/experimental/__async/env.cuh>
+#include <cuda/experimental/__async/exception.cuh>
+#include <cuda/experimental/__async/lazy.cuh>
+#include <cuda/experimental/__async/meta.cuh>
+#include <cuda/experimental/__async/stop_token.cuh>
+#include <cuda/experimental/__async/tuple.cuh>
+#include <cuda/experimental/__async/type_traits.cuh>
+#include <cuda/experimental/__async/utility.cuh>
+#include <cuda/experimental/__async/variant.cuh>
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+_CCCL_DIAG_PUSH
+_CCCL_NV_DIAG_SUPPRESS(expr_has_no_effect) // comma folds such as __last_offset discard operands on purpose
+_CCCL_DIAG_SUPPRESS_GCC("-Wunused-value") // GCC counterpart of the suppression above
+
+namespace cuda::experimental::__async
+{
+// Forward declare the when_all tag type:
+struct when_all_t;
+
+// Some mechanics for computing a when_all sender's completion signatures:
+namespace __when_all
+{
+template <class>
+struct __env_t; // environment exposed by the child receivers (defined below)
+
+template <class, size_t>
+struct __rcvr_t; // receiver connected to one child sender (defined below)
+
+template <class, class, class>
+struct __opstate_t; // when_all's operation state (defined below)
+// Error type that results when a child sender has more than one set_value completion signature.
+using __tombstone = _ERROR<_WHERE(_IN_ALGORITHM, when_all_t), _WHAT(_SENDER_HAS_TOO_MANY_SUCCESS_COMPLETIONS)>;
+
+// Use this to short-circuit the computation of whether all values and
+// errors are nothrow decay-copyable. _Bool is the answer accumulated so far.
+template <class _Bool>
+struct __all_nothrow_decay_copyable
+{
+  static_assert(_CUDA_VSTD::is_same_v<_Bool, __mtrue>); // the primary template only handles __mtrue
+  template <class... _Ts>
+  using __f = __mbool<__nothrow_decay_copyable<_Ts...>>; // still all-nothrow so far: test _Ts... too
+};
+// Once the accumulated answer is __mfalse it stays __mfalse; _Ts... are not inspected at all.
+template <>
+struct __all_nothrow_decay_copyable<__mfalse>
+{
+  template <class... _Ts>
+  using __f = __mfalse;
+};
+
+//////////////////////////////////////////////////////////////////////////////////////
+// This type is used to compute the completion signatures contributed by one of
+// when_all's child senders. It tracks the completions, whether decay-copying the
+// values and errors can throw, and also which of the when_all's value result
+// datums this sender is responsible for setting.
+// _NothrowVals/_NothrowErrors are __mtrue or __mfalse; _Offsets is an __moffsets<...>.
+// Leave this undefined; only its template arguments carry information:
+template <class _NothrowVals, class _NothrowErrors, class _Offsets, class... _Sigs>
+struct __completion_metadata;
+
+//////////////////////////////////////////////////////////////////////////////////////
+// Convert the metadata type into a completion signatures type by adding a
+// set_error_t(exception_ptr) completion if decay-copying any of the values
+// or errors is possibly throwing, and then removing duplicate completion
+// signatures.
+template <class>
+struct __reduce_completions; // specialized on T& because the metadata folds yield lvalue references
+// An error from the metadata computation is passed through, paired with empty offsets.
+template <class... _What>
+struct __reduce_completions<_ERROR<_What...>&>
+{
+  using type = __mpair<_ERROR<_What...>, __moffsets<>>;
+};
+// General case: a decay-copy may throw, so a set_error_t(exception_ptr) completion is added.
+template <class _ValsOK, class _ErrsOK, class _Offsets, class... _Sigs>
+struct __reduce_completions<__completion_metadata<_ValsOK, _ErrsOK, _Offsets, _Sigs...>&>
+{
+  using type = __mpair< //
+    __concat_completion_signatures<completion_signatures<_Sigs..., set_error_t(::std::exception_ptr)>>,
+    _Offsets>;
+};
+// Both flags are __mtrue: no decay-copy can throw, so no extra error completion is added.
+template <class _Offsets, class... _Sigs>
+struct __reduce_completions<__completion_metadata<__mtrue, __mtrue, _Offsets, _Sigs...>&>
+{
+  using type = __mpair<__concat_completion_signatures<completion_signatures<_Sigs...>>, _Offsets>;
+};
+// The resulting (completions-or-error, offsets) pair for the metadata reference _Ty.
+template <class _Ty>
+using __reduce_completions_t = __t<__reduce_completions<_Ty>>;
+
+//////////////////////////////////////////////////////////////////////////////////////
+// __append_completion
+//
+// We use a set of partial specializations of the __append_completion variable
+// template to append the metadata from a single completion signature into a
+// metadata struct, and we use a fold expression to append all N completion
+// signatures.
+template <class _Metadata, class _Sig>
+extern __undefined<_Metadata> __append_completion;
+// Any non-set_value signature: decay its arguments, append it at the back, update the errors flag.
+template <class _ValsOK, class _ErrsOK, class... _Sigs, class _Tag, class... _As>
+extern __completion_metadata<_ValsOK,
+                             __minvoke<__all_nothrow_decay_copyable<_ErrsOK>, _As...>,
+                             __moffsets<>,
+                             _Sigs...,
+                             _Tag(__decay_t<_As>...)>&
+  __append_completion<__completion_metadata<_ValsOK, _ErrsOK, __moffsets<>, _Sigs...>, _Tag(_As...)>;
+
+// This overload is selected when we see the first set_value_t completion
+// signature.
+template <class _ValsOK, class _ErrsOK, class... _Sigs, class... _As>
+extern __completion_metadata<
+  __minvoke<__all_nothrow_decay_copyable<_ValsOK>, _As...>,
+  _ErrsOK,
+  __moffsets<>,
+  set_value_t(__decay_t<_As>...), // Insert the value signature at the front
+  _Sigs...>& __append_completion<__completion_metadata<_ValsOK, _ErrsOK, __moffsets<>, _Sigs...>, set_value_t(_As...)>;
+
+// This overload is selected when we see the second set_value_t completion
+// signature. Senders passed to when_all are only allowed one set_value
+// completion.
+template <class _ValsOK, class _ErrsOK, class... _Sigs, class... _As, class... _Bs>
+extern __tombstone&
+  __append_completion<__completion_metadata<_ValsOK, _ErrsOK, __moffsets<>, set_value_t(_As...), _Sigs...>,
+                      set_value_t(_Bs...)>;
+
+// This specialization is selected for every signature that follows the
+// tombstone, so that the error survives the rest of the fold. It is keyed on
+// __tombstone (not __tombstone&) because operator| deduces a non-reference type.
+template <class _Sig>
+extern __tombstone& __append_completion<__tombstone, _Sig>;
+
+// We use a fold expression over the bitwise OR operator to append all of the
+// completion signatures from one child sender into a metadata struct.
+template <class _Metadata, class _Sig>
+auto operator|(_Metadata&, _Sig*) -> decltype(__append_completion<_Metadata, _Sig>); // declaration only
+
+// The initial value of the fold expression: nothing can throw yet, no offsets, no signatures.
+using __inner_fold_init = __completion_metadata<__mtrue, __mtrue, __moffsets<>>;
+// Left-folds the signatures _Sigs... of one child sender into __inner_fold_init, one at a time.
+template <class... _Sigs>
+using __collect_inner = //
+  decltype((__declval<__inner_fold_init&>() | ... | static_cast<_Sigs*>(nullptr)));
+
+//////////////////////////////////////////////////////////////////////////////////////
+// __merge_metadata
+//
+// After computing a metadata struct for each child sender, all the metadata
+// structs must be merged. We use a set of partial specializations of the
+// __merge_metadata variable template to merge two metadata structs into one,
+// and we use a fold expression to merge all N into one.
+template <class _Meta1, class _Meta2>
+extern __undefined<_Meta1> __merge_metadata;
+
+// This specialization causes an error on the right-hand side to be propagated.
+template <class _ValsOK, class _ErrsOK, class _Offsets, class... _LeftSigs, class... _What>
+extern _ERROR<_What...>&
+  __merge_metadata<__completion_metadata<_ValsOK, _ErrsOK, _Offsets, _LeftSigs...>, _ERROR<_What...>>;
+
+// This overload is selected when the left and right metadata are both for senders
+// that have no set_value completion signature.
+template <class _LeftValsOK,
+          class _LeftErrsOK,
+          class _Offsets,
+          class... _LeftSigs,
+          class _RightValsOK,
+          class _RightErrsOK,
+          class... _RightSigs>
+extern __completion_metadata<__mtrue, __mand<_LeftErrsOK, _RightErrsOK>, __moffsets<>, _LeftSigs..., _RightSigs...>&
+  __merge_metadata<__completion_metadata<_LeftValsOK, _LeftErrsOK, _Offsets, _LeftSigs...>,
+                   __completion_metadata<_RightValsOK, _RightErrsOK, __moffsets<>, _RightSigs...>>;
+
+// The following two specializations are selected when one of the metadata
+// structs is for a sender with no value completions. In that case, the
+// when_all can never complete successfully, so drop the other set_value
+// completion signature (and reset the offsets to __moffsets<>).
+template <class _LeftValsOK,
+          class _LeftErrsOK,
+          class _Offsets,
+          class... _As,
+          class... _LeftSigs,
+          class _RightValsOK,
+          class _RightErrsOK,
+          class... _RightSigs>
+extern __completion_metadata<__mtrue, // There will be no value completion, so values need not be copied.
+                             __mand<_LeftErrsOK, _RightErrsOK>,
+                             __moffsets<>,
+                             _LeftSigs...,
+                             _RightSigs...>&
+  __merge_metadata<__completion_metadata<_LeftValsOK, _LeftErrsOK, _Offsets, set_value_t(_As...), _LeftSigs...>,
+                   __completion_metadata<_RightValsOK, _RightErrsOK, __moffsets<>, _RightSigs...>>;
+// Mirror image of the specialization above: here only the right-hand side has a value completion.
+template <class _LeftValsOK,
+          class _LeftErrsOK,
+          class _Offsets,
+          class... _LeftSigs,
+          class _RightValsOK,
+          class _RightErrsOK,
+          class... _As,
+          class... _RightSigs>
+extern __completion_metadata<__mtrue, // There will be no value completion, so values need not be copied.
+                             __mand<_LeftErrsOK, _RightErrsOK>,
+                             __moffsets<>,
+                             _LeftSigs...,
+                             _RightSigs...>&
+  __merge_metadata<__completion_metadata<_LeftValsOK, _LeftErrsOK, _Offsets, _LeftSigs...>,
+                   __completion_metadata<_RightValsOK, _RightErrsOK, __moffsets<>, set_value_t(_As...), _RightSigs...>>;
+
+template <size_t... _Offsets>
+_CCCL_INLINE_VAR constexpr size_t __last_offset = (0, ..., _Offsets); // comma fold: last offset, or 0 if empty
+// Appends (_Count + the last offset) to the offsets, i.e. a running total of the value counts.
+template <size_t _Count, size_t... _Offsets>
+using __append_offset = __moffsets<_Offsets..., _Count + __last_offset<_Offsets...>>;
+
+// This overload is selected when both metadata structs are for senders with
+// a single value completion. Concatenate the value types and extend the offsets.
+template <class _LeftValsOK,
+          class _LeftErrsOK,
+          size_t... _Offsets,
+          class... _As,
+          class... _LeftSigs,
+          class _RightValsOK,
+          class _RightErrsOK,
+          class... _Bs,
+          class... _RightSigs>
+extern __completion_metadata<__mand<_LeftValsOK, _RightValsOK>,
+                             __mand<_LeftErrsOK, _RightErrsOK>,
+                             __append_offset<sizeof...(_Bs), _Offsets...>,
+                             set_value_t(_As..., _Bs...), // Concatenate the value types.
+                             _LeftSigs...,
+                             _RightSigs...>&
+  __merge_metadata<
+    __completion_metadata<_LeftValsOK, _LeftErrsOK, __moffsets<_Offsets...>, set_value_t(_As...), _LeftSigs...>,
+    __completion_metadata<_RightValsOK, _RightErrsOK, __moffsets<>, set_value_t(_Bs...), _RightSigs...>>;
+// Once the left-hand side is an error, it stays that error whatever is merged in next.
+template <class... _What, class _Other>
+extern _ERROR<_What...>& __merge_metadata<_ERROR<_What...>, _Other>;
+
+// We use a fold expression over the bitwise AND operator to merge all the
+// completion metadata structs from the child senders into a single metadata
+// struct.
+template <class _Meta1, class _Meta2>
+auto operator&(_Meta1&, _Meta2&) -> decltype(__merge_metadata<_Meta1, _Meta2>); // declaration only
+
+// The initial value for the fold: an empty value completion at offset 0, plus set_stopped_t().
+using __outer_fold_init = __completion_metadata<__mtrue, __mtrue, __moffsets<0ul>, set_value_t(), set_stopped_t()>;
+// Merges the per-child metadata (_Sigs...) and reduces the result to a (completions, offsets) pair.
+template <class... _Sigs>
+using __collect_outer = //
+  __reduce_completions_t<decltype((__declval<__outer_fold_init&>() & ... & __declval<_Sigs>()))>;
+
+// Extract the first template parameter of the __state_t specialization.
+// The first template parameter is the receiver type.
+template <class _State>
+using __rcvr_from_state_t = __mapply<__mpoly_q<__mfront>, _State>; // _Rcvr of __state_t<_Rcvr, _CvFn, _Sndrs>
+
+/// The receivers connected to the when_all's sub-operations expose this as
+/// their environment. Its `get_stop_token` query returns the token from
+/// when_all's stop source. All other queries are forwarded to the outer
+/// receiver's environment.
+template <class _StateZip>
+struct __env_t
+{
+  using __state_t = __unzip<_StateZip>;
+  using __rcvr_t  = __rcvr_from_state_t<__state_t>;
+
+  __state_t& __state_;
+  // Spelled `query`, like the forwarding overload below, so that both belong to one overload set.
+  _CCCL_HOST_DEVICE inplace_stop_token query(get_stop_token_t) const noexcept
+  {
+    return __state_.__stop_token_;
+  }
+  // Every other query is answered by the outer receiver's environment, provided it supports the query.
+  template <class _Tag>
+  _CCCL_HOST_DEVICE auto query(_Tag) const noexcept -> __query_result_t<_Tag, env_of_t<__rcvr_t>>
+  {
+    return __async::get_env(__state_.__rcvr_).query(_Tag());
+  }
+};
+
+template <class _StateZip, size_t _Index>
+struct __rcvr_t // receiver handed to one child operation; _Index selects its offset into the values
+{
+  using receiver_concept = receiver_t;
+  using __state_t        = __unzip<_StateZip>;
+
+  __state_t& __state_; // the when_all state that all completions below report to
+  // Value completion: store the values in the shared state, then report arrival.
+  template <class... _Ts>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void set_value(_Ts&&... __ts) noexcept
+  {
+    using __jdx_t = __mmake_indices<sizeof...(_Ts)>; // one index per value of this completion
+    __state_.template __set_value<_Index>(__jdx_t(), static_cast<_Ts&&>(__ts)...);
+    __state_.__arrive();
+  }
+  // Error completion: hand the error to the shared state, then report arrival.
+  template <class _Error>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void set_error(_Error&& __error) noexcept
+  {
+    __state_.__set_error(static_cast<_Error&&>(__error));
+    __state_.__arrive();
+  }
+  // Stopped completion: note the cancellation in the shared state, then report arrival.
+  _CCCL_HOST_DEVICE void set_stopped() noexcept
+  {
+    __state_.__set_stopped();
+    __state_.__arrive();
+  }
+  // The environment is a thin view over the same shared state.
+  _CCCL_HOST_DEVICE auto get_env() const noexcept -> __env_t<_StateZip>
+  {
+    return __env_t<_StateZip>{__state_};
+  }
+};
+
+template <class _CvSndr, size_t _Idx, class _StateZip>
+using __inner_completions_ = // __collect_inner applied to the child's completion signatures
+  __mapply_q<__collect_inner, completion_signatures_of_t<_CvSndr, __rcvr_t<_StateZip, _Idx>>>;
+
+template <class _CvSndr, size_t _Idx, class _StateZip>
+using __inner_completions = // as above; presumably an error result is tagged with the offending sender
+  __midentity_or_error_with< //
+    __inner_completions_<_CvSndr, _Idx, _StateZip>, //
+    _WITH_SENDER(_CvSndr)>;
+
+enum __estate_t : int // values stored in __state_t::__state_
+{
+  __started, // initial state; no error or stop has been recorded
+  __error, // written by __state_t::__set_error
+  __stopped // written by __state_t::__set_stopped, only when the state was still __started
+};
+
+/// @brief The data stored in the operation state and referred to
+/// by the receiver.
+/// @tparam _Rcvr The receiver connected to the when_all sender.
+/// @tparam _CvFn A metafunction to apply cv- and ref-qualifiers to the senders
+/// @tparam _Sndrs A tuple of the when_all sender's child senders.
+template <class _Rcvr, class _CvFn, class _Sndrs>
+struct __state_t;
+
+template <class _Rcvr, class _CvFn, size_t... _Idx, class... _Sndrs>
+struct __state_t<_Rcvr, _CvFn, __tupl<__mindices<_Idx...>, _Sndrs...>>
+{
+  using __completions_offsets_pair_t = // (completion signatures or error, value offsets)
+    __collect_outer< //
+      __inner_completions<__minvoke1<_CvFn, _Sndrs>, _Idx, __zip<__state_t>>...>;
+  using __completions_t = __mfirst<__completions_offsets_pair_t>;
+  using __indices_t     = __mindices<_Idx...>;
+  using __offsets_t     = __msecond<__completions_offsets_pair_t>; // start position of each child's values
+  using __values_t      = __value_types<__completions_t, __lazy_tuple, __mpoly<__msingle_or<__nil>>::__f>;
+  using __errors_t      = __error_types<__completions_t, __variant>;
+
+  using __stop_tok_t      = stop_token_of_t<env_of_t<_Rcvr>>;
+  using __stop_callback_t = stop_callback_for_t<__stop_tok_t, __on_stop_request>;
+  /// Takes ownership of the receiver; __count is the number of arrivals that completes the operation.
+  _CCCL_HOST_DEVICE explicit __state_t(_Rcvr __rcvr, size_t __count)
+      : __rcvr_{static_cast<_Rcvr&&>(__rcvr)}
+      , __count_{__count}
+      , __stop_source_{}
+      , __stop_token_{__stop_source_.get_token()}
+      , __state_{__started}
+      , __errors_{}
+      , __values_{}
+      , __on_stop_{}
+  {}
+  /// Returns the _Index-th offset. The trailing 0 keeps the array non-empty when there are no offsets.
+  template <size_t _Index, size_t... _Offsets>
+  static constexpr size_t __offset_for(__moffsets<_Offsets...>*) noexcept
+  {
+    constexpr size_t __offsets[] = {_Offsets..., 0};
+    return __offsets[_Index];
+  }
+  /// Stores the values of child _Index in __values_, at positions (offset of the child) + _Jdx...
+  template <size_t _Index, size_t... _Jdx, class... _Ts>
+  _CCCL_HOST_DEVICE void __set_value(__mindices<_Jdx...>, [[maybe_unused]] _Ts&&... __ts) noexcept
+  {
+    [[maybe_unused]] constexpr size_t _Offset = __offset_for<_Index>(static_cast<__offsets_t*>(nullptr));
+    if constexpr (!_CUDA_VSTD::is_same_v<__values_t, __nil>) // __nil: there is nowhere to store values
+    {
+      if constexpr (__nothrow_decay_copyable<_Ts...>)
+      {
+        (__values_.template __emplace<_Jdx + _Offset>(static_cast<_Ts&&>(__ts)), ...);
+      }
+      else
+      {
+        _CUDAX_TRY( //
+          ({ //
+            (__values_.template __emplace<_Jdx + _Offset>(static_cast<_Ts&&>(__ts)), ...);
+          }),
+          _CUDAX_CATCH(...)( //
+            { // storing a value threw: report it through the error path instead
+              __set_error(::std::current_exception());
+            }))
+      }
+    }
+  }
+  /// Moves to the error state; unless an error was already recorded, requests stop and stores __err.
+  template <class _Error>
+  _CCCL_HOST_DEVICE void __set_error(_Error&& __err) noexcept
+  {
+    // TODO: Use weaker memory orders
+    if (__error != __state_.exchange(__error))
+    {
+      __stop_source_.request_stop();
+      // We won the race, free to write the error into the operation state
+      // without worry.
+      if constexpr (__nothrow_decay_copyable<_Error>)
+      {
+        __errors_.template __emplace<__decay_t<_Error>>(static_cast<_Error&&>(__err));
+      }
+      else
+      {
+        _CUDAX_TRY( //
+          ({ //
+            __errors_.template __emplace<__decay_t<_Error>>(static_cast<_Error&&>(__err));
+          }),
+          _CUDAX_CATCH(...)( //
+            { // copying the error itself threw: store that exception instead
+              __errors_.template __emplace<::std::exception_ptr>(::std::current_exception());
+            }))
+      }
+    }
+  }
+  /// Moves from "started" to "stopped" and, if that transition happened, requests stop.
+  _CCCL_HOST_DEVICE void __set_stopped() noexcept
+  {
+    _CUDA_VSTD::underlying_type_t<__estate_t> __expected = __started;
+    // Transition to the "stopped" state if and only if we're in the
+    // "started" state. (If this fails, we are already stopped or in an
+    // error state; an error trumps cancellation.)
+    if (__state_.compare_exchange_strong(__expected, static_cast<_CUDA_VSTD::underlying_type_t<__estate_t>>(__stopped)))
+    {
+      __stop_source_.request_stop();
+    }
+  }
+  /// Called once per child completion; the arrival that brings the count to zero completes the operation.
+  _CCCL_HOST_DEVICE void __arrive() noexcept
+  {
+    if (0 == --__count_)
+    {
+      __complete();
+    }
+  }
+  /// Destroys the stop callback, then completes the outer receiver according to the final state.
+  _CCCL_HOST_DEVICE void __complete() noexcept
+  {
+    // Stop callback is no longer needed. Destroy it.
+    __on_stop_.destroy();
+    // All child operations have completed and arrived at the barrier.
+    switch (__state_.load(_CUDA_VSTD::memory_order_relaxed))
+    {
+      case __started:
+        if constexpr (!_CUDA_VSTD::is_same_v<__values_t, __nil>)
+        {
+          // All child operations completed successfully:
+          __values_.__apply(__async::set_value, static_cast<__values_t&&>(__values_), static_cast<_Rcvr&&>(__rcvr_));
+        }
+        break;
+      case __error:
+        // One or more child operations completed with an error:
+        __errors_.__visit(__async::set_error, static_cast<__errors_t&&>(__errors_), static_cast<_Rcvr&&>(__rcvr_));
+        break;
+      case __stopped:
+        __async::set_stopped(static_cast<_Rcvr&&>(__rcvr_));
+        break;
+      default:;
+    }
+  }
+
+  _Rcvr __rcvr_; // the receiver connected to the when_all sender
+  _CUDA_VSTD::atomic<size_t> __count_; // number of arrivals still outstanding
+  inplace_stop_source __stop_source_; // used to request stop on the child operations
+  inplace_stop_token __stop_token_; // token of __stop_source_, returned by __env_t's stop-token query
+  _CUDA_VSTD::atomic<_CUDA_VSTD::underlying_type_t<__estate_t>> __state_; // holds an __estate_t value
+  __errors_t __errors_; // the error to deliver, if any
+  __values_t __values_; // the values to deliver (or __nil)
+  __lazy<__stop_callback_t> __on_stop_; // constructed in start(), destroyed before completing
+};
+
+/// The operation state for when_all
+template <class _Rcvr, class _CvFn, size_t... _Idx, class... _Sndrs>
+struct __opstate_t<_Rcvr, _CvFn, __tupl<__mindices<_Idx...>, _Sndrs...>>
+{
+  using operation_state_concept = operation_state_t;
+  using __sndrs_t               = __minvoke<_CvFn, __tuple<_Sndrs...>>; // the sender tuple, cv/ref-qualified by _CvFn
+  using __state_t               = __when_all::__state_t<_Rcvr, _CvFn, __tupl<__mindices<_Idx...>, _Sndrs...>>;
+
+  using completion_signatures = typename __state_t::__completions_t;
+  using __offsets_t           = typename __state_t::__offsets_t;
+
+  // This function object is used to connect all the sub-operations with
+  // receivers, each of which knows which elements in the values tuple it
+  // is responsible for setting.
+  struct __connect_subs_fn
+  {
+    template <class... _CvSndrs>
+    _CCCL_HOST_DEVICE auto operator()(__state_t& __state, _CvSndrs&&... __sndrs_) const
+    {
+      using __state_ref_t = __zip<__state_t>;
+      if constexpr (_CUDA_VSTD::is_same_v<__offsets_t, __moffsets<>>)
+      {
+        // When there are no offsets, the when_all sender has no value
+        // completions. All child senders can be connected to receivers
+        // of the same type.
+        return __tupl{__async::connect(static_cast<_CvSndrs&&>(__sndrs_), __rcvr_t<__state_ref_t, 0>{__state})...};
+      }
+      else
+      {
+        // The offsets are used to determine which elements in the values
+        // tuple each receiver is responsible for setting.
+        return __tupl{__async::connect(static_cast<_CvSndrs&&>(__sndrs_), __rcvr_t<__state_ref_t, _Idx>{__state})...};
+      }
+    }
+  };
+
+  // This is a tuple of operation states for the sub-operations.
+  using __sub_opstates_t = __apply_result_t<__connect_subs_fn, __sndrs_t, __state_t&>;
+  // Declaration order matters: __state_ is initialized first because __sub_ops_ is connected against it.
+  __state_t __state_;
+  __sub_opstates_t __sub_ops_;
+
+  /// Initialize the data member, connect all the sub-operations and
+  /// save the resulting operation states in __sub_ops_.
+  _CCCL_HOST_DEVICE __opstate_t(__sndrs_t&& __sndrs_, _Rcvr __rcvr)
+      : __state_{static_cast<_Rcvr&&>(__rcvr), sizeof...(_Sndrs)}
+      , __sub_ops_{__sndrs_.__apply(__connect_subs_fn(), static_cast<__sndrs_t&&>(__sndrs_), __state_)}
+  {}
+  // The child receivers hold references to __state_, so this object must not be moved.
+  _CUDAX_IMMOVABLE(__opstate_t);
+
+  /// Start all the sub-operations.
+  _CCCL_HOST_DEVICE void start() & noexcept
+  {
+    // register stop callback:
+    __state_.__on_stop_.construct(
+      get_stop_token(__async::get_env(__state_.__rcvr_)), __on_stop_request{__state_.__stop_source_});
+
+    if (__state_.__stop_source_.stop_requested())
+    {
+      // Manually clean up the stop callback. We won't be starting the
+      // sub-operations, so they won't complete and clean up for us.
+      __state_.__on_stop_.destroy();
+
+      // Stop has already been requested. Don't bother starting the child
+      // operations.
+      __async::set_stopped(static_cast<_Rcvr&&>(__state_.__rcvr_));
+    }
+    else
+    {
+      // Start all the sub-operations.
+      __sub_ops_.__for_each(__async::start, __sub_ops_);
+
+      // If there are no sub-operations, we're done.
+      if constexpr (sizeof...(_Sndrs) == 0)
+      {
+        __state_.__complete();
+      }
+    }
+  }
+};
+
+template <class... _Sndrs>
+struct __sndr_t;
+} // namespace __when_all
+
+struct when_all_t // tag/function-object type of the when_all algorithm
+{
+  template <class... _Sndrs> // the child senders are taken by value and stored in the returned sender
+  _CCCL_HOST_DEVICE __when_all::__sndr_t<_Sndrs...> operator()(_Sndrs... __sndrs_) const;
+};
+
+// Sender returned by when_all; it stores the child senders in a tuple until it is connected.
+template <class... _Sndrs>
+struct __when_all::__sndr_t
+{
+  using sender_concept = sender_t;
+  using __sndrs_t      = __tuple<_Sndrs...>;
+
+  _CCCL_NO_UNIQUE_ADDRESS when_all_t __tag_;
+  _CCCL_NO_UNIQUE_ADDRESS __ignore __ignore1_;
+  __sndrs_t __sndrs_;
+  // Rvalue connect: the stored child senders are handed to the operation state as rvalues.
+  template <class _Rcvr>
+  _CCCL_HOST_DEVICE auto connect(_Rcvr __rcvr) && -> __opstate_t<_Rcvr, __cp, __sndrs_t>
+  {
+    return {static_cast<__sndrs_t&&>(__sndrs_), static_cast<_Rcvr&&>(__rcvr)};
+  }
+
+  // Const-lvalue connect: the stored child senders are passed along as const lvalues.
+  template <class _Rcvr>
+  _CCCL_HOST_DEVICE auto connect(_Rcvr __rcvr) const& -> __opstate_t<_Rcvr, __cpclr, __sndrs_t>
+  {
+    return {__sndrs_, static_cast<_Rcvr&&>(__rcvr)};
+  }
+};
+
+template <class... _Sndrs>
+_CCCL_HOST_DEVICE __when_all::__sndr_t<_Sndrs...> when_all_t::operator()(_Sndrs... __sndrs_) const
+{
+  using __result_t = __when_all::__sndr_t<_Sndrs...>;
+  // When none of the child senders is dependent, the composed sender's completion
+  // signatures can be computed already, so validate them right here.
+  if constexpr ((__is_non_dependent_sender<_Sndrs> && ...))
+  {
+    static_assert(__is_completion_signatures<completion_signatures_of_t<__result_t>>);
+  }
+  return __result_t{{}, {}, {static_cast<_Sndrs&&>(__sndrs_)...}};
+}
+
+_CCCL_GLOBAL_CONSTANT when_all_t when_all{};
+
+} // namespace cuda::experimental::__async
+
+_CCCL_NV_DIAG_DEFAULT(expr_has_no_effect)
+_CCCL_DIAG_POP
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
diff --git a/cudax/include/cuda/experimental/__async/write_env.cuh b/cudax/include/cuda/experimental/__async/write_env.cuh
new file mode 100644
index 0000000000..2bc2e5064e
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/write_env.cuh
@@ -0,0 +1,118 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_WRITE_ENV
+#define __CUDAX_ASYNC_DETAIL_WRITE_ENV
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__type_traits/conditional.h>
+
+#include <cuda/experimental/__async/config.cuh>
+#include <cuda/experimental/__async/cpos.cuh>
+#include <cuda/experimental/__async/env.cuh>
+#include <cuda/experimental/__async/exception.cuh>
+#include <cuda/experimental/__async/queries.cuh>
+#include <cuda/experimental/__async/rcvr_with_env.cuh>
+#include <cuda/experimental/__async/utility.cuh>
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+struct write_env_t
+{
+#if !defined(_CCCL_CUDA_COMPILER_NVCC)
+  // The nested types below are private, except when compiling with NVCC, where they stay public.
+private:
+#endif // _CCCL_CUDA_COMPILER_NVCC
+  template <class _Rcvr, class _Sndr, class _Env>
+  struct __opstate_t
+  {
+    using operation_state_concept = operation_state_t;
+    using completion_signatures   = completion_signatures_of_t<_Sndr, __rcvr_with_env_t<_Rcvr, _Env>*>;
+
+    __rcvr_with_env_t<_Rcvr, _Env> __env_rcvr_; // declared first: __opstate_ is connected to its address
+    connect_result_t<_Sndr, __rcvr_with_env_t<_Rcvr, _Env>*> __opstate_; // the wrapped sender's operation state
+    // Builds the receiver-with-environment, then connects the sender to a pointer to it.
+    _CCCL_HOST_DEVICE explicit __opstate_t(_Sndr&& __sndr, _Env __env, _Rcvr __rcvr)
+        : __env_rcvr_(static_cast<_Env&&>(__env), static_cast<_Rcvr&&>(__rcvr))
+        , __opstate_(__async::connect(static_cast<_Sndr&&>(__sndr), &__env_rcvr_))
+    {}
+    // __opstate_ was connected with &__env_rcvr_, so the object must stay where it is.
+    _CUDAX_IMMOVABLE(__opstate_t);
+    // Starting this operation just starts the wrapped one.
+    _CCCL_HOST_DEVICE void start() noexcept
+    {
+      __async::start(__opstate_);
+    }
+  };
+
+  template <class _Sndr, class _Env>
+  struct __sndr_t; // defined after the class
+
+public:
+  /// @brief Wraps one sender in another that modifies the execution
+  /// environment by merging in the environment specified.
+  template <class _Sndr, class _Env>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE constexpr auto operator()(_Sndr, _Env) const //
+    -> __sndr_t<_Sndr, _Env>;
+};
+
+template <class _Sndr, class _Env>
+struct write_env_t::__sndr_t
+{
+  using sender_concept = sender_t;
+  _CCCL_NO_UNIQUE_ADDRESS write_env_t __tag_;
+  _Env __env_; // the environment to merge in
+  _Sndr __sndr_; // the wrapped sender
+
+  template <class _Rcvr>
+  _CCCL_HOST_DEVICE auto connect(_Rcvr __rcvr) && -> __opstate_t<_Rcvr, _Sndr, _Env>
+  {
+    using __op_t = __opstate_t<_Rcvr, _Sndr, _Env>; // consumes the stored sender and environment
+    return __op_t{static_cast<_Sndr&&>(__sndr_), static_cast<_Env&&>(__env_), static_cast<_Rcvr&&>(__rcvr)};
+  }
+
+  template <class _Rcvr>
+  _CCCL_HOST_DEVICE auto connect(_Rcvr __rcvr) const& -> __opstate_t<_Rcvr, const _Sndr&, _Env>
+  {
+    using __op_t = __opstate_t<_Rcvr, const _Sndr&, _Env>; // the sender is only referenced, the env copied
+    return __op_t{__sndr_, __env_, static_cast<_Rcvr&&>(__rcvr)};
+  }
+
+  _CCCL_HOST_DEVICE env_of_t<_Sndr> get_env() const noexcept
+  {
+    return __async::get_env(__sndr_); // the sender's own environment is that of the wrapped sender
+  }
+};
+
+template <class _Sndr, class _Env>
+_CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE constexpr auto write_env_t::operator()(_Sndr __sndr, _Env __env) const //
+  -> write_env_t::__sndr_t<_Sndr, _Env>
+{
+  return {{}, static_cast<_Env&&>(__env), static_cast<_Sndr&&>(__sndr)}; // members in order: tag, env, sender
+}
+
+_CCCL_GLOBAL_CONSTANT write_env_t write_env{};
+
+} // namespace cuda::experimental::__async
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt
index 45a439182d..9c277b3073 100644
--- a/cudax/test/CMakeLists.txt
+++ b/cudax/test/CMakeLists.txt
@@ -69,6 +69,7 @@ foreach(cn_target IN LISTS cudax_TARGETS)
     device/arch_traits.cu
   )
   target_compile_options(${test_target} PRIVATE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--extended-lambda>)
+  target_compile_options(${test_target} PRIVATE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--expt-relaxed-constexpr>)
 
   cudax_add_catch2_test(test_target event_tests ${cn_target}
     event/event_smoke.cu
@@ -92,4 +93,15 @@ foreach(cn_target IN LISTS cudax_TARGETS)
   cudax_add_catch2_test(test_target memory_resource ${cn_target}
     memory_resource/any_resource.cu
   )
+
+  cudax_add_catch2_test(test_target async_tests ${cn_target}
+    async/test_conditional.cu
+    async/test_continue_on.cu
+    async/test_just.cu
+    async/test_sequence.cu
+    async/test_when_all.cu
+  )
+  target_compile_options(${test_target} PRIVATE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--extended-lambda>)
+  target_compile_options(${test_target} PRIVATE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--expt-relaxed-constexpr>)
+
 endforeach()
diff --git a/cudax/test/async/common/checked_receiver.cuh b/cudax/test/async/common/checked_receiver.cuh
new file mode 100755
index 0000000000..25eb1d3ae2
--- /dev/null
+++ b/cudax/test/async/common/checked_receiver.cuh
@@ -0,0 +1,125 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "cuda/experimental/__async/async.cuh"
+#include "testing.cuh"
+
+namespace
+{
+template <class... Values>
+struct checked_value_receiver
+{
+  using receiver_concept = cudax_async::receiver_t;
+  // Test receiver: records the expected values at construction and checks them on completion.
+  checked_value_receiver(Values... values)
+      : _values{{values}...}
+  {}
+
+  // This overload is needed to avoid an nvcc compiler bug where a variadic
+  // pack is not visible within the scope of a lambda.
+  _CCCL_HOST_DEVICE void set_value() && noexcept
+  {
+    if constexpr (!_CUDA_VSTD::is_same_v<cudax_async::__mlist<Values...>, cudax_async::__mlist<>>)
+    {
+      CUDAX_FAIL("expected a value completion; got a different value");
+    }
+  }
+  // Compares the received values with the stored ones; fails when the type lists differ.
+  template <class... As>
+  _CCCL_HOST_DEVICE void set_value(As... as) && noexcept
+  {
+    if constexpr (_CUDA_VSTD::is_same_v<cudax_async::__mlist<Values...>, cudax_async::__mlist<As...>>)
+    {
+      _values.__apply(
+        [&](auto const&... vs) {
+          CUDAX_CHECK(((vs == as) && ...));
+        },
+        _values);
+    }
+    else
+    {
+      CUDAX_FAIL("expected a value completion; got a different value");
+    }
+  }
+  // Any error completion is a test failure for this receiver.
+  template <class Error>
+  _CCCL_HOST_DEVICE void set_error(Error) && noexcept
+  {
+    CUDAX_FAIL("expected a value completion; got an error");
+  }
+  // A stopped completion is a test failure as well.
+  _CCCL_HOST_DEVICE void set_stopped() && noexcept
+  {
+    CUDAX_FAIL("expected a value completion; got stopped");
+  }
+
+  cudax_async::__tuple<Values...> _values;
+};
+// Deduction guide: Values... is taken from the types of the constructor arguments.
+template <class... Values>
+checked_value_receiver(Values...) -> checked_value_receiver<Values...>;
+
+template <class Error>
+struct checked_error_receiver
+{
+  using receiver_concept = cudax_async::receiver_t;
+  // Test receiver that expects an error completion equal to `_error`; values are a failure.
+  template <class... As>
+  _CCCL_HOST_DEVICE void set_value(As...) && noexcept
+  {
+    CUDAX_FAIL("expected an error completion; got a value");
+  }
+  // Passes only when the error has exactly type Error and compares equal to `_error`.
+  template <class Ty>
+  _CCCL_HOST_DEVICE void set_error(Ty ty) && noexcept
+  {
+    if constexpr (_CUDA_VSTD::is_same_v<Error, Ty>)
+    {
+      CUDAX_CHECK(ty == _error);
+    }
+    else
+    {
+      CUDAX_FAIL("expected an error completion; got a different error");
+    }
+  }
+  // A stopped completion is a failure for a receiver that expects an error.
+  _CCCL_HOST_DEVICE void set_stopped() && noexcept
+  {
+    CUDAX_FAIL("expected an error completion; got stopped");
+  }
+
+  Error _error;
+};
+// Deduction guide: the class is an aggregate, so Error is deduced from the initializer.
+template <class Error>
+checked_error_receiver(Error) -> checked_error_receiver<Error>;
+
+struct checked_stopped_receiver
+{
+  using receiver_concept = cudax_async::receiver_t;
+  // Test receiver that accepts only a stopped completion; values and errors are failures.
+  template <class... As>
+  _CCCL_HOST_DEVICE void set_value(As...) && noexcept
+  {
+    CUDAX_FAIL("expected a stopped completion; got a value");
+  }
+
+  template <class Ty>
+  _CCCL_HOST_DEVICE void set_error(Ty) && noexcept
+  {
+    CUDAX_FAIL("expected a stopped completion; got an error");
+  }
+  // The expected outcome: there is nothing to check, so the body is empty.
+  _CCCL_HOST_DEVICE void set_stopped() && noexcept {}
+};
+
+} // namespace
diff --git a/cudax/test/async/common/error_scheduler.cuh b/cudax/test/async/common/error_scheduler.cuh
new file mode 100755
index 0000000000..8df7f08576
--- /dev/null
+++ b/cudax/test/async/common/error_scheduler.cuh
@@ -0,0 +1,101 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "cuda/experimental/__async/async.cuh"
+
+namespace
+{
+//! Scheduler that returns a sender that always completes with error.
+template <class Error>
+struct error_scheduler
+{
+private:
+  struct env_t
+  {
+    _CCCL_HOST_DEVICE auto query(cudax_async::get_completion_scheduler_t<cudax_async::set_value_t>) const noexcept
+    {
+      return error_scheduler{};
+    }
+    // NOTE(review): `error_scheduler{}` needs a default ctor; only error_scheduler(Error) is declared below.
+    _CCCL_HOST_DEVICE auto query(cudax_async::get_completion_scheduler_t<cudax_async::set_stopped_t>) const noexcept
+    {
+      return error_scheduler{};
+    }
+  };
+  // Operation state: start() completes immediately by passing the stored error to set_error.
+  template <class Rcvr>
+  struct opstate_t : cudax_async::__immovable
+  {
+    using operation_state_concept = cudax_async::operation_state_t;
+    using completion_signatures   = //
+      cudax_async::completion_signatures< //
+        cudax_async::set_value_t(), //
+        cudax_async::set_error_t(Error),
+        cudax_async::set_stopped_t()>;
+
+    Rcvr _rcvr;
+    Error _err;
+
+    _CCCL_HOST_DEVICE void start() noexcept
+    {
+      cudax_async::set_error(static_cast<Rcvr&&>(_rcvr), static_cast<Error&&>(_err));
+    }
+  };
+  // Sender returned by schedule(); connect() copies its error into the operation state.
+  struct sndr_t
+  {
+    using sender_concept        = cudax_async::sender_t;
+    using completion_signatures = //
+      cudax_async::completion_signatures< //
+        cudax_async::set_value_t(), //
+        cudax_async::set_error_t(Error),
+        cudax_async::set_stopped_t()>;
+    // Value and stopped signatures are advertised too, though start() only ever calls set_error.
+    template <class Rcvr>
+    _CCCL_HOST_DEVICE opstate_t<Rcvr> connect(Rcvr rcvr) const
+    {
+      return {{}, static_cast<Rcvr&&>(rcvr), _err};
+    }
+
+    _CCCL_HOST_DEVICE env_t get_env() const noexcept
+    {
+      return {};
+    }
+
+    Error _err;
+  };
+  // All instances compare equal, regardless of the stored error.
+  _CCCL_HOST_DEVICE friend bool operator==(error_scheduler, error_scheduler) noexcept
+  {
+    return true;
+  }
+
+  _CCCL_HOST_DEVICE friend bool operator!=(error_scheduler, error_scheduler) noexcept
+  {
+    return false;
+  }
+
+  Error _err{};
+
+public:
+  using scheduler_concept = cudax_async::scheduler_t;
+
+  _CCCL_HOST_DEVICE explicit error_scheduler(Error err)
+      : _err(static_cast<Error&&>(err))
+  {}
+  // Returns a sender that carries a copy of this scheduler's error.
+  _CCCL_HOST_DEVICE sndr_t schedule() const noexcept
+  {
+    return {_err};
+  }
+};
+} // namespace
diff --git a/cudax/test/async/common/impulse_scheduler.cuh b/cudax/test/async/common/impulse_scheduler.cuh
new file mode 100755
index 0000000000..3517e5ac29
--- /dev/null
+++ b/cudax/test/async/common/impulse_scheduler.cuh
@@ -0,0 +1,200 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <condition_variable>
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <vector>
+#include "cuda/experimental/__async/async.cuh"
+
+#if !defined(__CUDA_ARCH__)
+
+namespace
+{
+//! Scheduler that will send impulses on user's request.
+//! One can obtain senders from this, connect them to receivers and start the operation states.
+//! Until the scheduler is told to start the next operation, the actions in the operation states are
+//! not executed. This is similar to a task scheduler, but it's single threaded. It has basic
+//! thread-safety to allow it to be run with `sync_wait` (which makes us not control when the
+//! operation_state object is created and started).
+struct impulse_scheduler
+{
+private:
+  //! Command type that can store the action of firing up a sender
+  using cmd_t     = std::function<void()>;
+  using cmd_vec_t = std::vector<cmd_t>;
+  // NOTE(review): neither alias is used by the members below; data_t spells the types out.
+  struct data_t : std::enable_shared_from_this<data_t>
+  {
+    explicit data_t(int id)
+        : id_(id)
+    {}
+    // mutex_ guards all_commands_; cv_ is notified whenever a command is queued.
+    int id_;
+    std::mutex mutex_;
+    std::condition_variable cv_;
+    std::vector<std::function<void()>> all_commands_;
+  };
+
+  //! The data_t shared between the operation state and the actual scheduler
+  //! Shared pointer to allow the scheduler to be copied (not the best semantics, but it will do)
+  std::shared_ptr<data_t> data_{};
+
+  template <class Rcvr>
+  struct opstate_t : cudax_async::__immovable
+  {
+    using operation_state_concept = cudax_async::operation_state_t;
+    using completion_signatures =
+      cudax_async::completion_signatures<cudax_async::set_value_t(), cudax_async::set_stopped_t()>;
+
+    data_t* data_;
+    Rcvr rcvr_;
+
+    opstate_t(data_t* data, Rcvr&& rcvr)
+        : data_(data)
+        , rcvr_(static_cast<Rcvr&&>(rcvr))
+    {}
+
+    void start() & noexcept
+    {
+      // Enqueue another command to the list of all commands
+      // The scheduler will start this, whenever start_next() is called
+      std::unique_lock lock{data_->mutex_};
+      data_->all_commands_.emplace_back([this]() {
+        if (cudax_async::get_stop_token(cudax_async::get_env(rcvr_)).stop_requested())
+        {
+          cudax_async::set_stopped(static_cast<Rcvr&&>(rcvr_));
+        }
+        else
+        {
+          cudax_async::set_value(static_cast<Rcvr&&>(rcvr_));
+        }
+      });
+      data_->cv_.notify_all(); // wakes a start_next() call blocked on an empty queue
+    }
+  };
+  // Environment of sndr_t: both completion-scheduler queries return a scheduler sharing data_.
+  struct env_t
+  {
+    data_t* data_;
+
+    impulse_scheduler query(cudax_async::get_completion_scheduler_t<cudax_async::set_value_t>) const noexcept
+    {
+      return impulse_scheduler{data_};
+    }
+
+    impulse_scheduler query(cudax_async::get_completion_scheduler_t<cudax_async::set_stopped_t>) const noexcept
+    {
+      return impulse_scheduler{data_};
+    }
+  };
+
+  struct sndr_t
+  {
+    using sender_concept = cudax_async::sender_t;
+    using completion_signatures =
+      cudax_async::completion_signatures<cudax_async::set_value_t(), cudax_async::set_stopped_t()>;
+
+    data_t* data_;
+
+    template <class Rcvr>
+    opstate_t<Rcvr> connect(Rcvr rcvr)
+    {
+      return {data_, static_cast<Rcvr&&>(rcvr)};
+    }
+
+    auto get_env() const noexcept
+    {
+      return env_t{data_};
+    }
+  };
+  // Rebuilds a scheduler from the raw pointer kept by env_t; ownership is shared via shared_from_this().
+  explicit impulse_scheduler(data_t* data)
+      : data_(data->shared_from_this())
+  {}
+
+public:
+  using scheduler_concept = cudax_async::scheduler_t;
+
+  impulse_scheduler()
+      : data_(std::make_shared<data_t>(0))
+  {}
+
+  explicit impulse_scheduler(int id)
+      : data_(std::make_shared<data_t>(id))
+  {}
+
+  ~impulse_scheduler() = default;
+
+  //! Run the oldest queued command (commands are appended at the back and taken from the front)
+  //! Returns immediately if no command registered (i.e., no operation state started)
+  bool try_start_next()
+  {
+    // Take the lock; unlike start_next(), this variant never waits for a command
+    std::unique_lock lock{data_->mutex_};
+
+    // If there are no commands in the queue, return false
+    if (data_->all_commands_.empty())
+    {
+      return false;
+    }
+
+    // Pop one command from the queue
+    auto cmd = std::move(data_->all_commands_.front());
+    data_->all_commands_.erase(data_->all_commands_.begin());
+    // Exit the lock before executing the command
+    lock.unlock();
+    // Execute the command, i.e., send an impulse to the connected sender
+    cmd();
+    // Return true to signal that we started a command
+    return true;
+  }
+
+  //! Run the oldest queued command (commands are appended at the back and taken from the front)
+  //! Blocks if no command registered (i.e., no operation state started)
+  void start_next()
+  {
+    // Wait for a command that we can execute
+    std::unique_lock lock{data_->mutex_};
+    while (data_->all_commands_.empty())
+    {
+      data_->cv_.wait(lock);
+    }
+
+    // Pop one command from the queue
+    auto cmd = std::move(data_->all_commands_.front());
+    data_->all_commands_.erase(data_->all_commands_.begin());
+    // Exit the lock before executing the command
+    lock.unlock();
+    // Execute the command, i.e., send an impulse to the connected sender
+    cmd();
+  }
+
+  sndr_t schedule() const noexcept
+  {
+    return sndr_t{data_.get()};
+  }
+  // NOTE(review): compares equal even when the two schedulers hold different data_ — confirm intended.
+  friend bool operator==(impulse_scheduler, impulse_scheduler) noexcept
+  {
+    return true;
+  }
+
+  friend bool operator!=(impulse_scheduler, impulse_scheduler) noexcept
+  {
+    return false;
+  }
+};
+} // namespace
+
+#endif
diff --git a/cudax/test/async/common/inline_scheduler.cuh b/cudax/test/async/common/inline_scheduler.cuh
new file mode 100755
index 0000000000..1400397b93
--- /dev/null
+++ b/cudax/test/async/common/inline_scheduler.cuh
@@ -0,0 +1,81 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "cuda/experimental/__async/async.cuh"
+
+namespace
+{
+//! Scheduler that returns a sender that always completes inline
+//! (successfully).
+struct inline_scheduler
+{
+private:
+  struct env_t
+  {
+    _CCCL_HOST_DEVICE auto query(cudax_async::get_completion_scheduler_t<cudax_async::set_value_t>) const noexcept
+    {
+      return inline_scheduler{};
+    }
+  };
+  // Operation state: start() calls set_value on the receiver right away, with no arguments.
+  template <class Rcvr>
+  struct opstate_t : cudax_async::__immovable
+  {
+    using operation_state_concept = cudax_async::operation_state_t;
+    using completion_signatures   = cudax_async::completion_signatures<cudax_async::set_value_t()>;
+
+    Rcvr _rcvr;
+
+    _CCCL_HOST_DEVICE void start() noexcept
+    {
+      cudax_async::set_value(static_cast<Rcvr&&>(_rcvr));
+    }
+  };
+  // Sender returned by schedule(); it has no data members and only advertises set_value_t().
+  struct sndr_t
+  {
+    using sender_concept        = cudax_async::sender_t;
+    using completion_signatures = cudax_async::completion_signatures<cudax_async::set_value_t()>;
+
+    template <class Rcvr>
+    _CCCL_HOST_DEVICE opstate_t<Rcvr> connect(Rcvr rcvr) const
+    {
+      return {{}, static_cast<Rcvr&&>(rcvr)};
+    }
+
+    _CCCL_HOST_DEVICE env_t get_env() const noexcept
+    {
+      return {};
+    }
+  };
+  // The scheduler has no data members, so every instance compares equal.
+  _CCCL_HOST_DEVICE friend bool operator==(inline_scheduler, inline_scheduler) noexcept
+  {
+    return true;
+  }
+
+  _CCCL_HOST_DEVICE friend bool operator!=(inline_scheduler, inline_scheduler) noexcept
+  {
+    return false;
+  }
+
+public:
+  using scheduler_concept = cudax_async::scheduler_t;
+
+  inline_scheduler() = default;
+
+  _CCCL_HOST_DEVICE sndr_t schedule() const noexcept
+  {
+    return {};
+  }
+};
+} // namespace
diff --git a/cudax/test/async/common/stopped_scheduler.cuh b/cudax/test/async/common/stopped_scheduler.cuh
new file mode 100755
index 0000000000..006d7a0652
--- /dev/null
+++ b/cudax/test/async/common/stopped_scheduler.cuh
@@ -0,0 +1,85 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include "cuda/experimental/__async/async.cuh"
+namespace
+{
+//! Scheduler that returns a sender that always completes with stopped.
+struct stopped_scheduler
+{
+private:
+  struct env_t
+  {
+    _CCCL_HOST_DEVICE auto query(cudax_async::get_completion_scheduler_t<cudax_async::set_value_t>) const noexcept
+    {
+      return stopped_scheduler{};
+    }
+
+    _CCCL_HOST_DEVICE auto query(cudax_async::get_completion_scheduler_t<cudax_async::set_stopped_t>) const noexcept
+    {
+      return stopped_scheduler{};
+    }
+  };
+  // Operation state: start() calls set_stopped on the receiver right away.
+  template <class Rcvr>
+  struct opstate_t : cudax_async::__immovable
+  {
+    using operation_state_concept = cudax_async::operation_state_t;
+    using completion_signatures   = //
+      cudax_async::completion_signatures<cudax_async::set_value_t(), cudax_async::set_stopped_t()>;
+
+    Rcvr _rcvr;
+
+    _CCCL_HOST_DEVICE void start() noexcept
+    {
+      cudax_async::set_stopped(static_cast<Rcvr&&>(_rcvr));
+    }
+  };
+  // Sender returned by schedule(); set_value_t() is advertised as well, but start() never sends it.
+  struct sndr_t
+  {
+    using sender_concept        = cudax_async::sender_t;
+    using completion_signatures = //
+      cudax_async::completion_signatures<cudax_async::set_value_t(), cudax_async::set_stopped_t()>;
+
+    template <class Rcvr>
+    _CCCL_HOST_DEVICE opstate_t<Rcvr> connect(Rcvr rcvr) const
+    {
+      return {{}, static_cast<Rcvr&&>(rcvr)};
+    }
+
+    _CCCL_HOST_DEVICE env_t get_env() const noexcept
+    {
+      return {};
+    }
+  };
+  // The scheduler has no data members, so every instance compares equal.
+  _CCCL_HOST_DEVICE friend bool operator==(stopped_scheduler, stopped_scheduler) noexcept
+  {
+    return true;
+  }
+
+  _CCCL_HOST_DEVICE friend bool operator!=(stopped_scheduler, stopped_scheduler) noexcept
+  {
+    return false;
+  }
+
+public:
+  using scheduler_concept = cudax_async::scheduler_t;
+
+  stopped_scheduler() = default;
+
+  _CCCL_HOST_DEVICE sndr_t schedule() const noexcept
+  {
+    return {};
+  }
+};
+} // namespace
diff --git a/cudax/test/async/common/utility.cuh b/cudax/test/async/common/utility.cuh
new file mode 100755
index 0000000000..f0867d36f7
--- /dev/null
+++ b/cudax/test/async/common/utility.cuh
@@ -0,0 +1,189 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "cuda/experimental/__async/async.cuh"
+#include "testing.cuh"
+
+//! A move-only type
+struct movable
+{
+  _CCCL_HOST_DEVICE movable(int value)
+      : value_(value)
+  {}
+  // Declaring the move constructor makes the implicit copy constructor and copy assignment deleted.
+  movable(movable&&) = default;
+
+  _CCCL_HOST_DEVICE friend bool operator==(const movable& a, const movable& b) noexcept
+  {
+    return a.value_ == b.value_;
+  }
+
+  _CCCL_HOST_DEVICE friend bool operator!=(const movable& a, const movable& b) noexcept
+  {
+    return a.value_ != b.value_;
+  }
+
+  _CCCL_HOST_DEVICE int value()
+  {
+    return value_;
+  } // silence warning of unused private field
+
+private:
+  int value_;
+};
+
+//! A type with potentially throwing move/copy constructors
+struct potentially_throwing
+{
+  potentially_throwing() = default;
+  // The type has no data members; every operation below is declared noexcept(false) and does nothing else.
+  _CCCL_HOST_DEVICE potentially_throwing(potentially_throwing&&) noexcept(false) {}
+
+  _CCCL_HOST_DEVICE potentially_throwing(const potentially_throwing&) noexcept(false) {}
+
+  _CCCL_HOST_DEVICE potentially_throwing& operator=(potentially_throwing&&) noexcept(false)
+  {
+    return *this;
+  }
+
+  _CCCL_HOST_DEVICE potentially_throwing& operator=(const potentially_throwing&) noexcept(false)
+  {
+    return *this;
+  }
+};
+
+struct string
+{
+  string() = default; // the empty string is represented by a null `str`
+  //! Deep-copies the null-terminated \p c; a null \p c yields an empty string.
+  _CCCL_HOST_DEVICE explicit string(char const* c)
+  {
+    std::size_t len = 0;
+    while (c != nullptr && c[len] != '\0')
+      ++len;
+    char* tmp = str = c ? new char[len + 1] : nullptr;
+    while (tmp != nullptr && (*tmp++ = *c++) != '\0')
+      ;
+  }
+
+  _CCCL_HOST_DEVICE string(string&& other) noexcept
+      : str(other.str)
+  {
+    other.str = nullptr;
+  }
+  //! Copies the characters; copying an empty or moved-from string yields an empty one.
+  _CCCL_HOST_DEVICE string(const string& other)
+      : string(other.str)
+  {}
+
+  _CCCL_HOST_DEVICE ~string()
+  {
+    delete[] str;
+  }
+  //! Character-wise comparison; a null `str` compares like "".
+  _CCCL_HOST_DEVICE friend bool operator==(const string& left, const string& right) noexcept
+  {
+    char const* l = left.str ? left.str : "";
+    char const* r = right.str ? right.str : "";
+    while (*l && *r)
+    {
+      if (*l++ != *r++)
+      {
+        return false;
+      }
+    }
+    return *l == *r;
+  }
+
+  _CCCL_HOST_DEVICE friend bool operator!=(const string& left, const string& right) noexcept
+  {
+    return !(left == right);
+  }
+
+private:
+  char* str{};
+};
+
+struct error_code
+{
+  _CCCL_HOST_DEVICE friend bool operator==(const error_code& left, const error_code& right) noexcept
+  {
+    return left.ec == right.ec;
+  }
+
+  _CCCL_HOST_DEVICE friend bool operator!=(const error_code& left, const error_code& right) noexcept
+  {
+    return !(left == right);
+  }
+  // The wrapped std::errc value; it is the only state that the comparisons look at.
+  std::errc ec;
+};
+
+// run_loop isn't supported on-device yet, so neither can sync_wait be.
+#if !defined(__CUDA_ARCH__)
+
+template <class Sndr, class... Values>
+void check_values(Sndr&& sndr, const Values&... values) noexcept
+{
+  try // any exception escaping the block below is reported as an error completion
+  {
+    auto opt = cudax_async::sync_wait(static_cast<Sndr&&>(sndr));
+    if (!opt) // an empty result is reported as a stopped completion
+    {
+      CUDAX_FAIL("Expected value completion; got stopped instead.");
+    }
+    else
+    {
+      auto&& vals = *opt;
+      CUDAX_CHECK(vals == ::cuda::std::tie(values...)); // compare against a tuple of references
+    }
+  }
+  catch (...)
+  {
+    CUDAX_FAIL("Expected value completion; got error instead.");
+  }
+}
+
+#else // !defined(__CUDA_ARCH__)
+
+template <class Sndr, class... Values>
+void check_values(Sndr&& sndr, const Values&... values) noexcept
+{} // no-op used when __CUDA_ARCH__ is defined, where the sync_wait-based version is compiled out
+
+#endif // !defined(__CUDA_ARCH__)
+
+template <class... Ts>
+using types = cudax_async::__mlist<Ts...>;
+
+template <class... Values, class Sndr>
+_CCCL_HOST_DEVICE void check_value_types(Sndr&&) noexcept
+{
+  using actual_t   = cudax_async::value_types_of_t<Sndr, cudax_async::env<>, types, cudax_async::__mmake_set>;
+  using expected_t = cudax_async::__mmake_set<Values...>;
+  // Compile-time check only: the sender argument is used just for its type.
+  static_assert(cudax_async::__mset_eq<expected_t, actual_t>, "value_types_of_t does not match expected types");
+}
+
+template <class... Errors, class Sndr>
+_CCCL_HOST_DEVICE void check_error_types(Sndr&&) noexcept
+{
+  using actual_t   = cudax_async::error_types_of_t<Sndr, cudax_async::env<>, cudax_async::__mmake_set>;
+  using expected_t = cudax_async::__mmake_set<Errors...>;
+  // Compile-time check only: the sender argument is used just for its type.
+  static_assert(cudax_async::__mset_eq<expected_t, actual_t>, "error_types_of_t does not match expected types");
+}
+
+template <bool SendsStopped, class Sndr>
+_CCCL_HOST_DEVICE void check_sends_stopped(Sndr&&) noexcept
+{ // Compile-time check only: the sender argument is used just for its type.
+  static_assert(cudax_async::sends_stopped<Sndr> == SendsStopped, "sends_stopped does not match expected value");
+}
diff --git a/cudax/test/async/test_conditional.cu b/cudax/test/async/test_conditional.cu
new file mode 100755
index 0000000000..702d97f94b
--- /dev/null
+++ b/cudax/test/async/test_conditional.cu
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2024 NVIDIA Corporation
+ *
+ * Licensed under the Apache License Version 2.0 with LLVM Exceptions
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *   https://llvm.org/LICENSE.txt
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Include this first
+#include <cuda/experimental/__async/async.cuh>
+
+// Then include the test helpers
+#include "common/checked_receiver.cuh"
+#include "common/utility.cuh"
+#include "testing.cuh"
+
+namespace
+{
+TEST_CASE("simple use of conditional runs exactly one of the two closures", "[adaptors][conditional]")
+{
+  for (int i = 42; i < 44; ++i)
+  {
+    bool even{false};
+    bool odd{false};
+    // Expected: an even i runs only the first `then`, an odd i only the second (checked at the end).
+    auto sndr1 =
+      cudax_async::just(i)
+      | cudax_async::conditional(
+        [](int i) {
+          return i % 2 == 0;
+        },
+        cudax_async::then([&](int) {
+          even = true;
+        }),
+        cudax_async::then([&](int) {
+          odd = true;
+        }));
+    // Static expectations: no values, no stopped; the error types differ between host and device.
+    check_value_types<types<>>(sndr1);
+    check_sends_stopped<false>(sndr1);
+    NV_IF_ELSE_TARGET(NV_IS_HOST, //
+                      ({ check_error_types<std::exception_ptr>(sndr1); }),
+                      ({ check_error_types<>(sndr1); }));
+
+    auto op = cudax_async::connect(std::move(sndr1), checked_value_receiver<>{});
+    cudax_async::start(op);
+    // Exactly one flag must be set, matching the parity of i.
+    CUDAX_CHECK(even == (i % 2 == 0));
+    CUDAX_CHECK(odd == (i % 2 == 1));
+  }
+}
+
+} // namespace
diff --git a/cudax/test/async/test_continue_on.cu b/cudax/test/async/test_continue_on.cu
new file mode 100755
index 0000000000..ed6c61631d
--- /dev/null
+++ b/cudax/test/async/test_continue_on.cu
@@ -0,0 +1,220 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include "cuda/experimental/__async/async.cuh"
+
+//
+#include "common/checked_receiver.cuh"
+#include "common/error_scheduler.cuh"
+#include "common/impulse_scheduler.cuh"
+#include "common/inline_scheduler.cuh"
+#include "common/stopped_scheduler.cuh"
+#include "common/utility.cuh"
+#include "testing.cuh"
+
+namespace
+{
+TEST_CASE("continue_on simple example", "[adaptors][continue_on]")
+{
+  auto snd = cudax_async::continue_on(cudax_async::just(13), inline_scheduler{});
+  auto op  = cudax_async::connect(std::move(snd), checked_value_receiver{13});
+  cudax_async::start(op);
+  // The receiver checks if we receive the right value
+}
+
+#if !defined(__CUDA_ARCH__)
+
+TEST_CASE("continue_on can be piped", "[adaptors][continue_on]")
+{
+  // Just continue_on a value to the impulse scheduler
+  bool called{false};
+  auto sched = impulse_scheduler{};
+  auto snd   = cudax_async::just(13) //
+           | cudax_async::continue_on(sched) //
+           | cudax_async::then([&](int val) {
+               called = true;
+               return val;
+             });
+  // Start the operation
+  auto op = cudax_async::connect(std::move(snd), checked_value_receiver{13});
+  cudax_async::start(op);
+
+  // The value will be available when the scheduler will execute the next operation
+  CUDAX_REQUIRE(!called);
+  sched.start_next();
+  CUDAX_REQUIRE(called);
+}
+
+TEST_CASE("continue_on calls the receiver when the scheduler dictates", "[adaptors][continue_on]")
+{
+  bool called{false};
+  impulse_scheduler sched;
+  auto snd = cudax_async::then(cudax_async::continue_on(cudax_async::just(13), sched), [&](int val) {
+    called = true;
+    return val;
+  });
+  auto op  = cudax_async::connect(snd, checked_value_receiver{13});
+  cudax_async::start(op);
+  // Up until this point, the scheduler didn't start any task; no effect expected
+  CUDAX_CHECK(!called);
+
+  // Tell the scheduler to start executing one task
+  sched.start_next();
+  CUDAX_CHECK(called);
+}
+
+TEST_CASE("continue_on calls the given sender when the scheduler dictates", "[adaptors][continue_on]")
+{
+  int counter{0};
+  auto snd_base = cudax_async::just() //
+                | cudax_async::then([&]() -> int {
+                    ++counter;
+                    return 19;
+                  });
+
+  impulse_scheduler sched;
+  auto snd = cudax_async::then(cudax_async::continue_on(std::move(snd_base), sched), [&](int val) {
+    ++counter;
+    return val;
+  });
+  auto op  = cudax_async::connect(std::move(snd), checked_value_receiver{19});
+  cudax_async::start(op);
+  // The base sender has already run, even though the scheduler hasn't been triggered yet
+  CUDAX_CHECK(counter == 1);
+  // ... but didn't send the value to the receiver yet
+
+  // Tell the scheduler to start executing one task
+  sched.start_next();
+
+  // Now the continuation after continue_on has run too, and a value is sent to the receiver
+  CUDAX_CHECK(counter == 2);
+}
+
+TEST_CASE("continue_on works when changing threads", "[adaptors][continue_on]")
+{
+  cudax_async::thread_context thread;
+  bool called{false};
+
+  {
+    // launch some work on the thread context
+    auto snd = cudax_async::continue_on(cudax_async::just(), thread.get_scheduler()) //
+             | cudax_async::then([&] {
+                 called = true;
+               });
+    cudax_async::start_detached(std::move(snd));
+  }
+
+  thread.join();
+
+  // the work should be executed
+  CUDAX_REQUIRE(called);
+}
+
+#endif
+
+TEST_CASE("continue_on can be called with rvalue ref scheduler", "[adaptors][continue_on]")
+{
+  auto snd = cudax_async::continue_on(cudax_async::just(13), inline_scheduler{});
+  auto op  = cudax_async::connect(std::move(snd), checked_value_receiver{13});
+  cudax_async::start(op);
+  // The receiver checks if we receive the right value
+}
+
+TEST_CASE("continue_on can be called with const ref scheduler", "[adaptors][continue_on]")
+{
+  const inline_scheduler sched;
+  auto snd = cudax_async::continue_on(cudax_async::just(13), sched);
+  auto op  = cudax_async::connect(std::move(snd), checked_value_receiver{13});
+  cudax_async::start(op);
+  // The receiver checks if we receive the right value
+}
+
+TEST_CASE("continue_on can be called with ref scheduler", "[adaptors][continue_on]")
+{
+  inline_scheduler sched;
+  auto snd = cudax_async::continue_on(cudax_async::just(13), sched);
+  auto op  = cudax_async::connect(std::move(snd), checked_value_receiver{13});
+  cudax_async::start(op);
+  // The receiver checks if we receive the right value
+}
+
+TEST_CASE("continue_on forwards set_error calls", "[adaptors][continue_on]")
+{
+  auto ec = error_code{std::errc::invalid_argument};
+  error_scheduler<error_code> sched{ec};
+  auto snd = cudax_async::continue_on(cudax_async::just(13), sched);
+  auto op  = cudax_async::connect(std::move(snd), checked_error_receiver{ec});
+  cudax_async::start(op);
+  // The receiver checks if we receive an error
+}
+
+TEST_CASE("continue_on forwards set_error calls of other types", "[adaptors][continue_on]")
+{
+  error_scheduler<string> sched{string{"error"}};
+  auto snd = cudax_async::continue_on(cudax_async::just(13), sched);
+  auto op  = cudax_async::connect(std::move(snd), checked_error_receiver{string{"error"}});
+  cudax_async::start(op);
+  // The receiver checks if we receive an error
+}
+
+TEST_CASE("continue_on forwards set_stopped calls", "[adaptors][continue_on]")
+{
+  stopped_scheduler sched{};
+  auto snd = cudax_async::continue_on(cudax_async::just(13), sched);
+  auto op  = cudax_async::connect(std::move(snd), checked_stopped_receiver{});
+  cudax_async::start(op);
+  // The receiver checks if we receive the stopped signal
+}
+
+TEST_CASE("continue_on has the values_type corresponding to the given values", "[adaptors][continue_on]")
+{
+  inline_scheduler sched{};
+
+  check_value_types<types<int>>(cudax_async::continue_on(cudax_async::just(1), sched));
+  check_value_types<types<int, double>>(cudax_async::continue_on(cudax_async::just(3, 0.14), sched));
+  check_value_types<types<int, double, string>>(
+    cudax_async::continue_on(cudax_async::just(3, 0.14, string{"pi"}), sched));
+}
+
+TEST_CASE("continue_on keeps error_types from scheduler's sender", "[adaptors][continue_on]")
+{
+  inline_scheduler sched1{};
+  error_scheduler<std::error_code> sched2{std::make_error_code(std::errc::invalid_argument)};
+  error_scheduler<int> sched3{43};
+
+  check_error_types<>(cudax_async::continue_on(cudax_async::just(1), sched1));
+  check_error_types<std::error_code>(cudax_async::continue_on(cudax_async::just(2), sched2));
+  check_error_types<int>(cudax_async::continue_on(cudax_async::just(3), sched3));
+}
+
+TEST_CASE("continue_on sends an exception_ptr if value types are potentially throwing when copied",
+          "[adaptors][continue_on]")
+{
+  inline_scheduler sched{};
+
+#if !defined(__CUDA_ARCH__)
+  check_error_types<std::exception_ptr>(cudax_async::continue_on(cudax_async::just(potentially_throwing{}), sched));
+#else
+  // No exceptions in device code:
+  check_error_types<>(cudax_async::continue_on(cudax_async::just(potentially_throwing{}), sched));
+#endif
+}
+
+TEST_CASE("continue_on keeps sends_stopped from scheduler's sender", "[adaptors][continue_on]")
+{
+  inline_scheduler sched1{};
+  error_scheduler<error_code> sched2{error_code{std::errc::invalid_argument}};
+  stopped_scheduler sched3{};
+
+  check_sends_stopped<false>(cudax_async::continue_on(cudax_async::just(1), sched1));
+  check_sends_stopped<true>(cudax_async::continue_on(cudax_async::just(2), sched2));
+  check_sends_stopped<true>(cudax_async::continue_on(cudax_async::just(3), sched3));
+}
+} // namespace
diff --git a/cudax/test/async/test_just.cu b/cudax/test/async/test_just.cu
new file mode 100755
index 0000000000..bc54274cbe
--- /dev/null
+++ b/cudax/test/async/test_just.cu
@@ -0,0 +1,18 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/experimental/__async/async.cuh>
+
+#include "testing.cuh"
+
+TEST_CASE("this is a dummy test", "[just]")
+{
+  CUDAX_REQUIRE(1 == 1); // placeholder: always true, exercises nothing from cudax_async::just
+}
diff --git a/cudax/test/async/test_sequence.cu b/cudax/test/async/test_sequence.cu
new file mode 100755
index 0000000000..216b8e9f33
--- /dev/null
+++ b/cudax/test/async/test_sequence.cu
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2024 NVIDIA Corporation
+ *
+ * Licensed under the Apache License Version 2.0 with LLVM Exceptions
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *   https://llvm.org/LICENSE.txt
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Include this first
+#include <cuda/experimental/__async/async.cuh>
+
+// Then include the test helpers
+#include "common/checked_receiver.cuh"
+#include "common/utility.cuh"
+#include "testing.cuh"
+#include <nv/target>
+
+namespace
+{
+TEST_CASE("simple use of sequence executes both child operations", "[adaptors][sequence]")
+{
+  bool first_ran{false};
+  bool second_ran{false};
+
+  // Two value-less senders; each continuation records that it ran.
+  auto seq = cudax_async::sequence(
+    cudax_async::just() | cudax_async::then([&first_ran] {
+      first_ran = true;
+    }),
+    cudax_async::just() | cudax_async::then([&second_ran] {
+      second_ran = true;
+    }));
+  // Completion-signature checks: empty value pack, no stopped, exception_ptr error on host only.
+  check_value_types<types<>>(seq);
+  check_sends_stopped<false>(seq);
+  NV_IF_ELSE_TARGET(NV_IS_HOST, //
+                    ({ check_error_types<std::exception_ptr>(seq); }),
+                    ({ check_error_types<>(seq); }));
+  auto opstate = cudax_async::connect(std::move(seq), checked_value_receiver<>{});
+  cudax_async::start(opstate);
+  // The test expects both continuations to have run by the time start() returns.
+  CUDAX_CHECK(first_ran);
+  CUDAX_CHECK(second_ran);
+}
+
+} // namespace
diff --git a/cudax/test/async/test_when_all.cu b/cudax/test/async/test_when_all.cu
new file mode 100755
index 0000000000..9c71dfce07
--- /dev/null
+++ b/cudax/test/async/test_when_all.cu
@@ -0,0 +1,265 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/experimental/__async/async.cuh>
+
+#include "common/checked_receiver.cuh"
+#include "common/error_scheduler.cuh"
+#include "common/impulse_scheduler.cuh"
+#include "common/stopped_scheduler.cuh"
+#include "common/utility.cuh"
+#include "testing.cuh"
+
+namespace
+{
+TEST_CASE("when_all simple example", "[when_all]")
+{
+  auto add = [](int lhs, double rhs) {
+    return lhs + rhs;
+  };
+  auto joined  = cudax_async::when_all(cudax_async::just(3), cudax_async::just(0.1415));
+  auto opstate = cudax_async::connect(std::move(joined) | cudax_async::then(add), checked_value_receiver{3.1415});
+  cudax_async::start(opstate); // the receiver is constructed with the expected sum of both children's values
+}
+
+TEST_CASE("when_all returning two values can be waited on", "[when_all]")
+{
+  // Expected values are listed in child order: 2 from the first sender, 3 from the second.
+  check_values(cudax_async::when_all(cudax_async::just(2), cudax_async::just(3)), 2, 3);
+}
+
+TEST_CASE("when_all with 5 senders", "[when_all]")
+{
+  auto five = cudax_async::when_all(
+    cudax_async::just(2), cudax_async::just(3), cudax_async::just(5), cudax_async::just(7), cudax_async::just(11));
+  check_values(std::move(five), 2, 3, 5, 7, 11); // one expected value per child, in child order
+}
+
+TEST_CASE("when_all with just one sender", "[when_all]")
+{
+  // A single child: the only expected value is the one that child sends.
+  check_values(cudax_async::when_all(cudax_async::just(2)), 2);
+}
+
+TEST_CASE("when_all with move-only types", "[when_all]")
+{
+  auto move_only_sndr = cudax_async::when_all(cudax_async::just(movable(2)));
+  check_values(std::move(move_only_sndr), movable(2)); // the sender is handed over as an rvalue
+}
+
+TEST_CASE("when_all with no senders", "[when_all]")
+{
+  // No children and no expected values: only the sender itself is passed to check_values.
+  check_values(cudax_async::when_all());
+}
+
+TEST_CASE("when_all when one sender sends void", "[when_all]")
+{
+  // just() sends no values, so the only expected value is the 2 from the first child.
+  check_values(cudax_async::when_all(cudax_async::just(2), cudax_async::just()), 2);
+}
+
+#if !defined(__CUDA_ARCH__)
+
+TEST_CASE("when_all completes when children complete", "[when_all]")
+{
+  impulse_scheduler impulses;
+  bool then_ran{false};
+  auto sum_sndr = cudax_async::when_all(cudax_async::just(11) | cudax_async::continue_on(impulses),
+                                        cudax_async::just(13) | cudax_async::continue_on(impulses),
+                                        cudax_async::just(17) | cudax_async::continue_on(impulses))
+                | cudax_async::then([&](int a, int b, int c) {
+                    then_ran = true;
+                    return a + b + c;
+                  });
+  auto opstate = cudax_async::connect(std::move(sum_sndr), checked_value_receiver{41});
+  cudax_async::start(opstate);
+  // The when_all sender is expected to complete only after 3 impulses (one per
+  // child); before each impulse the `then` continuation must not have run yet.
+  for (int fired = 0; fired < 3; ++fired)
+  {
+    CUDAX_CHECK_FALSE(then_ran);
+    impulses.start_next();
+  }
+  CUDAX_CHECK(then_ran);
+}
+
+#endif
+
+TEST_CASE("when_all can be used with just_*", "[when_all]")
+{
+  auto mixed = cudax_async::when_all(cudax_async::just(2), cudax_async::just_error(42), cudax_async::just_stopped());
+  auto opstate = cudax_async::connect(std::move(mixed), checked_error_receiver{42});
+  cudax_async::start(opstate); // the receiver is built to expect the error 42 from the just_error child
+}
+
+TEST_CASE("when_all terminates with error if one child terminates with error", "[when_all]")
+{
+  error_scheduler failing_sched{42}; // constructed with the error value the receiver below expects
+  auto sndr = cudax_async::when_all(
+    cudax_async::just(2), cudax_async::just(5) | cudax_async::continue_on(failing_sched), cudax_async::just(7));
+  auto opstate = cudax_async::connect(std::move(sndr), checked_error_receiver{42});
+  cudax_async::start(opstate);
+}
+
+TEST_CASE("when_all terminates with stopped if one child is cancelled", "[when_all]")
+{
+  stopped_scheduler cancelling_sched; // only the middle child is moved onto this scheduler
+  auto sndr = cudax_async::when_all(
+    cudax_async::just(2), cudax_async::just(5) | cudax_async::continue_on(cancelling_sched), cudax_async::just(7));
+  auto opstate = cudax_async::connect(std::move(sndr), checked_stopped_receiver{});
+  cudax_async::start(opstate);
+}
+
+#if !defined(__CUDA_ARCH__)
+
+TEST_CASE("when_all cancels remaining children if error is detected", "[when_all]")
+{
+  impulse_scheduler impulses;
+  error_scheduler failing_sched{42};
+  bool first_ran{false};
+  bool third_ran{false};
+  bool third_stopped{false};
+  auto sndr = cudax_async::when_all(
+    cudax_async::start_on(impulses, cudax_async::just()) | cudax_async::then([&] {
+      first_ran = true;
+    }),
+    cudax_async::start_on(impulses, cudax_async::just(5) | cudax_async::continue_on(failing_sched)),
+    cudax_async::start_on(impulses, cudax_async::just()) | cudax_async::then([&] {
+      third_ran = true;
+    }) | cudax_async::let_stopped([&] {
+      third_stopped = true;
+      return cudax_async::just();
+    }));
+  auto opstate = cudax_async::connect(std::move(sndr), checked_error_receiver{42});
+  cudax_async::start(opstate);
+  // Expected outcome: child 1 runs to completion, child 2 fails, child 3 is cancelled.
+  CUDAX_CHECK_FALSE(first_ran);
+  CUDAX_CHECK_FALSE(third_ran);
+  impulses.start_next(); // impulse 1: releases the first child
+  CUDAX_CHECK(first_ran);
+  impulses.start_next(); // impulse 2: releases the second child, which produces the error
+  CUDAX_CHECK_FALSE(third_ran);
+  impulses.start_next(); // impulse 3: releases the third child
+  CUDAX_CHECK_FALSE(third_ran);
+  CUDAX_CHECK(third_stopped);
+}
+
+TEST_CASE("when_all cancels remaining children if cancel is detected", "[when_all]")
+{
+  stopped_scheduler cancelling_sched;
+  impulse_scheduler impulses;
+  bool first_ran{false};
+  bool third_ran{false};
+  bool third_stopped{false};
+  auto sndr = cudax_async::when_all(
+    cudax_async::start_on(impulses, cudax_async::just()) | cudax_async::then([&] {
+      first_ran = true;
+    }),
+    cudax_async::start_on(impulses, cudax_async::just(5) | cudax_async::continue_on(cancelling_sched)),
+    cudax_async::start_on(impulses, cudax_async::just()) | cudax_async::then([&] {
+      third_ran = true;
+    }) | cudax_async::let_stopped([&] {
+      third_stopped = true;
+      return cudax_async::just();
+    }));
+  auto opstate = cudax_async::connect(std::move(sndr), checked_stopped_receiver{});
+  cudax_async::start(opstate);
+  // Expected outcome: child 1 runs to completion, child 2 is stopped, child 3 is cancelled.
+  CUDAX_CHECK_FALSE(first_ran);
+  CUDAX_CHECK_FALSE(third_ran);
+  impulses.start_next(); // impulse 1: releases the first child
+  CUDAX_CHECK(first_ran);
+  impulses.start_next(); // impulse 2: releases the second child, which calls set_stopped
+  CUDAX_CHECK_FALSE(third_ran);
+  impulses.start_next(); // impulse 3: releases the third child
+  CUDAX_CHECK_FALSE(third_ran);
+  CUDAX_CHECK(third_stopped);
+}
+
+#endif
+
+template <class... Values>
+struct just_ref // test-only sender whose single declared completion is set_value_t(Values&...)
+{
+  using sender_concept        = cudax_async::sender_t;
+  using completion_signatures = cudax_async::completion_signatures<cudax_async::set_value_t(Values&...)>;
+  _CCCL_HOST_DEVICE just_ref connect(cudax_async::__ignore) const // the argument is ignored
+  {
+    return just_ref{};
+  }
+};
+
+TEST_CASE("when_all has the values_type based on the children, decayed and as rvalue "
+          "references",
+          "[when_all]")
+{
+  namespace ex = cudax_async; // shorthand used only inside this test
+
+  // A single child: the expected value types are that child's value types.
+  check_value_types<types<int>>(ex::when_all(ex::just(13)));
+  check_value_types<types<double>>(ex::when_all(ex::just(3.14)));
+  check_value_types<types<int, double>>(ex::when_all(ex::just(3, 0.14)));
+
+  check_value_types<types<>>(ex::when_all(ex::just()));
+
+  // Several children: their value types appear one after another, in child order.
+  check_value_types<types<int, double>>(ex::when_all(ex::just(3), ex::just(0.14)));
+  check_value_types<types<int, double, int, double>>(ex::when_all(ex::just(3), ex::just(0.14), ex::just(1, 0.4142)));
+
+  // A child that sends no values contributes nothing to the combined pack.
+  check_value_types<types<int, double>>(ex::when_all(ex::just(3), ex::just(), ex::just(0.14)));
+
+  // A child without any value completion leaves when_all without a value completion.
+  check_value_types<>(ex::when_all(ex::just(3), ex::just_stopped(), ex::just(0.14)));
+
+  // Children that send lvalue references have those references decayed.
+  check_value_types<types<int, double>>(ex::when_all(just_ref<int>(), just_ref<double>()));
+}
+
+TEST_CASE("when_all has the error_types based on the children", "[when_all]")
+{
+  namespace ex = cudax_async; // shorthand used only inside this test
+
+  // A single child: the expected error types are that child's (none for just()).
+  check_error_types<int>(ex::when_all(ex::just_error(13)));
+  check_error_types<double>(ex::when_all(ex::just_error(3.14)));
+  check_error_types<>(ex::when_all(ex::just()));
+
+  // Several failing children: every child's error type is expected.
+  check_error_types<int, double>(ex::when_all(ex::just_error(3), ex::just_error(0.14)));
+  check_error_types<int, double, string>(
+    ex::when_all(ex::just_error(3), ex::just_error(0.14), ex::just_error(string{"err"})));
+
+  // Mixed children: only the just_error child's type is expected in the error list.
+  check_error_types<error_code>(
+    ex::when_all(ex::just(13), ex::just_error(error_code{std::errc::invalid_argument}), ex::just_stopped()));
+
+#if defined(__CUDA_ARCH__)
+  // In device code there are no exceptions, so no error type is expected.
+  check_error_types<>(ex::when_all(just_ref<potentially_throwing>()));
+#else
+  // If the child sends something with a potentially throwing decay-copy,
+  // when_all is expected to have an exception_ptr error completion.
+  check_error_types<std::exception_ptr>(ex::when_all(just_ref<potentially_throwing>()));
+#endif
+}
+
+TEST_CASE("when_all has the sends_stopped == true", "[when_all]")
+{
+  namespace ex = cudax_async; // shorthand used only inside this test
+  // Every case expects sends_stopped to be true, whatever the children complete with.
+  check_sends_stopped<true>(ex::when_all(ex::just(13)));
+  check_sends_stopped<true>(ex::when_all(ex::just_error(-1)));
+  check_sends_stopped<true>(ex::when_all(ex::just_stopped()));
+  check_sends_stopped<true>(ex::when_all(ex::just(3), ex::just(0.14)));
+  check_sends_stopped<true>(ex::when_all(ex::just(3), ex::just_error(-1), ex::just_stopped()));
+}
+} // namespace
diff --git a/cudax/test/common/testing.cuh b/cudax/test/common/testing.cuh
index 64f8635b29..f70ffe0b9c 100644
--- a/cudax/test/common/testing.cuh
+++ b/cudax/test/common/testing.cuh
@@ -20,7 +20,12 @@
 #include <catch2/catch.hpp>
 #include <nv/target>
 
-namespace cudax = cuda::experimental; // NOLINT: misc-unused-alias-decls
+namespace cuda::experimental::__async
+{
+} // intentionally empty: declares the namespace so the cudax_async alias below compiles on its own
+
+namespace cudax       = cuda::experimental; // NOLINT: misc-unused-alias-decls
+namespace cudax_async = cuda::experimental::__async; // NOLINT: misc-unused-alias-decls
 
 #define CUDART(call) REQUIRE((call) == cudaSuccess)