From 4a32b1c28ba24dcfe56ef7bbe7f3990e06dd1f9a Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Fri, 6 Sep 2024 07:22:50 -0700 Subject: [PATCH] [CUDAX] add a small c++17 implementation of `std::execution` (aka P2300) (#2301) --- .clang-format | 3 + cudax/cmake/cudaxHeaderTesting.cmake | 6 + cudax/cmake/header_test.in.cu | 4 +- .../cuda/experimental/__async/async.cuh | 48 ++ .../experimental/__async/basic_sender.cuh | 255 ++++++ .../__async/completion_signatures.cuh | 336 ++++++++ .../cuda/experimental/__async/conditional.cuh | 238 ++++++ .../cuda/experimental/__async/config.cuh | 46 ++ .../cuda/experimental/__async/continue_on.cuh | 288 +++++++ .../cuda/experimental/__async/cpos.cuh | 220 +++++ .../include/cuda/experimental/__async/env.cuh | 192 +++++ .../cuda/experimental/__async/epilogue.cuh | 16 + .../cuda/experimental/__async/exception.cuh | 46 ++ .../cuda/experimental/__async/fwd_rcvr.cuh | 71 ++ .../cuda/experimental/__async/just.cuh | 134 ++++ .../cuda/experimental/__async/just_from.cuh | 163 ++++ .../cuda/experimental/__async/lazy.cuh | 158 ++++ .../cuda/experimental/__async/let_value.cuh | 326 ++++++++ .../cuda/experimental/__async/meta.cuh | 753 ++++++++++++++++++ .../cuda/experimental/__async/prologue.cuh | 20 + .../cuda/experimental/__async/queries.cuh | 167 ++++ .../cuda/experimental/__async/rcvr_ref.cuh | 51 ++ .../experimental/__async/rcvr_with_env.cuh | 141 ++++ .../cuda/experimental/__async/read_env.cuh | 155 ++++ .../cuda/experimental/__async/run_loop.cuh | 274 +++++++ .../cuda/experimental/__async/sequence.cuh | 151 ++++ .../experimental/__async/start_detached.cuh | 104 +++ .../cuda/experimental/__async/start_on.cuh | 150 ++++ .../cuda/experimental/__async/stop_token.cuh | 488 ++++++++++++ .../cuda/experimental/__async/sync_wait.cuh | 207 +++++ .../cuda/experimental/__async/then.cuh | 303 +++++++ .../cuda/experimental/__async/thread.cuh | 85 ++ .../experimental/__async/thread_context.cuh | 73 ++ .../cuda/experimental/__async/tuple.cuh | 104 +++ .../cuda/experimental/__async/type_traits.cuh | 258 ++++++ .../cuda/experimental/__async/utility.cuh | 208 +++++ .../cuda/experimental/__async/variant.cuh | 192 +++++ .../cuda/experimental/__async/when_all.cuh | 650 +++++++++++++++ .../cuda/experimental/__async/write_env.cuh | 118 +++ cudax/test/CMakeLists.txt | 12 + cudax/test/async/common/checked_receiver.cuh | 125 +++ cudax/test/async/common/error_scheduler.cuh | 101 +++ cudax/test/async/common/impulse_scheduler.cuh | 200 +++++ cudax/test/async/common/inline_scheduler.cuh | 81 ++ cudax/test/async/common/stopped_scheduler.cuh | 85 ++ cudax/test/async/common/utility.cuh | 189 +++++ cudax/test/async/test_conditional.cu | 61 ++ cudax/test/async/test_continue_on.cu | 220 +++++ cudax/test/async/test_just.cu | 18 + cudax/test/async/test_sequence.cu | 54 ++ cudax/test/async/test_when_all.cu | 265 ++++++ cudax/test/common/testing.cuh | 7 +- 52 files changed, 8618 insertions(+), 2 deletions(-) create mode 100644 cudax/include/cuda/experimental/__async/async.cuh create mode 100644 cudax/include/cuda/experimental/__async/basic_sender.cuh create mode 100644 cudax/include/cuda/experimental/__async/completion_signatures.cuh create mode 100644 cudax/include/cuda/experimental/__async/conditional.cuh create mode 100644 cudax/include/cuda/experimental/__async/config.cuh create mode 100644 cudax/include/cuda/experimental/__async/continue_on.cuh create mode 100644 cudax/include/cuda/experimental/__async/cpos.cuh create mode 100644 cudax/include/cuda/experimental/__async/env.cuh create mode 100644 cudax/include/cuda/experimental/__async/epilogue.cuh create mode 100644 cudax/include/cuda/experimental/__async/exception.cuh create mode 100644 cudax/include/cuda/experimental/__async/fwd_rcvr.cuh create mode 100644 cudax/include/cuda/experimental/__async/just.cuh create mode 100644 cudax/include/cuda/experimental/__async/just_from.cuh create mode 100644 cudax/include/cuda/experimental/__async/lazy.cuh create mode 100644 cudax/include/cuda/experimental/__async/let_value.cuh create mode 100644 cudax/include/cuda/experimental/__async/meta.cuh create mode 100644 cudax/include/cuda/experimental/__async/prologue.cuh create mode 100644 cudax/include/cuda/experimental/__async/queries.cuh create mode 100644 cudax/include/cuda/experimental/__async/rcvr_ref.cuh create mode 100644 cudax/include/cuda/experimental/__async/rcvr_with_env.cuh create mode 100644 cudax/include/cuda/experimental/__async/read_env.cuh create mode 100644 cudax/include/cuda/experimental/__async/run_loop.cuh create mode 100644 cudax/include/cuda/experimental/__async/sequence.cuh create mode 100644 cudax/include/cuda/experimental/__async/start_detached.cuh create mode 100644 cudax/include/cuda/experimental/__async/start_on.cuh create mode 100644 cudax/include/cuda/experimental/__async/stop_token.cuh create mode 100644 cudax/include/cuda/experimental/__async/sync_wait.cuh create mode 100644 cudax/include/cuda/experimental/__async/then.cuh create mode 100644 cudax/include/cuda/experimental/__async/thread.cuh create mode 100644 cudax/include/cuda/experimental/__async/thread_context.cuh create mode 100644 cudax/include/cuda/experimental/__async/tuple.cuh create mode 100644 cudax/include/cuda/experimental/__async/type_traits.cuh create mode 100644 cudax/include/cuda/experimental/__async/utility.cuh create mode 100644 cudax/include/cuda/experimental/__async/variant.cuh create mode 100644 cudax/include/cuda/experimental/__async/when_all.cuh create mode 100644 cudax/include/cuda/experimental/__async/write_env.cuh create mode 100755 cudax/test/async/common/checked_receiver.cuh create mode 100755 cudax/test/async/common/error_scheduler.cuh create mode 100755 cudax/test/async/common/impulse_scheduler.cuh create mode 100755 cudax/test/async/common/inline_scheduler.cuh create mode 100755 cudax/test/async/common/stopped_scheduler.cuh create mode 100755 cudax/test/async/common/utility.cuh create mode 100755 cudax/test/async/test_conditional.cu create mode 100755 cudax/test/async/test_continue_on.cu create mode 100755 cudax/test/async/test_just.cu create mode 100755 cudax/test/async/test_sequence.cu create mode 100755 cudax/test/async/test_when_all.cu diff --git a/.clang-format b/.clang-format index 974957913f..6cba3dca4b 100644 --- a/.clang-format +++ b/.clang-format @@ -90,6 +90,9 @@ IfMacros: [ IndentWrappedFunctionNames: false IncludeBlocks: Regroup IncludeCategories: + - Regex: '^' + Priority: 0x7FFFFFFF + SortPriority: 0x7FFFFFFF - Regex: '^<(cuda/std/detail/__config|cub/config.cuh|thrust/detail/config.h|thrust/system/cuda/config.h)' Priority: 0 SortPriority: 0 diff --git a/cudax/cmake/cudaxHeaderTesting.cmake b/cudax/cmake/cudaxHeaderTesting.cmake index 29a3bd58ca..824b1a4fda 100644 --- a/cudax/cmake/cudaxHeaderTesting.cmake +++ b/cudax/cmake/cudaxHeaderTesting.cmake @@ -14,6 +14,12 @@ file(GLOB_RECURSE headers "${cudax_SOURCE_DIR}/include/*.h" ) +# The following internal headers are not required to compile independently: +list(REMOVE_ITEM headers + "cuda/experimental/__async/prologue.cuh" + "cuda/experimental/__async/epilogue.cuh" +) + set(headertest_srcs) foreach (header IN LISTS headers) set(headertest_src "headers/${header}.cu") diff --git a/cudax/cmake/header_test.in.cu b/cudax/cmake/header_test.in.cu index 771ca319db..fd2df1987d 100644 --- a/cudax/cmake/header_test.in.cu +++ b/cudax/cmake/header_test.in.cu @@ -34,7 +34,9 @@ #define I CUDAX_MACRO_CHECK('I', complex.h) // windows.h conflicts -#define small CUDAX_MACRO_CHECK('small', windows.h) +// @eniebler 2024-08-30: This test is disabled because it causes build +// failures in some configurations. +// #define small CUDAX_MACRO_CHECK('small', windows.h) // We can't enable these checks without breaking some builds -- some standard // library implementations unconditionally `#undef` these macros, which then // causes random failures later. diff --git a/cudax/include/cuda/experimental/__async/async.cuh b/cudax/include/cuda/experimental/__async/async.cuh new file mode 100644 index 0000000000..ed53717bca --- /dev/null +++ b/cudax/include/cuda/experimental/__async/async.cuh @@ -0,0 +1,48 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __CUDAX_ASYNC_DETAIL_ASYNC +#define __CUDAX_ASYNC_DETAIL_ASYNC + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// Include this first +#include + +// Include the other implementation headers: +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#endif // __CUDAX_ASYNC_DETAIL_ASYNC diff --git a/cudax/include/cuda/experimental/__async/basic_sender.cuh b/cudax/include/cuda/experimental/__async/basic_sender.cuh new file mode 100644 index 0000000000..5730078ecc --- /dev/null +++ b/cudax/include/cuda/experimental/__async/basic_sender.cuh @@ -0,0 +1,255 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __CUDAX_ASYNC_DETAIL_BASIC_SENDER +#define __CUDAX_ASYNC_DETAIL_BASIC_SENDER + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include +#include +#include +#include + +#include + +namespace cuda::experimental::__async +{ +template +struct __state +{ + _Data __data_; + _Rcvr __receiver_; +}; + +struct receiver_defaults +{ + using receiver_concept = __async::receiver_t; + + template + _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE static auto set_value(__ignore, _Rcvr& __rcvr, _Args&&... __args) noexcept + -> __async::completion_signatures<__async::set_value_t(_Args...)> + { + __async::set_value(static_cast<_Rcvr&&>(__rcvr), static_cast<_Args&&>(__args)...); + return {}; + } + + template + _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE static auto set_error(__ignore, _Rcvr& __rcvr, _Error&& __error) noexcept + -> __async::completion_signatures<__async::set_error_t(_Error)> + { + __async::set_error(static_cast<_Rcvr&&>(__rcvr), static_cast<_Error&&>(__error)); + return {}; + } + + template + _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE static auto + set_stopped(__ignore, _Rcvr& __rcvr) noexcept -> __async::completion_signatures<__async::set_stopped_t()> + { + __async::set_stopped(static_cast<_Rcvr&&>(__rcvr)); + return {}; + } + + template + _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE static decltype(auto) get_env(__ignore, const _Rcvr& __rcvr) noexcept + { + return __async::get_env(__rcvr); + } +}; + +template +struct basic_receiver +{ + using receiver_concept = __async::receiver_t; + using __rcvr_t = typename _Data::receiver_tag; + __state<_Data, _Rcvr>& __state_; + + template + _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void set_value(_Args&&... __args) noexcept + { + __rcvr_t::set_value(__state_.__data_, __state_.__receiver_, (_Args&&) __args...); + } + + template + _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void set_error(_Error&& __error) noexcept + { + __rcvr_t::set_error(__state_.__data_, __state_.__receiver_, (_Error&&) __error); + } + + _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void set_stopped() noexcept + { + __rcvr_t::set_stopped(__state_.__data_, __state_.__receiver_); + } + + _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE decltype(auto) get_env() const noexcept + { + return __rcvr_t::get_env(__state_.__data_, __state_.__receiver_); + } +}; + +template +_CCCL_INLINE_VAR constexpr bool has_no_environment = _CUDA_VSTD::is_same_v<_Rcvr, receiver_archetype>; + +template +struct __mk_completions +{ + using __rcvr_t = typename _Data::receiver_tag; + + template + using __set_value_t = + decltype(+*__rcvr_t::set_value(__declval<_Data&>(), __declval(), __declval<_Args>()...)); + + template + using __set_error_t = + decltype(+*__rcvr_t::set_error(__declval<_Data&>(), __declval(), __declval<_Error>())); + + using __set_stopped_t = __async::completion_signatures<>; +}; + +template +struct __mk_completions : __mk_completions +{ + using __rcvr_t = typename _Data::receiver_tag; + + using __set_stopped_t = decltype(+*__rcvr_t::set_stopped(__declval<_Data&>(), __declval())); +}; + +template +using __ignore_value_signature = __async::completion_signatures<>; + +template +using __ignore_error_signature = __async::completion_signatures<>; + +template +constexpr bool __has_stopped = + !_CUDA_VSTD::is_same_v<__async::completion_signatures<>, + __async::transform_completion_signatures<_Completions, + __async::completion_signatures<>, + __ignore_value_signature, + __ignore_error_signature>>; + +template +void set_current_exception_if([[maybe_unused]] _Rcvr& __rcvr) noexcept +{ + if constexpr (_PotentiallyThrowing) + { + __async::set_error(static_cast<_Rcvr&&>(__rcvr), ::std::current_exception()); + } +} + +// A generic type that holds the data for an async operation, and +// that provides a `start` method for enqueuing the work. +template +struct __basic_opstate +{ + using __rcvr_t = basic_receiver<_Data, _Rcvr>; + using __completions_t = completion_signatures_of_t<_Sndr, __rcvr_t>; + using __traits_t = __mk_completions<__has_stopped<__completions_t>, _Data, _Rcvr>; + + using completion_signatures = // + transform_completion_signatures<__completions_t, + // TODO: add set_error_t(exception_ptr) if constructing + // the state or connecting the sender is potentially throwing. + __async::completion_signatures<>, + __traits_t::template __set_value_t, + __traits_t::template __set_error_t, + typename __traits_t::__set_stopped_t>; + + _CCCL_HOST_DEVICE __basic_opstate(_Sndr&& __sndr, _Data __data, _Rcvr __rcvr) + : __state_{static_cast<_Data&&>(__data), static_cast<_Rcvr&&>(__rcvr)} + , __op_(__async::connect(static_cast<_Sndr&&>(__sndr), __rcvr_t{__state_})) + {} + + _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void start() noexcept + { + __async::start(__op_); + } + + __state<_Data, _Rcvr> __state_; + __async::connect_result_t<_Sndr, __rcvr_t> __op_; +}; + +template +_CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto __make_opstate(_Sndr __sndr, _Rcvr __rcvr) +{ + auto [__tag, __data, __child] = static_cast<_Sndr&&>(__sndr); + using __data_t = decltype(__data); + using __child_t = decltype(__child); + (void) __tag; + return __basic_opstate( + static_cast<__child_t&&>(__child), static_cast<__data_t&&>(__data), static_cast<_Rcvr&&>(__rcvr)); +} + +template +_CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto +__get_attrs(int, const _Data& __data, const _Sndrs&... __sndrs) noexcept -> decltype(__data.get_attrs(__sndrs...)) +{ + return __data.get_attrs(__sndrs...); +} + +template +_CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto +__get_attrs(long, const _Data&, const _Sndrs&... __sndrs) noexcept -> decltype(__async::get_env(__sndrs...)) +{ + return __async::get_env(__sndrs...); +} + +template +struct basic_sender; + +template +struct basic_sender<_Data, _Sndr> +{ + using sender_concept = __async::sender_t; + using __tag_t = typename _Data::sender_tag; + using __rcvr_t = typename _Data::receiver_tag; + + _CCCL_NO_UNIQUE_ADDRESS __tag_t __tag_; + _Data __data_; + _Sndr __sndr_; + + // Connect the sender to the receiver (the continuation) and + // return the state_type object for this operation. + template + _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto connect(_Rcvr __rcvr) && + { + return __make_opstate(static_cast(*this), static_cast<_Rcvr&&>(__rcvr)); + } + + template + _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto connect(_Rcvr __rcvr) const& + { + return __make_opstate(*this, static_cast<_Rcvr&&>(__rcvr)); + } + + _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE decltype(auto) get_env() const noexcept + { + return __async::__get_attrs(0, __data_, __sndr_); + } +}; + +template +basic_sender(__ignore, _Data, _Sndrs...) -> basic_sender<_Data, _Sndrs...>; + +} // namespace cuda::experimental::__async + +#include + +#endif diff --git a/cudax/include/cuda/experimental/__async/completion_signatures.cuh b/cudax/include/cuda/experimental/__async/completion_signatures.cuh new file mode 100644 index 0000000000..c4edf4b618 --- /dev/null +++ b/cudax/include/cuda/experimental/__async/completion_signatures.cuh @@ -0,0 +1,336 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __CUDAX_ASYNC_DETAIL_COMPLETION_SIGNATURES +#define __CUDAX_ASYNC_DETAIL_COMPLETION_SIGNATURES + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include +#include + +#include + +namespace cuda::experimental::__async +{ +// A typelist for completion signatures +template +struct completion_signatures +{}; + +// A metafunction to determine if a type is a completion signature +template +_CCCL_INLINE_VAR constexpr bool __is_valid_signature = false; + +template +_CCCL_INLINE_VAR constexpr bool __is_valid_signature = true; + +template +_CCCL_INLINE_VAR constexpr bool __is_valid_signature = true; + +template <> +_CCCL_INLINE_VAR constexpr bool __is_valid_signature = true; + +// The implementation of transform_completion_signatures starts here +template class _Vy, template class _Ey, class _Sy> +extern __undefined<_Sig> __transform_sig; + +template class _Vy, template class _Ey, class _Sy> +extern __fn_t<_Vy<_Values...>>* __transform_sig; + +template class _Vy, template class _Ey, class _Sy> +extern __fn_t<_Ey<_Error>>* __transform_sig; + +template