From 33754cb0a4bc524f6c522bc2d515f152bdaa9173 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Mon, 19 Aug 2024 11:11:20 +0200 Subject: [PATCH] Apply CUB feedback from gevtushenko --- cub/benchmarks/bench/transform/babelstream.cu | 25 +-- cub/cub/device/device_transform.cuh | 191 +++++++++++------- .../device/dispatch/dispatch_transform.cuh | 179 ++++++++-------- cub/test/CMakeLists.txt | 4 +- cub/test/catch2_test_device_transform.cu | 28 +-- cub/test/catch2_test_device_transform_api.cu | 4 +- thrust/thrust/system/cuda/detail/transform.h | 12 +- 7 files changed, 237 insertions(+), 206 deletions(-) diff --git a/cub/benchmarks/bench/transform/babelstream.cu b/cub/benchmarks/bench/transform/babelstream.cu index 47a0380dae0..7818bd93781 100644 --- a/cub/benchmarks/bench/transform/babelstream.cu +++ b/cub/benchmarks/bench/transform/babelstream.cu @@ -72,9 +72,9 @@ auto array_size_powers = nvbench::range(16, 28, 4); template CUB_RUNTIME_FUNCTION static void bench_transform( nvbench::state& state, - OffsetT count, ::cuda::std::tuple inputs, RandomAccessIteratorOut output, + OffsetT num_items, TransformOp transform_op) { #if !TUNE_BASE @@ -84,12 +84,13 @@ CUB_RUNTIME_FUNCTION static void bench_transform( #endif state.exec(nvbench::exec_tag::no_batch, [&](const nvbench::launch& launch) { - cub::detail::transform::dispatch_t, - RandomAccessIteratorOut, - TransformOp, - policy_t>::dispatch(count, inputs, output, transform_op, launch.get_stream()); + cub::detail::transform::dispatch_t< + false, + OffsetT, + ::cuda::std::tuple, + RandomAccessIteratorOut, + TransformOp, + policy_t>::dispatch(inputs, output, num_items, transform_op, launch.get_stream()); }); } @@ -111,7 +112,7 @@ static void mul(nvbench::state& state, nvbench::type_list) state.add_global_memory_writes(n); const T scalar = startScalar; - bench_transform(state, n, ::cuda::std::tuple{c.begin()}, b.begin(), [=] __device__ __host__(const T& ci) { + bench_transform(state, ::cuda::std::tuple{c.begin()}, b.begin(), n, [=] __device__ __host__(const T& ci) { return ci * scalar; }); } @@ -133,7 +134,7 @@ static void add(nvbench::state& state, nvbench::type_list) state.add_global_memory_reads(2 * n); state.add_global_memory_writes(n); bench_transform( - state, n, ::cuda::std::tuple{a.begin(), b.begin()}, c.begin(), [] _CCCL_DEVICE(const T& ai, const T& bi) -> T { + state, ::cuda::std::tuple{a.begin(), b.begin()}, c.begin(), n, [] _CCCL_DEVICE(const T& ai, const T& bi) -> T { return ai + bi; }); } @@ -156,7 +157,7 @@ static void triad(nvbench::state& state, nvbench::type_list) state.add_global_memory_writes(n); const T scalar = startScalar; bench_transform( - state, n, ::cuda::std::tuple{b.begin(), c.begin()}, a.begin(), [=] _CCCL_DEVICE(const T& bi, const T& ci) { + state, ::cuda::std::tuple{b.begin(), c.begin()}, a.begin(), n, [=] _CCCL_DEVICE(const T& bi, const T& ci) { return bi + scalar * ci; }); } @@ -180,9 +181,9 @@ static void nstream(nvbench::state& state, nvbench::type_list) const T scalar = startScalar; bench_transform( state, - n, ::cuda::std::tuple{a.begin(), b.begin(), c.begin()}, a.begin(), + n, [=] _CCCL_DEVICE(const T& ai, const T& bi, const T& ci) { return ai + bi + scalar * ci; }); @@ -207,7 +208,7 @@ static void heavy(nvbench::state& state) state.add_element_count(n); state.add_global_memory_reads(n); state.add_global_memory_writes(n); - bench_transform(state, n, ::cuda::std::tuple{in.begin()}, out.begin(), [=] _CCCL_DEVICE(T data) { + bench_transform(state, ::cuda::std::tuple{in.begin()}, out.begin(), n, [=] 
_CCCL_DEVICE(T data) { T reg[heavyness]; reg[0] = data; for (int i = 1; i < heavyness; ++i) diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh index de4aca46853..984109692f6 100644 --- a/cub/cub/device/device_transform.cuh +++ b/cub/cub/device/device_transform.cuh @@ -22,58 +22,60 @@ CUB_NAMESPACE_BEGIN //! DeviceTransform provides device-wide, parallel operations for transforming elements tuple-wise from multiple input -//! streams into an output stream. +//! sequences into an output sequence. struct DeviceTransform { - // Many input streams, one output stream - - /// @rst - /// Overview - /// +++++++++++++++++++++++++++++++++++++++++++++ - /// Transforms many input streams into one output stream, by applying a transformation operation on corresponding - /// input elements and writing the result to the corresponding output element. No guarantee is given on the identity - /// (i.e. address) of the objects passed to the call operator of the transformation operation. - /// - /// A Simple Example - /// +++++++++++++++++++++++++++++++++++++++++++++ - /// - /// .. literalinclude:: ../../../cub/test/catch2_test_device_transform_api.cu - /// :language: c++ - /// :dedent: - /// :start-after: example-begin transform-many - /// :end-before: example-end transform-many - /// - /// @endrst - /// - /// @param count The number of elements in each input stream. - /// @param inputs A tuple of iterators to the input streams where count elements are read from each. The iterators' - /// value types must be trivially relocatable. - /// @param output An iterator to the output stream where count results are written to. - /// @param transform_op An n-ary function object, where n is the number of input streams. The input iterators' value - /// types must be convertible to the parameters of the function object's call operator. The return type of the call - /// operator must be assignable to the dereferenced output iterator. + //! @rst + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! Transforms many input sequences into one output sequence, by applying a transformation operation on corresponding + //! input elements and writing the result to the corresponding output element. No guarantee is given on the identity + //! (i.e. address) of the objects passed to the call operator of the transformation operation. + //! + //! A Simple Example + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_transform_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin transform-many + //! :end-before: example-end transform-many + //! + //! @endrst + //! + //! @param inputs A tuple of iterators to the input sequences where num_items elements are read from each. The + //! iterators' value types must be trivially relocatable. + //! @param output An iterator to the output sequence where num_items results are written to. + //! @param num_items The number of elements in each input sequence. + //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value + //! types must be convertible to the parameters of the function object's call operator. The return type of the call + //! operator must be assignable to the dereferenced output iterator. + //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. 
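A minimal call-site sketch of the reordered signature documented above (inputs tuple, then output, then num_items), assuming a device lambda compiled with --extended-lambda as the tests do; all names below are illustrative and not part of this patch. The new order matches other CUB device APIs such as DeviceReduce, where num_items follows the data arguments.

#include <cub/device/device_transform.cuh>
#include <thrust/device_vector.h>

void transform_example(cudaStream_t stream)
{
  const int num_items = 1 << 20;
  thrust::device_vector<int> a(num_items, 1), b(num_items, 2), out(num_items);
  // inputs tuple, output, num_items, transform_op, stream
  cub::DeviceTransform::Transform(
    ::cuda::std::make_tuple(a.begin(), b.begin()),
    out.begin(),
    num_items,
    [] __device__(int x, int y) { return x + y; },
    stream);
}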
template CUB_RUNTIME_FUNCTION static cudaError_t Transform( - int count, ::cuda::std::tuple inputs, RandomAccessIteratorOut output, + int num_items, TransformOp transform_op, cudaStream_t stream = nullptr) { CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Transform"); return detail::transform:: dispatch_t, RandomAccessIteratorOut, TransformOp>:: - dispatch(count, ::cuda::std::move(inputs), ::cuda::std::move(output), ::cuda::std::move(transform_op), stream); + dispatch( + ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); } - // temp storage overload +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + // This overload has additional parameters to specify temporary storage. Provided for compatibility with other CUB + // APIs. template CUB_RUNTIME_FUNCTION static cudaError_t Transform( void* d_temp_storage, size_t& temp_storage_bytes, - int count, ::cuda::std::tuple inputs, RandomAccessIteratorOut output, + int num_items, TransformOp transform_op, cudaStream_t stream = nullptr) { @@ -84,35 +86,50 @@ struct DeviceTransform } return Transform( - count, ::cuda::std::move(inputs), ::cuda::std::move(output), ::cuda::std::move(transform_op), stream); + ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS - // One input stream, one output stream - + //! @rst + //! Transforms one input sequence into one output sequence, by applying a transformation operation on corresponding + //! input elements and writing the result to the corresponding output element. No guarantee is given on the identity + //! (i.e. address) of the objects passed to the call operator of the transformation operation. + //! @endrst + //! + //! @param input An iterator to the input sequence where num_items elements are read from. The iterator's value type + //! must be trivially relocatable. + //! @param output An iterator to the output sequence where num_items results are written to. + //! @param num_items The number of elements in the input sequence. + //! @param transform_op A unary function object. The input iterator's value type must be convertible to the parameter + //! of the function object's call operator. The return type of the call operator must be assignable to the + //! dereferenced output iterator. + //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. 
template CUB_RUNTIME_FUNCTION static cudaError_t Transform( void* d_temp_storage, size_t& temp_storage_bytes, - int count, RandomAccessIteratorIn input, RandomAccessIteratorOut output, + int num_items, TransformOp transform_op, cudaStream_t stream = nullptr) { @@ -123,53 +140,63 @@ struct DeviceTransform } return Transform( - count, ::cuda::std::make_tuple(::cuda::std::move(input)), ::cuda::std::move(output), + num_items, ::cuda::std::move(transform_op), stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS - // Many input streams, one output stream, address stable - - /// @rst - /// Overview - /// +++++++++++++++++++++++++++++++++++++++++++++ - /// Like \ref cub::DeviceTransform::Transform, but the objects passed to the call operator of the transformation - /// operation are guaranteed to reside in the input streams and are never copied. - /// - /// A Simple Example - /// +++++++++++++++++++++++++++++++++++++++++++++ - /// - /// .. literalinclude:: ../../../cub/test/catch2_test_device_transform_api.cu - /// :language: c++ - /// :dedent: - /// :start-after: example-begin transform-many-stable - /// :end-before: example-end transform-many-stable - /// - /// @endrst + //! @rst + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! Transforms many input sequences into one output sequence, by applying a transformation operation on corresponding + //! input elements and writing the result to the corresponding output element. The objects passed to the call operator + //! of the transformation operation are guaranteed to reside in the input sequences and are never copied. + //! + //! A Simple Example + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_transform_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin transform-many-stable + //! :end-before: example-end transform-many-stable + //! + //! @endrst + //! + //! @param inputs A tuple of iterators to the input sequences where num_items elements are read from each. The + //! iterators' value types must be trivially relocatable. + //! @param output An iterator to the output sequence where num_items results are written to. + //! @param num_items The number of elements in each input sequence. + //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value + //! types must be convertible to the parameters of the function object's call operator. The return type of the call + //! operator must be assignable to the dereferenced output iterator. + //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. 
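A hedged sketch of calling the stable-address variant documented above, modeled on the plus_needs_stable_address functor from the tests: raw pointers are passed as inputs so the call operator can recover an index from the address of its argument; the functor and variable names are illustrative.

#include <cub/device/device_transform.cuh>
#include <thrust/device_vector.h>

// Relies on the guarantee that a_value resides in the input sequence.
struct stable_plus
{
  const int* a_base;
  const int* b;
  __device__ int operator()(const int& a_value) const
  {
    return a_value + b[&a_value - a_base]; // index recovered from the stable address
  }
};

void stable_example(cudaStream_t stream)
{
  const int num_items = 1 << 20;
  thrust::device_vector<int> a(num_items, 1), b(num_items, 2), out(num_items);
  const int* a_ptr = thrust::raw_pointer_cast(a.data());
  cub::DeviceTransform::TransformStableArgumentAddresses(
    ::cuda::std::make_tuple(a_ptr), out.begin(), num_items,
    stable_plus{a_ptr, thrust::raw_pointer_cast(b.data())}, stream);
}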
template CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses( - int count, ::cuda::std::tuple inputs, RandomAccessIteratorOut output, + int num_items, TransformOp transform_op, cudaStream_t stream = nullptr) { - CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Transform"); + CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceTransform::TransformStableArgumentAddresses"); return detail::transform:: dispatch_t, RandomAccessIteratorOut, TransformOp>:: - dispatch(count, ::cuda::std::move(inputs), ::cuda::std::move(output), ::cuda::std::move(transform_op), stream); + dispatch( + ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); } - // temp storage overload +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses( void* d_temp_storage, size_t& temp_storage_bytes, - int count, ::cuda::std::tuple inputs, RandomAccessIteratorOut output, + int num_items, TransformOp transform_op, cudaStream_t stream = nullptr) { @@ -180,35 +207,48 @@ struct DeviceTransform } return TransformStableArgumentAddresses( - count, ::cuda::std::move(inputs), ::cuda::std::move(output), ::cuda::std::move(transform_op), stream); + ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS - // One input stream, one output stream - + //! @rst + //! Transforms one input sequence into one output sequence, by applying a transformation operation on corresponding + //! input elements and writing the result to the corresponding output element. The objects passed to the call operator + //! of the transformation operation are guaranteed to reside in the input sequence and are never copied. + //! @endrst + //! + //! @param input An iterator to the input sequence where num_items elements are read from. The iterator's value type + //! must be trivially relocatable. + //! @param output An iterator to the output sequence where num_items results are written to. + //! @param num_items The number of elements in the input sequence. + //! @param transform_op A unary function object. The input iterator's value type must be convertible to the parameter + //! of the function object's call operator. The return type of the call operator must be assignable to the + //! dereferenced output iterator. + //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. 
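For completeness, the hidden temp-storage overloads follow the usual two-phase CUB calling convention; a sketch under the assumption (the unchanged overload bodies are elided by the hunks above) that the size query merely sets a dummy size, since DeviceTransform needs no temporary storage:

#include <cub/device/device_transform.cuh>

void two_phase_example(int num_items, const int* d_in, int* d_out, cudaStream_t stream)
{
  void* d_temp_storage      = nullptr;
  size_t temp_storage_bytes = 0;
  auto op                   = [] __device__(int x) { return -x; };
  // Phase 1: query the required temporary storage size
  cub::DeviceTransform::Transform(
    d_temp_storage, temp_storage_bytes, ::cuda::std::make_tuple(d_in), d_out, num_items, op, stream);
  // Phase 2: allocate and run
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceTransform::Transform(
    d_temp_storage, temp_storage_bytes, ::cuda::std::make_tuple(d_in), d_out, num_items, op, stream);
  cudaFree(d_temp_storage);
}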
template CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses( - int count, - RandomAccessIteratorIn inputs, + RandomAccessIteratorIn input, RandomAccessIteratorOut output, + int num_items, TransformOp transform_op, cudaStream_t stream = nullptr) { return TransformStableArgumentAddresses( - count, - ::cuda::std::make_tuple(::cuda::std::move(inputs)), + ::cuda::std::make_tuple(::cuda::std::move(input)), ::cuda::std::move(output), + num_items, ::cuda::std::move(transform_op), stream); } - // temp storage overload +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses( void* d_temp_storage, size_t& temp_storage_bytes, - int count, - RandomAccessIteratorIn inputs, + RandomAccessIteratorIn input, RandomAccessIteratorOut output, + int num_items, TransformOp transform_op, cudaStream_t stream = nullptr) { @@ -219,12 +259,13 @@ struct DeviceTransform } return TransformStableArgumentAddresses( - count, - ::cuda::std::make_tuple(::cuda::std::move(inputs)), + ::cuda::std::make_tuple(::cuda::std::move(input)), ::cuda::std::move(output), + num_items, ::cuda::std::move(transform_op), stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh index bd914e97a20..4838ef70471 100644 --- a/cub/cub/device/dispatch/dispatch_transform.cuh +++ b/cub/cub/device/dispatch/dispatch_transform.cuh @@ -103,7 +103,7 @@ struct prefetch_policy_t static constexpr int MAX_ITEMS_PER_THREAD = 32; }; -// prefetching out-of-bounds addresses has no side effects +// Prefetches (at least on Hopper) a 128 byte cache line. Prefetching out-of-bounds addresses has no side effects template _CCCL_DEVICE void prefetch(const T* addr) { @@ -112,7 +112,7 @@ _CCCL_DEVICE void prefetch(const T* addr) } // overload for any iterator that is not a pointer, do nothing -template +template ::value, int> = 0> _CCCL_DEVICE void prefetch(It) {} @@ -120,7 +120,7 @@ _CCCL_DEVICE void prefetch(It) template _CCCL_DEVICE void transform_kernel_impl( ::cuda::std::integral_constant, - Offset len, + Offset num_items, int num_elem_per_thread, F f, RandomAccessIteartorOut out, @@ -143,7 +143,7 @@ _CCCL_DEVICE void transform_kernel_impl( for (int j = 0; j < num_elem_per_thread; ++j) { const auto idx = offset + (j * blockDim.x + threadIdx.x); - if (idx < len) + if (idx < num_items) { // we have to unwrap Thrust's proxy references here for backward compatibility (try zip_iterator.cu test) out[idx] = f(THRUST_NS_QUALIFIER::raw_reference_cast(ins[idx])...); @@ -167,7 +167,7 @@ template _CCCL_DEVICE void transform_kernel_impl( ::cuda::std::integral_constant, - Offset len, + Offset num_items, int, F f, RandomAccessIteartorOut out, @@ -184,7 +184,7 @@ _CCCL_DEVICE void transform_kernel_impl( for (int j = 0; j < items_per_thread; ++j) { const auto idx = offset + (j * block_dim + threadIdx.x); - if (idx < len) + if (idx < num_items) { // TODO(bgruber): replace by fold over comma in C++17 int dummy[] = {(arrays[j] = ins[idx], 0)..., 0}; // extra zero to handle empty packs @@ -196,7 +196,7 @@ _CCCL_DEVICE void transform_kernel_impl( for (int j = 0; j < items_per_thread; ++j) { const auto idx = offset + (j * block_dim + threadIdx.x); - if (idx < len) + if (idx < num_items) { out[idx] = f(arrays[j]...); } @@ -217,36 +217,34 @@ struct async_copy_policy_t // TODO(bgruber) cheap copy of ::cuda::std::apply, which requires C++17. 
template _CCCL_DEVICE auto poor_apply_impl(F&& f, Tuple&& t, ::cuda::std::index_sequence) - -> decltype(std::forward(f)(::cuda::std::get(std::forward(t))...)) + -> decltype(::cuda::std::forward(f)(::cuda::std::get(::cuda::std::forward(t))...)) { - return std::forward(f)(::cuda::std::get(std::forward(t))...); + return ::cuda::std::forward(f)(::cuda::std::get(::cuda::std::forward(t))...); } template _CCCL_DEVICE auto poor_apply(F&& f, Tuple&& t) -> decltype(poor_apply_impl( - std::forward(f), - std::forward(t), + ::cuda::std::forward(f), + ::cuda::std::forward(t), ::cuda::std::make_index_sequence<::cuda::std::tuple_size<::cuda::std::__libcpp_remove_reference_t>::value>{})) { return poor_apply_impl( - std::forward(f), - std::forward(t), + ::cuda::std::forward(f), + ::cuda::std::forward(t), ::cuda::std::make_index_sequence<::cuda::std::tuple_size<::cuda::std::__libcpp_remove_reference_t>::value>{}); } // mult must be a power of 2 _CCCL_HOST_DEVICE inline auto round_up_to_multiple(int x, int mult) -> int { - const auto p = cuda::std::popcount(static_cast(mult)); - assert(p == 1); - (void) p; + assert(::cuda::std::popcount(static_cast(mult)) == 1); return (x + mult - 1) & ~(mult - 1); } // TODO(bgruber): inline this as lambda in C++14 template -_CCCL_DEVICE T* copy_and_return_smem_dst( +_CCCL_DEVICE const T* copy_and_return_smem_dst( cooperative_groups::thread_block& group, int tile_size, char* smem, int& smem_offset, int global_offset, const T* ptr) { // using T = ::cuda::std::__remove_const_t<::cuda::std::__remove_pointer_t>; @@ -261,7 +259,7 @@ _CCCL_DEVICE T* copy_and_return_smem_dst( template _CCCL_DEVICE void transform_kernel_impl( ::cuda::std::integral_constant, - Offset len, + Offset num_items, int num_elem_per_thread, F f, RandomAccessIteartorOut out, @@ -270,14 +268,14 @@ _CCCL_DEVICE void transform_kernel_impl( extern __shared__ char smem[]; const Offset tile_stride = blockDim.x * num_elem_per_thread; - const Offset global_offset = std::size_t{blockIdx.x} * tile_stride; - const int tile_size = ::cuda::std::min(len - global_offset, tile_stride); + const Offset global_offset = static_cast(blockIdx.x) * tile_stride; + const int tile_size = ::cuda::std::min(num_items - global_offset, tile_stride); auto group = cooperative_groups::this_thread_block(); // TODO(bgruber): if we pass block size as template parameter, we could compute the smem offsets at compile time int smem_offset = 0; - const auto smem_ptrs = ::cuda::std::tuple{ + const auto smem_ptrs = ::cuda::std::tuple{ copy_and_return_smem_dst(group, tile_size, smem, smem_offset, global_offset, pointers)...}; cooperative_groups::wait(group); (void) smem_ptrs; // suppress unused warning for MSVC @@ -288,7 +286,7 @@ _CCCL_DEVICE void transform_kernel_impl( { const int smem_idx = i * blockDim.x + threadIdx.x; const Offset gmem_idx = global_offset + smem_idx; - if (gmem_idx < len) + if (gmem_idx < num_items) { out[gmem_idx] = poor_apply( [&](const InTs*... 
smem_base_ptrs) { @@ -299,21 +297,7 @@ _CCCL_DEVICE void transform_kernel_impl( } } -_CCCL_DEVICE inline bool elect_sync(const std::uint32_t& membermask) -{ - std::uint32_t is_elected; - asm volatile( - "{\n\t .reg .pred P_OUT; \n\t" - "elect.sync _|P_OUT, %1;\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(is_elected) - : "r"(membermask) - :); - return static_cast(is_elected); -} - -_CCCL_HOST_DEVICE constexpr uint32_t round_up_16(std::uint32_t x) +_CCCL_HOST_DEVICE constexpr uint32_t round_up_16(::cuda::std::uint32_t x) { return (x + 15) & ~15; } @@ -321,14 +305,15 @@ _CCCL_HOST_DEVICE constexpr uint32_t round_up_16(std::uint32_t x) template _CCCL_HOST_DEVICE T* round_down_ptr_128(const T* ptr) { - constexpr auto mask = ~std::uintptr_t{128 - 1}; - return reinterpret_cast(reinterpret_cast(ptr) & mask); + constexpr auto mask = ~::cuda::std::uintptr_t{128 - 1}; + return reinterpret_cast(reinterpret_cast<::cuda::std::uintptr_t>(ptr) & mask); } template _CCCL_HOST_DEVICE int offset_to_aligned_ptr_128(const T* ptr) { - return static_cast((reinterpret_cast(ptr) & std::uintptr_t{128 - 1}) / sizeof(T)); + return static_cast( + (reinterpret_cast<::cuda::std::uintptr_t>(ptr) & ::cuda::std::uintptr_t{128 - 1}) / sizeof(T)); } template @@ -340,17 +325,6 @@ struct ptr_set uint32_t base_offset; // Offset in number of bytes of T in ptr in ptr_base, i.e. ptr == &ptr_base[base_offset] }; -template -_CCCL_HOST_DEVICE ptr_set make_ublkcp_ptr_set(T* ptr) -{ - return ptr_set{ - round_down_ptr_128(ptr), - round_up_16(offset_to_aligned_ptr_128(ptr) * sizeof(T)), - // (ptr - ptr_base) * sizeof(T) rounded up to nearest multiple of 16 - static_cast(sizeof(T) * (ptr - round_down_ptr_128(ptr))), - }; -} - // TODO(bgruber): inline this as lambda in C++14 template _CCCL_DEVICE void copy_ptr_set( @@ -391,7 +365,7 @@ fetch_operand(uint32_t tile_stride, const char* smem, int& smem_offset, int smem template _CCCL_DEVICE void transform_kernel_impl( ::cuda::std::integral_constant, - Offset len, + Offset num_items, int num_elem_per_thread, F f, RandomAccessIteartorOut out, @@ -401,25 +375,17 @@ _CCCL_DEVICE void transform_kernel_impl( __shared__ uint64_t bar; extern __shared__ char __attribute((aligned(128))) smem[]; - namespace ptx = cuda::ptx; + namespace ptx = ::cuda::ptx; const int tile_stride = blockDim.x * num_elem_per_thread; const Offset global_offset = static_cast(blockIdx.x) * tile_stride; - const bool elected = elect_sync(~0); - - if (threadIdx.x < 32 && elected) - { + cooperative_groups::invoke_one(cooperative_groups::this_thread_block(), [&]() { // Then initialize barriers - ::cuda::ptx::mbarrier_init(&bar, 1); - ::cuda::ptx::fence_proxy_async(::cuda::ptx::space_shared); + ptx::mbarrier_init(&bar, 1); + ptx::fence_proxy_async(ptx::space_shared); - // Compute tile_size (relevant if processing last tile to not read out-of-bounds) - auto tile_size = tile_stride; - if (len < global_offset + tile_stride) - { - tile_size = len - global_offset; - } + const int tile_size = ::cuda::std::min(num_items - global_offset, Offset{tile_stride}); int smem_offset = 0; std::uint32_t total_copied = 0; @@ -433,12 +399,11 @@ _CCCL_DEVICE void transform_kernel_impl( (void) dummy; # endif - ::cuda::ptx::mbarrier_arrive_expect_tx( - ::cuda::ptx::sem_release, ::cuda::ptx::scope_cta, ::cuda::ptx::space_shared, &bar, total_copied); - } + ptx::mbarrier_arrive_expect_tx(ptx::sem_release, ptx::scope_cta, ptx::space_shared, &bar, total_copied); + }); __syncthreads(); - while (!::cuda::ptx::mbarrier_try_wait_parity(&bar, 0)) + while 
(!ptx::mbarrier_try_wait_parity(&bar, 0)) { } // Intentionally use unroll 1. This tends to improve performance. @@ -449,7 +414,7 @@ _CCCL_DEVICE void transform_kernel_impl( // sub-expression elimination logic is smart enough to remove the redundant computations. const int smem_idx = j * blockDim.x + threadIdx.x; const Offset g_idx = global_offset + j * blockDim.x + threadIdx.x; - if (g_idx < len) + if (g_idx < num_items) { int smem_offset = 0; out[g_idx] = f(fetch_operand(tile_stride, smem, smem_offset, smem_idx, pointers)...); @@ -521,12 +486,16 @@ template __launch_bounds__(MaxPolicy::ActivePolicy::algo_policy::BLOCK_THREADS) CUB_DETAIL_KERNEL_ATTRIBUTES void transform_kernel( - Offset len, int num_elem_per_thread, F f, RandomAccessIteartorOut out, kernel_arg... ins) + Offset num_items, + int num_elem_per_thread, + F f, + RandomAccessIteartorOut out, + kernel_arg... ins) { constexpr auto alg = ::cuda::std::integral_constant{}; transform_kernel_impl( alg, - len, + num_items, num_elem_per_thread, ::cuda::std::move(f), ::cuda::std::move(out), @@ -650,8 +619,33 @@ struct PoorExpected { return reinterpret_cast(storage); } + + _CCCL_HOST_DEVICE const T& operator*() const + { + return reinterpret_cast(storage); + } }; +_CCCL_HOST_DEVICE inline PoorExpected get_max_shared_memory() +{ + // gevtushenko promised me that I can assume that stream belongs to the currently active device + int device = 0; + auto error = CubDebug(cudaGetDevice(&device)); + if (error != cudaSuccess) + { + return error; + } + + int max_smem = 0; + error = CubDebug(cudaDeviceGetAttribute(&max_smem, cudaDevAttrMaxSharedMemoryPerBlock, device)); + if (error != cudaSuccess) + { + return error; + } + + return max_smem; +} + template { - Offset num_items; ::cuda::std::tuple in; RandomAccessIteratorOut out; + Offset num_items; TransformOp op; cudaStream_t stream; - int max_smem; using kernel_ptr_t = decltype(&transform_kernel max_smem) + if (smem_size > *max_smem) { break; } @@ -769,6 +768,12 @@ struct dispatch_t max_smem) + if (smem_size > *max_smem) { break; } @@ -920,7 +925,7 @@ struct dispatch_t(in)[i]...); } @@ -945,9 +950,9 @@ struct dispatch_t in, RandomAccessIteratorOut out, + Offset num_items, TransformOp op, cudaStream_t stream) { @@ -963,29 +968,13 @@ struct dispatch_t...>; - dispatch_t dispatch{ - num_items, ::cuda::std::move(in), ::cuda::std::move(out), ::cuda::std::move(op), stream, max_smem, kernel}; + dispatch_t dispatch{::cuda::std::move(in), ::cuda::std::move(out), num_items, ::cuda::std::move(op), stream, kernel}; return CubDebug(PolicyHub::max_policy::Invoke(ptx_version, dispatch)); } }; diff --git a/cub/test/CMakeLists.txt b/cub/test/CMakeLists.txt index 0f5dce13481..9907db9db4e 100644 --- a/cub/test/CMakeLists.txt +++ b/cub/test/CMakeLists.txt @@ -261,8 +261,8 @@ function(cub_add_test target_name_var test_name test_src cub_target launcher_id) target_compile_options(${test_target} PRIVATE -ftemplate-depth=1000) # for handling large type lists endif() - # TODO(bgruber): maybe we want to enable lambdas for all API examples - if ("${test_target}" MATCHES "test.device_transform_api") + # enable lambdas for all API examples + if ("${test_target}" MATCHES "test.[A-Za-z0-9_]+_api") target_compile_options(${test_target} PRIVATE --extended-lambda) endif() diff --git a/cub/test/catch2_test_device_transform.cu b/cub/test/catch2_test_device_transform.cu index 8255fb153e6..8a07146d49c 100644 --- a/cub/test/catch2_test_device_transform.cu +++ b/cub/test/catch2_test_device_transform.cu @@ -51,9 +51,9 @@ template inputs, 
RandomAccessIteartorOut output, + int num_items, TransformOp transform_op, cudaStream_t stream = nullptr) { @@ -70,7 +70,7 @@ CUB_RUNTIME_FUNCTION static cudaError_t transform_many_with_alg_entry_point( RandomAccessIteartorOut, TransformOp, policy_hub_for_alg>{} - .dispatch(num_items, inputs, output, transform_op, stream); + .dispatch(inputs, output, num_items, transform_op, stream); } DECLARE_LAUNCH_WRAPPER(cub::DeviceTransform::Transform, transform_many); @@ -125,7 +125,7 @@ CUB_TEST("DeviceTransform::Transform BabelStream add", c2h::device_vector result(num_items); transform_many_with_alg( - num_items, ::cuda::std::make_tuple(a.begin(), b.begin()), result.begin(), ::cuda::std::plus{}); + ::cuda::std::make_tuple(a.begin(), b.begin()), result.begin(), num_items, ::cuda::std::plus{}); // compute reference and verify c2h::host_vector a_h = a; @@ -173,7 +173,7 @@ CUB_TEST("DeviceTransform::Transform BabelStream nstream", c2h::host_vector c_h = c; transform_many_with_alg( - num_items, ::cuda::std::make_tuple(a.begin(), b.begin(), c.begin()), a.begin(), nstream_kernel{}); + ::cuda::std::make_tuple(a.begin(), b.begin(), c.begin()), a.begin(), num_items, nstream_kernel{}); // compute reference and verify auto z = thrust::make_zip_iterator(a_h.begin(), b_h.begin(), c_h.begin()); @@ -204,7 +204,7 @@ CUB_TEST("DeviceTransform::Transform add five streams", "[device][device_transfo c2h::device_vector result(num_items); transform_many_with_alg( - num_items, ::cuda::std::make_tuple(a.begin(), b.begin(), c.begin(), d.begin(), e.begin()), result.begin(), Sum{}); + ::cuda::std::make_tuple(a.begin(), b.begin(), c.begin(), d.begin(), e.begin()), result.begin(), num_items, Sum{}); // compute reference and verify c2h::device_vector reference(num_items, 1 + 2 + 3 + 4 + 5); @@ -223,7 +223,7 @@ CUB_TEST("DeviceTransform::Transform no streams", "[device][device_transform]") { constexpr int num_items = 100; c2h::device_vector result(num_items); - transform_many(num_items, ::cuda::std::tuple<>{}, result.begin(), GiveMeFive{}); + transform_many(::cuda::std::tuple<>{}, result.begin(), num_items, GiveMeFive{}); // compute reference and verify c2h::device_vector reference(num_items, 5); @@ -239,7 +239,7 @@ CUB_TEST("DeviceTransform::Transform fancy input iterator types", "[device][devi thrust::counting_iterator b{10}; c2h::device_vector result(num_items); - transform_many(num_items, ::cuda::std::make_tuple(a, b), result.begin(), ::cuda::std::plus{}); + transform_many(::cuda::std::make_tuple(a, b), result.begin(), num_items, ::cuda::std::plus{}); // compute reference and verify c2h::host_vector reference_h(num_items); @@ -258,7 +258,7 @@ CUB_TEST("DeviceTransform::Transform fancy output iterator type", "[device][devi c2h::device_vector a(num_items, 10); c2h::device_vector b(num_items, 10); transform_many_with_alg( - num_items, ::cuda::std::make_tuple(a.begin(), b.end()), thrust::discard_iterator<>{}, ::cuda::std::plus{}); + ::cuda::std::make_tuple(a.begin(), b.end()), thrust::discard_iterator<>{}, num_items, ::cuda::std::plus{}); } CUB_TEST("DeviceTransform::Transform mixed input iterator types", "[device][device_transform]") @@ -270,7 +270,7 @@ CUB_TEST("DeviceTransform::Transform mixed input iterator types", "[device][devi c2h::device_vector b(num_items, 10); c2h::device_vector result(num_items); - transform_many(num_items, ::cuda::std::make_tuple(a, b.begin()), result.begin(), ::cuda::std::plus{}); + transform_many(::cuda::std::make_tuple(a, b.begin()), result.begin(), num_items, ::cuda::std::plus{}); // compute 
reference and verify c2h::host_vector b_h = b; @@ -303,9 +303,9 @@ CUB_TEST("DeviceTransform::Transform address stability", "[device][device_transf c2h::device_vector result(num_items); transform_many_stable( - num_items, ::cuda::std::make_tuple(thrust::raw_pointer_cast(a.data())), result.begin(), + num_items, plus_needs_stable_address{thrust::raw_pointer_cast(a.data()), thrust::raw_pointer_cast(b.data())}); // compute reference and verify @@ -359,7 +359,7 @@ CUB_TEST("DeviceTransform::Transform not trivially relocatable", "[device][devic constexpr int num_items = 100; c2h::device_vector input(num_items, non_trivial{42}); c2h::device_vector result(num_items); - transform_many(num_items, ::cuda::std::make_tuple(input.begin()), result.begin(), ::cuda::std::negate<>{}); + transform_many(::cuda::std::make_tuple(input.begin()), result.begin(), num_items, ::cuda::std::negate<>{}); const auto reference = c2h::device_vector(num_items, non_trivial{-42}); REQUIRE((reference == result)); @@ -375,9 +375,9 @@ CUB_TEST("DeviceTransform::Transform buffer start alignment", const int offset = GENERATE(1, 2, 4, 8, 16, 32, 64, 128); // global memory is always at least 256 byte aligned c2h::device_vector input(num_items, 42); c2h::device_vector result(num_items); - transform_many(num_items - offset, - ::cuda::std::make_tuple(input.begin() + offset), + transform_many(::cuda::std::make_tuple(input.begin() + offset), result.begin() + offset, + num_items - offset, ::cuda::std::negate<>{}); auto reference = c2h::device_vector(num_items); @@ -431,7 +431,7 @@ CUB_TEST("DeviceTransform::Transform heavy functor", // c2h::gen(CUB_SEED(1), input, 1, 10); c2h::device_vector result(num_items); transform_many_with_alg( - num_items, ::cuda::std::make_tuple(input.begin()), result.begin(), heavy_functor{}); + ::cuda::std::make_tuple(input.begin()), result.begin(), num_items, heavy_functor{}); // compute reference and verify c2h::host_vector input_h = input; diff --git a/cub/test/catch2_test_device_transform_api.cu b/cub/test/catch2_test_device_transform_api.cu index 2d75454cc19..a2a3e022f31 100644 --- a/cub/test/catch2_test_device_transform_api.cu +++ b/cub/test/catch2_test_device_transform_api.cu @@ -22,7 +22,7 @@ void test_transform_api() thrust::device_vector result(num_items); cub::DeviceTransform::Transform( - num_items, ::cuda::std::make_tuple(input1.begin(), input2.begin(), input3), result.begin(), op); + ::cuda::std::make_tuple(input1.begin(), input2.begin(), input3), result.begin(), num_items, op); thrust::host_vector expected{520, 111, 397, 618}; // example-end transform-many @@ -50,7 +50,7 @@ void test_transform_stable_api() thrust::device_vector result(num_items); cub::DeviceTransform::TransformStableArgumentAddresses( - num_items, ::cuda::std::make_tuple(input1.begin(), input3), result.begin(), op); + ::cuda::std::make_tuple(input1.begin(), input3), result.begin(), num_items, op); thrust::host_vector expected{520, 111, 397, 618}; // example-end transform-many-stable diff --git a/thrust/thrust/system/cuda/detail/transform.h b/thrust/thrust/system/cuda/detail/transform.h index 60cf4037195..624a221144d 100644 --- a/thrust/thrust/system/cuda/detail/transform.h +++ b/thrust/thrust/system/cuda/detail/transform.h @@ -242,9 +242,9 @@ _CCCL_EXEC_CHECK_DISABLE template OutputIt THRUST_FUNCTION cub_transform_many( execution_policy& policy, - Offset num_items, ::cuda::std::tuple firsts, OutputIt result, + Offset num_items, TransformOp transform_op) { if (num_items == 0) @@ -266,7 +266,7 @@ OutputIt THRUST_FUNCTION 
cub_transform_many( dispatch32_t::dispatch, dispatch64_t::dispatch, num_items, - (num_items_fixed, firsts, result, transform_op, cuda_cub::stream(policy))); + (firsts, result, num_items_fixed, transform_op, cuda_cub::stream(policy))); throw_on_error(status, "transform: failed inside CUB"); status = cuda_cub::synchronize_optional(policy); @@ -286,16 +286,16 @@ convert_to_std_tuple(tuple t, ::cuda::std::index_sequence) -> ::cu template OutputIt THRUST_FUNCTION cub_transform_many( execution_policy& policy, - Offset num_items, ::cuda::std::tuple>> firsts, OutputIt result, + Offset num_items, zip_function transform_op) { return cub_transform_many( policy, - num_items, convert_to_std_tuple(get<0>(firsts).get_iterator_tuple(), ::cuda::std::index_sequence_for{}), result, + num_items, transform_op.underlying_function()); } } // namespace __transform @@ -342,7 +342,7 @@ transform(execution_policy& policy, InputIt first, InputIt last, Output THRUST_CDP_DISPATCH( (using size_type = typename iterator_traits::difference_type; const auto num_items = static_cast(thrust::distance(first, last)); - return __transform::cub_transform_many(policy, num_items, ::cuda::std::make_tuple(first), result, transform_op);), + return __transform::cub_transform_many(policy, ::cuda::std::make_tuple(first), result, num_items, transform_op);), (while (first != last) { *result = transform_op(raw_reference_cast(*first)); ++first; @@ -389,7 +389,7 @@ OutputIt THRUST_FUNCTION transform( (using size_type = typename iterator_traits::difference_type; const auto num_items = static_cast(thrust::distance(first1, last1)); return __transform::cub_transform_many( - policy, num_items, ::cuda::std::make_tuple(first1, first2), result, transform_op);), + policy, ::cuda::std::make_tuple(first1, first2), result, num_items, transform_op);), (while (first1 != last1) { *result = transform_op(raw_reference_cast(*first1), raw_reference_cast(*first2)); ++first1;
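For orientation, a hedged sketch of how these Thrust entry points are exercised from user code once they forward to cub_transform_many with the reordered arguments (sizes and values illustrative):

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform.h>

void thrust_example()
{
  thrust::device_vector<float> a(1024, 1.f), b(1024, 2.f), c(1024);
  // Unary overload: lowers to cub_transform_many(policy, make_tuple(first), result, num_items, op)
  thrust::transform(a.begin(), a.end(), c.begin(), thrust::negate<float>{});
  // Binary overload: lowers to cub_transform_many(policy, make_tuple(first1, first2), result, num_items, op)
  thrust::transform(a.begin(), a.end(), b.begin(), c.begin(), thrust::plus<float>{});
}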