From 33754cb0a4bc524f6c522bc2d515f152bdaa9173 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Mon, 19 Aug 2024 11:11:20 +0200 Subject: [PATCH] Apply CUB feedback from gevtushenko --- cub/benchmarks/bench/transform/babelstream.cu | 25 +-- cub/cub/device/device_transform.cuh | 191 +++++++++++------- .../device/dispatch/dispatch_transform.cuh | 179 ++++++++-------- cub/test/CMakeLists.txt | 4 +- cub/test/catch2_test_device_transform.cu | 28 +-- cub/test/catch2_test_device_transform_api.cu | 4 +- thrust/thrust/system/cuda/detail/transform.h | 12 +- 7 files changed, 237 insertions(+), 206 deletions(-) diff --git a/cub/benchmarks/bench/transform/babelstream.cu b/cub/benchmarks/bench/transform/babelstream.cu index 47a0380dae0..7818bd93781 100644 --- a/cub/benchmarks/bench/transform/babelstream.cu +++ b/cub/benchmarks/bench/transform/babelstream.cu @@ -72,9 +72,9 @@ auto array_size_powers = nvbench::range(16, 28, 4); template CUB_RUNTIME_FUNCTION static void bench_transform( nvbench::state& state, - OffsetT count, ::cuda::std::tuple inputs, RandomAccessIteratorOut output, + OffsetT num_items, TransformOp transform_op) { #if !TUNE_BASE @@ -84,12 +84,13 @@ CUB_RUNTIME_FUNCTION static void bench_transform( #endif state.exec(nvbench::exec_tag::no_batch, [&](const nvbench::launch& launch) { - cub::detail::transform::dispatch_t, - RandomAccessIteratorOut, - TransformOp, - policy_t>::dispatch(count, inputs, output, transform_op, launch.get_stream()); + cub::detail::transform::dispatch_t< + false, + OffsetT, + ::cuda::std::tuple, + RandomAccessIteratorOut, + TransformOp, + policy_t>::dispatch(inputs, output, num_items, transform_op, launch.get_stream()); }); } @@ -111,7 +112,7 @@ static void mul(nvbench::state& state, nvbench::type_list) state.add_global_memory_writes(n); const T scalar = startScalar; - bench_transform(state, n, ::cuda::std::tuple{c.begin()}, b.begin(), [=] __device__ __host__(const T& ci) { + bench_transform(state, ::cuda::std::tuple{c.begin()}, b.begin(), n, [=] __device__ __host__(const T& ci) { return ci * scalar; }); } @@ -133,7 +134,7 @@ static void add(nvbench::state& state, nvbench::type_list) state.add_global_memory_reads(2 * n); state.add_global_memory_writes(n); bench_transform( - state, n, ::cuda::std::tuple{a.begin(), b.begin()}, c.begin(), [] _CCCL_DEVICE(const T& ai, const T& bi) -> T { + state, ::cuda::std::tuple{a.begin(), b.begin()}, c.begin(), n, [] _CCCL_DEVICE(const T& ai, const T& bi) -> T { return ai + bi; }); } @@ -156,7 +157,7 @@ static void triad(nvbench::state& state, nvbench::type_list) state.add_global_memory_writes(n); const T scalar = startScalar; bench_transform( - state, n, ::cuda::std::tuple{b.begin(), c.begin()}, a.begin(), [=] _CCCL_DEVICE(const T& bi, const T& ci) { + state, ::cuda::std::tuple{b.begin(), c.begin()}, a.begin(), n, [=] _CCCL_DEVICE(const T& bi, const T& ci) { return bi + scalar * ci; }); } @@ -180,9 +181,9 @@ static void nstream(nvbench::state& state, nvbench::type_list) const T scalar = startScalar; bench_transform( state, - n, ::cuda::std::tuple{a.begin(), b.begin(), c.begin()}, a.begin(), + n, [=] _CCCL_DEVICE(const T& ai, const T& bi, const T& ci) { return ai + bi + scalar * ci; }); @@ -207,7 +208,7 @@ static void heavy(nvbench::state& state) state.add_element_count(n); state.add_global_memory_reads(n); state.add_global_memory_writes(n); - bench_transform(state, n, ::cuda::std::tuple{in.begin()}, out.begin(), [=] _CCCL_DEVICE(T data) { + bench_transform(state, ::cuda::std::tuple{in.begin()}, out.begin(), n, [=] 
_CCCL_DEVICE(T data) { T reg[heavyness]; reg[0] = data; for (int i = 1; i < heavyness; ++i) diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh index de4aca46853..984109692f6 100644 --- a/cub/cub/device/device_transform.cuh +++ b/cub/cub/device/device_transform.cuh @@ -22,58 +22,60 @@ CUB_NAMESPACE_BEGIN //! DeviceTransform provides device-wide, parallel operations for transforming elements tuple-wise from multiple input -//! streams into an output stream. +//! sequences into an output sequence. struct DeviceTransform { - // Many input streams, one output stream - - /// @rst - /// Overview - /// +++++++++++++++++++++++++++++++++++++++++++++ - /// Transforms many input streams into one output stream, by applying a transformation operation on corresponding - /// input elements and writing the result to the corresponding output element. No guarantee is given on the identity - /// (i.e. address) of the objects passed to the call operator of the transformation operation. - /// - /// A Simple Example - /// +++++++++++++++++++++++++++++++++++++++++++++ - /// - /// .. literalinclude:: ../../../cub/test/catch2_test_device_transform_api.cu - /// :language: c++ - /// :dedent: - /// :start-after: example-begin transform-many - /// :end-before: example-end transform-many - /// - /// @endrst - /// - /// @param count The number of elements in each input stream. - /// @param inputs A tuple of iterators to the input streams where count elements are read from each. The iterators' - /// value types must be trivially relocatable. - /// @param output An iterator to the output stream where count results are written to. - /// @param transform_op An n-ary function object, where n is the number of input streams. The input iterators' value - /// types must be convertible to the parameters of the function object's call operator. The return type of the call - /// operator must be assignable to the dereferenced output iterator. + //! @rst + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! Transforms many input sequences into one output sequence, by applying a transformation operation on corresponding + //! input elements and writing the result to the corresponding output element. No guarantee is given on the identity + //! (i.e. address) of the objects passed to the call operator of the transformation operation. + //! + //! A Simple Example + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_transform_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin transform-many + //! :end-before: example-end transform-many + //! + //! @endrst + //! + //! @param inputs A tuple of iterators to the input sequences where num_items elements are read from each. The + //! iterators' value types must be trivially relocatable. + //! @param output An iterator to the output sequence where num_items results are written to. + //! @param num_items The number of elements in each input sequence. + //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value + //! types must be convertible to the parameters of the function object's call operator. The return type of the call + //! operator must be assignable to the dereferenced output iterator. + //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. 
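A minimal call-site sketch of the reordered signature documented above (inputs tuple, then output, then num_items), assuming a device lambda compiled with --extended-lambda as the tests do; all names below are illustrative and not part of this patch. The new order matches other CUB device APIs such as DeviceReduce, where num_items follows the data arguments.

#include <cub/device/device_transform.cuh>
#include <thrust/device_vector.h>

void transform_example(cudaStream_t stream)
{
  const int num_items = 1 << 20;
  thrust::device_vector<int> a(num_items, 1), b(num_items, 2), out(num_items);
  // inputs tuple, output, num_items, transform_op, stream
  cub::DeviceTransform::Transform(
    ::cuda::std::make_tuple(a.begin(), b.begin()),
    out.begin(),
    num_items,
    [] __device__(int x, int y) { return x + y; },
    stream);
}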
template CUB_RUNTIME_FUNCTION static cudaError_t Transform( - int count, ::cuda::std::tuple inputs, RandomAccessIteratorOut output, + int num_items, TransformOp transform_op, cudaStream_t stream = nullptr) { CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Transform"); return detail::transform:: dispatch_t, RandomAccessIteratorOut, TransformOp>:: - dispatch(count, ::cuda::std::move(inputs), ::cuda::std::move(output), ::cuda::std::move(transform_op), stream); + dispatch( + ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); } - // temp storage overload +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + // This overload has additional parameters to specify temporary storage. Provided for compatibility with other CUB + // APIs. template CUB_RUNTIME_FUNCTION static cudaError_t Transform( void* d_temp_storage, size_t& temp_storage_bytes, - int count, ::cuda::std::tuple inputs, RandomAccessIteratorOut output, + int num_items, TransformOp transform_op, cudaStream_t stream = nullptr) { @@ -84,35 +86,50 @@ struct DeviceTransform } return Transform( - count, ::cuda::std::move(inputs), ::cuda::std::move(output), ::cuda::std::move(transform_op), stream); + ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS - // One input stream, one output stream - + //! @rst + //! Transforms one input sequence into one output sequence, by applying a transformation operation on corresponding + //! input elements and writing the result to the corresponding output element. No guarantee is given on the identity + //! (i.e. address) of the objects passed to the call operator of the transformation operation. + //! @endrst + //! + //! @param input An iterator to the input sequence where num_items elements are read from. The iterator's value type + //! must be trivially relocatable. + //! @param output An iterator to the output sequence where num_items results are written to. + //! @param num_items The number of elements in the input sequence. + //! @param transform_op A unary function object. The input iterator's value type must be convertible to the parameter + //! of the function object's call operator. The return type of the call operator must be assignable to the + //! dereferenced output iterator. + //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. 
template CUB_RUNTIME_FUNCTION static cudaError_t Transform( void* d_temp_storage, size_t& temp_storage_bytes, - int count, RandomAccessIteratorIn input, RandomAccessIteratorOut output, + int num_items, TransformOp transform_op, cudaStream_t stream = nullptr) { @@ -123,53 +140,63 @@ struct DeviceTransform } return Transform( - count, ::cuda::std::make_tuple(::cuda::std::move(input)), ::cuda::std::move(output), + num_items, ::cuda::std::move(transform_op), stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS - // Many input streams, one output stream, address stable - - /// @rst - /// Overview - /// +++++++++++++++++++++++++++++++++++++++++++++ - /// Like \ref cub::DeviceTransform::Transform, but the objects passed to the call operator of the transformation - /// operation are guaranteed to reside in the input streams and are never copied. - /// - /// A Simple Example - /// +++++++++++++++++++++++++++++++++++++++++++++ - /// - /// .. literalinclude:: ../../../cub/test/catch2_test_device_transform_api.cu - /// :language: c++ - /// :dedent: - /// :start-after: example-begin transform-many-stable - /// :end-before: example-end transform-many-stable - /// - /// @endrst + //! @rst + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! Transforms many input sequences into one output sequence, by applying a transformation operation on corresponding + //! input elements and writing the result to the corresponding output element. The objects passed to the call operator + //! of the transformation operation are guaranteed to reside in the input sequences and are never copied. + //! + //! A Simple Example + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_transform_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin transform-many-stable + //! :end-before: example-end transform-many-stable + //! + //! @endrst + //! + //! @param inputs A tuple of iterators to the input sequences where num_items elements are read from each. The + //! iterators' value types must be trivially relocatable. + //! @param output An iterator to the output sequence where num_items results are written to. + //! @param num_items The number of elements in each input sequence. + //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value + //! types must be convertible to the parameters of the function object's call operator. The return type of the call + //! operator must be assignable to the dereferenced output iterator. + //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. 
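A hedged sketch of calling the stable-address variant documented above, modeled on the plus_needs_stable_address functor from the tests: raw pointers are passed as inputs so the call operator can recover an index from the address of its argument; the functor and variable names are illustrative.

#include <cub/device/device_transform.cuh>
#include <thrust/device_vector.h>

// Relies on the guarantee that a_value resides in the input sequence.
struct stable_plus
{
  const int* a_base;
  const int* b;
  __device__ int operator()(const int& a_value) const
  {
    return a_value + b[&a_value - a_base]; // index recovered from the stable address
  }
};

void stable_example(cudaStream_t stream)
{
  const int num_items = 1 << 20;
  thrust::device_vector<int> a(num_items, 1), b(num_items, 2), out(num_items);
  const int* a_ptr = thrust::raw_pointer_cast(a.data());
  cub::DeviceTransform::TransformStableArgumentAddresses(
    ::cuda::std::make_tuple(a_ptr), out.begin(), num_items,
    stable_plus{a_ptr, thrust::raw_pointer_cast(b.data())}, stream);
}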
template CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses( - int count, ::cuda::std::tuple inputs, RandomAccessIteratorOut output, + int num_items, TransformOp transform_op, cudaStream_t stream = nullptr) { - CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Transform"); + CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceTransform::TransformStableArgumentAddresses"); return detail::transform:: dispatch_t, RandomAccessIteratorOut, TransformOp>:: - dispatch(count, ::cuda::std::move(inputs), ::cuda::std::move(output), ::cuda::std::move(transform_op), stream); + dispatch( + ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); } - // temp storage overload +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses( void* d_temp_storage, size_t& temp_storage_bytes, - int count, ::cuda::std::tuple inputs, RandomAccessIteratorOut output, + int num_items, TransformOp transform_op, cudaStream_t stream = nullptr) { @@ -180,35 +207,48 @@ struct DeviceTransform } return TransformStableArgumentAddresses( - count, ::cuda::std::move(inputs), ::cuda::std::move(output), ::cuda::std::move(transform_op), stream); + ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS - // One input stream, one output stream - + //! @rst + //! Transforms one input sequence into one output sequence, by applying a transformation operation on corresponding + //! input elements and writing the result to the corresponding output element. The objects passed to the call operator + //! of the transformation operation are guaranteed to reside in the input sequence and are never copied. + //! @endrst + //! + //! @param input An iterator to the input sequence where num_items elements are read from. The iterator's value type + //! must be trivially relocatable. + //! @param output An iterator to the output sequence where num_items results are written to. + //! @param num_items The number of elements in the input sequence. + //! @param transform_op A unary function object. The input iterator's value type must be convertible to the parameter + //! of the function object's call operator. The return type of the call operator must be assignable to the + //! dereferenced output iterator. + //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. 
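For completeness, the hidden temp-storage overloads follow the usual two-phase CUB calling convention; a sketch under the assumption (the unchanged overload bodies are elided by the hunks above) that the size query merely sets a dummy size, since DeviceTransform needs no temporary storage:

#include <cub/device/device_transform.cuh>

void two_phase_example(int num_items, const int* d_in, int* d_out, cudaStream_t stream)
{
  void* d_temp_storage      = nullptr;
  size_t temp_storage_bytes = 0;
  auto op                   = [] __device__(int x) { return -x; };
  // Phase 1: query the required temporary storage size
  cub::DeviceTransform::Transform(
    d_temp_storage, temp_storage_bytes, ::cuda::std::make_tuple(d_in), d_out, num_items, op, stream);
  // Phase 2: allocate and run
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceTransform::Transform(
    d_temp_storage, temp_storage_bytes, ::cuda::std::make_tuple(d_in), d_out, num_items, op, stream);
  cudaFree(d_temp_storage);
}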
template CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses( - int count, - RandomAccessIteratorIn inputs, + RandomAccessIteratorIn input, RandomAccessIteratorOut output, + int num_items, TransformOp transform_op, cudaStream_t stream = nullptr) { return TransformStableArgumentAddresses( - count, - ::cuda::std::make_tuple(::cuda::std::move(inputs)), + ::cuda::std::make_tuple(::cuda::std::move(input)), ::cuda::std::move(output), + num_items, ::cuda::std::move(transform_op), stream); } - // temp storage overload +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses( void* d_temp_storage, size_t& temp_storage_bytes, - int count, - RandomAccessIteratorIn inputs, + RandomAccessIteratorIn input, RandomAccessIteratorOut output, + int num_items, TransformOp transform_op, cudaStream_t stream = nullptr) { @@ -219,12 +259,13 @@ struct DeviceTransform } return TransformStableArgumentAddresses( - count, - ::cuda::std::make_tuple(::cuda::std::move(inputs)), + ::cuda::std::make_tuple(::cuda::std::move(input)), ::cuda::std::move(output), + num_items, ::cuda::std::move(transform_op), stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh index bd914e97a20..4838ef70471 100644 --- a/cub/cub/device/dispatch/dispatch_transform.cuh +++ b/cub/cub/device/dispatch/dispatch_transform.cuh @@ -103,7 +103,7 @@ struct prefetch_policy_t static constexpr int MAX_ITEMS_PER_THREAD = 32; }; -// prefetching out-of-bounds addresses has no side effects +// Prefetches (at least on Hopper) a 128 byte cache line. Prefetching out-of-bounds addresses has no side effects template _CCCL_DEVICE void prefetch(const T* addr) { @@ -112,7 +112,7 @@ _CCCL_DEVICE void prefetch(const T* addr) } // overload for any iterator that is not a pointer, do nothing -template +template ::value, int> = 0> _CCCL_DEVICE void prefetch(It) {} @@ -120,7 +120,7 @@ _CCCL_DEVICE void prefetch(It) template _CCCL_DEVICE void transform_kernel_impl( ::cuda::std::integral_constant, - Offset len, + Offset num_items, int num_elem_per_thread, F f, RandomAccessIteartorOut out, @@ -143,7 +143,7 @@ _CCCL_DEVICE void transform_kernel_impl( for (int j = 0; j < num_elem_per_thread; ++j) { const auto idx = offset + (j * blockDim.x + threadIdx.x); - if (idx < len) + if (idx < num_items) { // we have to unwrap Thrust's proxy references here for backward compatibility (try zip_iterator.cu test) out[idx] = f(THRUST_NS_QUALIFIER::raw_reference_cast(ins[idx])...); @@ -167,7 +167,7 @@ template _CCCL_DEVICE void transform_kernel_impl( ::cuda::std::integral_constant, - Offset len, + Offset num_items, int, F f, RandomAccessIteartorOut out, @@ -184,7 +184,7 @@ _CCCL_DEVICE void transform_kernel_impl( for (int j = 0; j < items_per_thread; ++j) { const auto idx = offset + (j * block_dim + threadIdx.x); - if (idx < len) + if (idx < num_items) { // TODO(bgruber): replace by fold over comma in C++17 int dummy[] = {(arrays[j] = ins[idx], 0)..., 0}; // extra zero to handle empty packs @@ -196,7 +196,7 @@ _CCCL_DEVICE void transform_kernel_impl( for (int j = 0; j < items_per_thread; ++j) { const auto idx = offset + (j * block_dim + threadIdx.x); - if (idx < len) + if (idx < num_items) { out[idx] = f(arrays[j]...); } @@ -217,36 +217,34 @@ struct async_copy_policy_t // TODO(bgruber) cheap copy of ::cuda::std::apply, which requires C++17. 
template _CCCL_DEVICE auto poor_apply_impl(F&& f, Tuple&& t, ::cuda::std::index_sequence) - -> decltype(std::forward(f)(::cuda::std::get(std::forward(t))...)) + -> decltype(::cuda::std::forward(f)(::cuda::std::get(::cuda::std::forward(t))...)) { - return std::forward(f)(::cuda::std::get(std::forward(t))...); + return ::cuda::std::forward(f)(::cuda::std::get(::cuda::std::forward(t))...); } template _CCCL_DEVICE auto poor_apply(F&& f, Tuple&& t) -> decltype(poor_apply_impl( - std::forward(f), - std::forward(t), + ::cuda::std::forward(f), + ::cuda::std::forward(t), ::cuda::std::make_index_sequence<::cuda::std::tuple_size<::cuda::std::__libcpp_remove_reference_t>::value>{})) { return poor_apply_impl( - std::forward(f), - std::forward(t), + ::cuda::std::forward(f), + ::cuda::std::forward(t), ::cuda::std::make_index_sequence<::cuda::std::tuple_size<::cuda::std::__libcpp_remove_reference_t>::value>{}); } // mult must be a power of 2 _CCCL_HOST_DEVICE inline auto round_up_to_multiple(int x, int mult) -> int { - const auto p = cuda::std::popcount(static_cast(mult)); - assert(p == 1); - (void) p; + assert(::cuda::std::popcount(static_cast(mult)) == 1); return (x + mult - 1) & ~(mult - 1); } // TODO(bgruber): inline this as lambda in C++14 template -_CCCL_DEVICE T* copy_and_return_smem_dst( +_CCCL_DEVICE const T* copy_and_return_smem_dst( cooperative_groups::thread_block& group, int tile_size, char* smem, int& smem_offset, int global_offset, const T* ptr) { // using T = ::cuda::std::__remove_const_t<::cuda::std::__remove_pointer_t>; @@ -261,7 +259,7 @@ _CCCL_DEVICE T* copy_and_return_smem_dst( template _CCCL_DEVICE void transform_kernel_impl( ::cuda::std::integral_constant, - Offset len, + Offset num_items, int num_elem_per_thread, F f, RandomAccessIteartorOut out, @@ -270,14 +268,14 @@ _CCCL_DEVICE void transform_kernel_impl( extern __shared__ char smem[]; const Offset tile_stride = blockDim.x * num_elem_per_thread; - const Offset global_offset = std::size_t{blockIdx.x} * tile_stride; - const int tile_size = ::cuda::std::min(len - global_offset, tile_stride); + const Offset global_offset = static_cast(blockIdx.x) * tile_stride; + const int tile_size = ::cuda::std::min(num_items - global_offset, tile_stride); auto group = cooperative_groups::this_thread_block(); // TODO(bgruber): if we pass block size as template parameter, we could compute the smem offsets at compile time int smem_offset = 0; - const auto smem_ptrs = ::cuda::std::tuple{ + const auto smem_ptrs = ::cuda::std::tuple{ copy_and_return_smem_dst(group, tile_size, smem, smem_offset, global_offset, pointers)...}; cooperative_groups::wait(group); (void) smem_ptrs; // suppress unused warning for MSVC @@ -288,7 +286,7 @@ _CCCL_DEVICE void transform_kernel_impl( { const int smem_idx = i * blockDim.x + threadIdx.x; const Offset gmem_idx = global_offset + smem_idx; - if (gmem_idx < len) + if (gmem_idx < num_items) { out[gmem_idx] = poor_apply( [&](const InTs*... 
smem_base_ptrs) { @@ -299,21 +297,7 @@ _CCCL_DEVICE void transform_kernel_impl( } } -_CCCL_DEVICE inline bool elect_sync(const std::uint32_t& membermask) -{ - std::uint32_t is_elected; - asm volatile( - "{\n\t .reg .pred P_OUT; \n\t" - "elect.sync _|P_OUT, %1;\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(is_elected) - : "r"(membermask) - :); - return static_cast(is_elected); -} - -_CCCL_HOST_DEVICE constexpr uint32_t round_up_16(std::uint32_t x) +_CCCL_HOST_DEVICE constexpr uint32_t round_up_16(::cuda::std::uint32_t x) { return (x + 15) & ~15; } @@ -321,14 +305,15 @@ _CCCL_HOST_DEVICE constexpr uint32_t round_up_16(std::uint32_t x) template _CCCL_HOST_DEVICE T* round_down_ptr_128(const T* ptr) { - constexpr auto mask = ~std::uintptr_t{128 - 1}; - return reinterpret_cast(reinterpret_cast(ptr) & mask); + constexpr auto mask = ~::cuda::std::uintptr_t{128 - 1}; + return reinterpret_cast(reinterpret_cast<::cuda::std::uintptr_t>(ptr) & mask); } template _CCCL_HOST_DEVICE int offset_to_aligned_ptr_128(const T* ptr) { - return static_cast((reinterpret_cast(ptr) & std::uintptr_t{128 - 1}) / sizeof(T)); + return static_cast( + (reinterpret_cast<::cuda::std::uintptr_t>(ptr) & ::cuda::std::uintptr_t{128 - 1}) / sizeof(T)); } template @@ -340,17 +325,6 @@ struct ptr_set uint32_t base_offset; // Offset in number of bytes of T in ptr in ptr_base, i.e. ptr == &ptr_base[base_offset] }; -template -_CCCL_HOST_DEVICE ptr_set make_ublkcp_ptr_set(T* ptr) -{ - return ptr_set{ - round_down_ptr_128(ptr), - round_up_16(offset_to_aligned_ptr_128(ptr) * sizeof(T)), - // (ptr - ptr_base) * sizeof(T) rounded up to nearest multiple of 16 - static_cast(sizeof(T) * (ptr - round_down_ptr_128(ptr))), - }; -} - // TODO(bgruber): inline this as lambda in C++14 template _CCCL_DEVICE void copy_ptr_set( @@ -391,7 +365,7 @@ fetch_operand(uint32_t tile_stride, const char* smem, int& smem_offset, int smem template _CCCL_DEVICE void transform_kernel_impl( ::cuda::std::integral_constant, - Offset len, + Offset num_items, int num_elem_per_thread, F f, RandomAccessIteartorOut out, @@ -401,25 +375,17 @@ _CCCL_DEVICE void transform_kernel_impl( __shared__ uint64_t bar; extern __shared__ char __attribute((aligned(128))) smem[]; - namespace ptx = cuda::ptx; + namespace ptx = ::cuda::ptx; const int tile_stride = blockDim.x * num_elem_per_thread; const Offset global_offset = static_cast(blockIdx.x) * tile_stride; - const bool elected = elect_sync(~0); - - if (threadIdx.x < 32 && elected) - { + cooperative_groups::invoke_one(cooperative_groups::this_thread_block(), [&]() { // Then initialize barriers - ::cuda::ptx::mbarrier_init(&bar, 1); - ::cuda::ptx::fence_proxy_async(::cuda::ptx::space_shared); + ptx::mbarrier_init(&bar, 1); + ptx::fence_proxy_async(ptx::space_shared); - // Compute tile_size (relevant if processing last tile to not read out-of-bounds) - auto tile_size = tile_stride; - if (len < global_offset + tile_stride) - { - tile_size = len - global_offset; - } + const int tile_size = ::cuda::std::min(num_items - global_offset, Offset{tile_stride}); int smem_offset = 0; std::uint32_t total_copied = 0; @@ -433,12 +399,11 @@ _CCCL_DEVICE void transform_kernel_impl( (void) dummy; # endif - ::cuda::ptx::mbarrier_arrive_expect_tx( - ::cuda::ptx::sem_release, ::cuda::ptx::scope_cta, ::cuda::ptx::space_shared, &bar, total_copied); - } + ptx::mbarrier_arrive_expect_tx(ptx::sem_release, ptx::scope_cta, ptx::space_shared, &bar, total_copied); + }); __syncthreads(); - while (!::cuda::ptx::mbarrier_try_wait_parity(&bar, 0)) + while 
(!ptx::mbarrier_try_wait_parity(&bar, 0)) { } // Intentionally use unroll 1. This tends to improve performance. @@ -449,7 +414,7 @@ _CCCL_DEVICE void transform_kernel_impl( // sub-expression elimination logic is smart enough to remove the redundant computations. const int smem_idx = j * blockDim.x + threadIdx.x; const Offset g_idx = global_offset + j * blockDim.x + threadIdx.x; - if (g_idx < len) + if (g_idx < num_items) { int smem_offset = 0; out[g_idx] = f(fetch_operand(tile_stride, smem, smem_offset, smem_idx, pointers)...); @@ -521,12 +486,16 @@ template __launch_bounds__(MaxPolicy::ActivePolicy::algo_policy::BLOCK_THREADS) CUB_DETAIL_KERNEL_ATTRIBUTES void transform_kernel( - Offset len, int num_elem_per_thread, F f, RandomAccessIteartorOut out, kernel_arg... ins) + Offset num_items, + int num_elem_per_thread, + F f, + RandomAccessIteartorOut out, + kernel_arg... ins) { constexpr auto alg = ::cuda::std::integral_constant{}; transform_kernel_impl( alg, - len, + num_items, num_elem_per_thread, ::cuda::std::move(f), ::cuda::std::move(out), @@ -650,8 +619,33 @@ struct PoorExpected { return reinterpret_cast(storage); } + + _CCCL_HOST_DEVICE const T& operator*() const + { + return reinterpret_cast(storage); + } }; +_CCCL_HOST_DEVICE inline PoorExpected get_max_shared_memory() +{ + // gevtushenko promised me that I can assume that stream belongs to the currently active device + int device = 0; + auto error = CubDebug(cudaGetDevice(&device)); + if (error != cudaSuccess) + { + return error; + } + + int max_smem = 0; + error = CubDebug(cudaDeviceGetAttribute(&max_smem, cudaDevAttrMaxSharedMemoryPerBlock, device)); + if (error != cudaSuccess) + { + return error; + } + + return max_smem; +} + template { - Offset num_items; ::cuda::std::tuple in; RandomAccessIteratorOut out; + Offset num_items; TransformOp op; cudaStream_t stream; - int max_smem; using kernel_ptr_t = decltype(&transform_kernel max_smem) + if (smem_size > *max_smem) { break; } @@ -769,6 +768,12 @@ struct dispatch_t max_smem) + if (smem_size > *max_smem) { break; } @@ -920,7 +925,7 @@ struct dispatch_t(in)[i]...); } @@ -945,9 +950,9 @@ struct dispatch_t in, RandomAccessIteratorOut out, + Offset num_items, TransformOp op, cudaStream_t stream) { @@ -963,29 +968,13 @@ struct dispatch_t...>; - dispatch_t dispatch{ - num_items, ::cuda::std::move(in), ::cuda::std::move(out), ::cuda::std::move(op), stream, max_smem, kernel}; + dispatch_t dispatch{::cuda::std::move(in), ::cuda::std::move(out), num_items, ::cuda::std::move(op), stream, kernel}; return CubDebug(PolicyHub::max_policy::Invoke(ptx_version, dispatch)); } }; diff --git a/cub/test/CMakeLists.txt b/cub/test/CMakeLists.txt index 0f5dce13481..9907db9db4e 100644 --- a/cub/test/CMakeLists.txt +++ b/cub/test/CMakeLists.txt @@ -261,8 +261,8 @@ function(cub_add_test target_name_var test_name test_src cub_target launcher_id) target_compile_options(${test_target} PRIVATE -ftemplate-depth=1000) # for handling large type lists endif() - # TODO(bgruber): maybe we want to enable lambdas for all API examples - if ("${test_target}" MATCHES "test.device_transform_api") + # enable lambdas for all API examples + if ("${test_target}" MATCHES "test.[A-Za-z0-9_]+_api") target_compile_options(${test_target} PRIVATE --extended-lambda) endif() diff --git a/cub/test/catch2_test_device_transform.cu b/cub/test/catch2_test_device_transform.cu index 8255fb153e6..8a07146d49c 100644 --- a/cub/test/catch2_test_device_transform.cu +++ b/cub/test/catch2_test_device_transform.cu @@ -51,9 +51,9 @@ template inputs, 
RandomAccessIteartorOut output, + int num_items, TransformOp transform_op, cudaStream_t stream = nullptr) { @@ -70,7 +70,7 @@ CUB_RUNTIME_FUNCTION static cudaError_t transform_many_with_alg_entry_point( RandomAccessIteartorOut, TransformOp, policy_hub_for_alg>{} - .dispatch(num_items, inputs, output, transform_op, stream); + .dispatch(inputs, output, num_items, transform_op, stream); } DECLARE_LAUNCH_WRAPPER(cub::DeviceTransform::Transform, transform_many); @@ -125,7 +125,7 @@ CUB_TEST("DeviceTransform::Transform BabelStream add", c2h::device_vector result(num_items); transform_many_with_alg( - num_items, ::cuda::std::make_tuple(a.begin(), b.begin()), result.begin(), ::cuda::std::plus{}); + ::cuda::std::make_tuple(a.begin(), b.begin()), result.begin(), num_items, ::cuda::std::plus{}); // compute reference and verify c2h::host_vector a_h = a; @@ -173,7 +173,7 @@ CUB_TEST("DeviceTransform::Transform BabelStream nstream", c2h::host_vector c_h = c; transform_many_with_alg( - num_items, ::cuda::std::make_tuple(a.begin(), b.begin(), c.begin()), a.begin(), nstream_kernel{}); + ::cuda::std::make_tuple(a.begin(), b.begin(), c.begin()), a.begin(), num_items, nstream_kernel{}); // compute reference and verify auto z = thrust::make_zip_iterator(a_h.begin(), b_h.begin(), c_h.begin()); @@ -204,7 +204,7 @@ CUB_TEST("DeviceTransform::Transform add five streams", "[device][device_transfo c2h::device_vector result(num_items); transform_many_with_alg( - num_items, ::cuda::std::make_tuple(a.begin(), b.begin(), c.begin(), d.begin(), e.begin()), result.begin(), Sum{}); + ::cuda::std::make_tuple(a.begin(), b.begin(), c.begin(), d.begin(), e.begin()), result.begin(), num_items, Sum{}); // compute reference and verify c2h::device_vector reference(num_items, 1 + 2 + 3 + 4 + 5); @@ -223,7 +223,7 @@ CUB_TEST("DeviceTransform::Transform no streams", "[device][device_transform]") { constexpr int num_items = 100; c2h::device_vector result(num_items); - transform_many(num_items, ::cuda::std::tuple<>{}, result.begin(), GiveMeFive{}); + transform_many(::cuda::std::tuple<>{}, result.begin(), num_items, GiveMeFive{}); // compute reference and verify c2h::device_vector reference(num_items, 5); @@ -239,7 +239,7 @@ CUB_TEST("DeviceTransform::Transform fancy input iterator types", "[device][devi thrust::counting_iterator b{10}; c2h::device_vector result(num_items); - transform_many(num_items, ::cuda::std::make_tuple(a, b), result.begin(), ::cuda::std::plus{}); + transform_many(::cuda::std::make_tuple(a, b), result.begin(), num_items, ::cuda::std::plus{}); // compute reference and verify c2h::host_vector reference_h(num_items); @@ -258,7 +258,7 @@ CUB_TEST("DeviceTransform::Transform fancy output iterator type", "[device][devi c2h::device_vector a(num_items, 10); c2h::device_vector b(num_items, 10); transform_many_with_alg( - num_items, ::cuda::std::make_tuple(a.begin(), b.end()), thrust::discard_iterator<>{}, ::cuda::std::plus{}); + ::cuda::std::make_tuple(a.begin(), b.end()), thrust::discard_iterator<>{}, num_items, ::cuda::std::plus{}); } CUB_TEST("DeviceTransform::Transform mixed input iterator types", "[device][device_transform]") @@ -270,7 +270,7 @@ CUB_TEST("DeviceTransform::Transform mixed input iterator types", "[device][devi c2h::device_vector b(num_items, 10); c2h::device_vector result(num_items); - transform_many(num_items, ::cuda::std::make_tuple(a, b.begin()), result.begin(), ::cuda::std::plus{}); + transform_many(::cuda::std::make_tuple(a, b.begin()), result.begin(), num_items, ::cuda::std::plus{}); // compute 
reference and verify c2h::host_vector b_h = b; @@ -303,9 +303,9 @@ CUB_TEST("DeviceTransform::Transform address stability", "[device][device_transf c2h::device_vector result(num_items); transform_many_stable( - num_items, ::cuda::std::make_tuple(thrust::raw_pointer_cast(a.data())), result.begin(), + num_items, plus_needs_stable_address{thrust::raw_pointer_cast(a.data()), thrust::raw_pointer_cast(b.data())}); // compute reference and verify @@ -359,7 +359,7 @@ CUB_TEST("DeviceTransform::Transform not trivially relocatable", "[device][devic constexpr int num_items = 100; c2h::device_vector input(num_items, non_trivial{42}); c2h::device_vector result(num_items); - transform_many(num_items, ::cuda::std::make_tuple(input.begin()), result.begin(), ::cuda::std::negate<>{}); + transform_many(::cuda::std::make_tuple(input.begin()), result.begin(), num_items, ::cuda::std::negate<>{}); const auto reference = c2h::device_vector(num_items, non_trivial{-42}); REQUIRE((reference == result)); @@ -375,9 +375,9 @@ CUB_TEST("DeviceTransform::Transform buffer start alignment", const int offset = GENERATE(1, 2, 4, 8, 16, 32, 64, 128); // global memory is always at least 256 byte aligned c2h::device_vector input(num_items, 42); c2h::device_vector result(num_items); - transform_many(num_items - offset, - ::cuda::std::make_tuple(input.begin() + offset), + transform_many(::cuda::std::make_tuple(input.begin() + offset), result.begin() + offset, + num_items - offset, ::cuda::std::negate<>{}); auto reference = c2h::device_vector(num_items); @@ -431,7 +431,7 @@ CUB_TEST("DeviceTransform::Transform heavy functor", // c2h::gen(CUB_SEED(1), input, 1, 10); c2h::device_vector result(num_items); transform_many_with_alg( - num_items, ::cuda::std::make_tuple(input.begin()), result.begin(), heavy_functor{}); + ::cuda::std::make_tuple(input.begin()), result.begin(), num_items, heavy_functor{}); // compute reference and verify c2h::host_vector input_h = input; diff --git a/cub/test/catch2_test_device_transform_api.cu b/cub/test/catch2_test_device_transform_api.cu index 2d75454cc19..a2a3e022f31 100644 --- a/cub/test/catch2_test_device_transform_api.cu +++ b/cub/test/catch2_test_device_transform_api.cu @@ -22,7 +22,7 @@ void test_transform_api() thrust::device_vector result(num_items); cub::DeviceTransform::Transform( - num_items, ::cuda::std::make_tuple(input1.begin(), input2.begin(), input3), result.begin(), op); + ::cuda::std::make_tuple(input1.begin(), input2.begin(), input3), result.begin(), num_items, op); thrust::host_vector expected{520, 111, 397, 618}; // example-end transform-many @@ -50,7 +50,7 @@ void test_transform_stable_api() thrust::device_vector result(num_items); cub::DeviceTransform::TransformStableArgumentAddresses( - num_items, ::cuda::std::make_tuple(input1.begin(), input3), result.begin(), op); + ::cuda::std::make_tuple(input1.begin(), input3), result.begin(), num_items, op); thrust::host_vector expected{520, 111, 397, 618}; // example-end transform-many-stable diff --git a/thrust/thrust/system/cuda/detail/transform.h b/thrust/thrust/system/cuda/detail/transform.h index 60cf4037195..624a221144d 100644 --- a/thrust/thrust/system/cuda/detail/transform.h +++ b/thrust/thrust/system/cuda/detail/transform.h @@ -242,9 +242,9 @@ _CCCL_EXEC_CHECK_DISABLE template OutputIt THRUST_FUNCTION cub_transform_many( execution_policy& policy, - Offset num_items, ::cuda::std::tuple firsts, OutputIt result, + Offset num_items, TransformOp transform_op) { if (num_items == 0) @@ -266,7 +266,7 @@ OutputIt THRUST_FUNCTION 
cub_transform_many( dispatch32_t::dispatch, dispatch64_t::dispatch, num_items, - (num_items_fixed, firsts, result, transform_op, cuda_cub::stream(policy))); + (firsts, result, num_items_fixed, transform_op, cuda_cub::stream(policy))); throw_on_error(status, "transform: failed inside CUB"); status = cuda_cub::synchronize_optional(policy); @@ -286,16 +286,16 @@ convert_to_std_tuple(tuple t, ::cuda::std::index_sequence) -> ::cu template OutputIt THRUST_FUNCTION cub_transform_many( execution_policy& policy, - Offset num_items, ::cuda::std::tuple>> firsts, OutputIt result, + Offset num_items, zip_function transform_op) { return cub_transform_many( policy, - num_items, convert_to_std_tuple(get<0>(firsts).get_iterator_tuple(), ::cuda::std::index_sequence_for{}), result, + num_items, transform_op.underlying_function()); } } // namespace __transform @@ -342,7 +342,7 @@ transform(execution_policy& policy, InputIt first, InputIt last, Output THRUST_CDP_DISPATCH( (using size_type = typename iterator_traits::difference_type; const auto num_items = static_cast(thrust::distance(first, last)); - return __transform::cub_transform_many(policy, num_items, ::cuda::std::make_tuple(first), result, transform_op);), + return __transform::cub_transform_many(policy, ::cuda::std::make_tuple(first), result, num_items, transform_op);), (while (first != last) { *result = transform_op(raw_reference_cast(*first)); ++first; @@ -389,7 +389,7 @@ OutputIt THRUST_FUNCTION transform( (using size_type = typename iterator_traits::difference_type; const auto num_items = static_cast(thrust::distance(first1, last1)); return __transform::cub_transform_many( - policy, num_items, ::cuda::std::make_tuple(first1, first2), result, transform_op);), + policy, ::cuda::std::make_tuple(first1, first2), result, num_items, transform_op);), (while (first1 != last1) { *result = transform_op(raw_reference_cast(*first1), raw_reference_cast(*first2)); ++first1;
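For orientation, a hedged sketch of how these Thrust entry points are exercised from user code once they forward to cub_transform_many with the reordered arguments (sizes and values illustrative):

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform.h>

void thrust_example()
{
  thrust::device_vector<float> a(1024, 1.f), b(1024, 2.f), c(1024);
  // Unary overload: lowers to cub_transform_many(policy, make_tuple(first), result, num_items, op)
  thrust::transform(a.begin(), a.end(), c.begin(), thrust::negate<float>{});
  // Binary overload: lowers to cub_transform_many(policy, make_tuple(first1, first2), result, num_items, op)
  thrust::transform(a.begin(), a.end(), b.begin(), c.begin(), thrust::plus<float>{});
}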