Apply CUB feedback from gevtushenko

NVIDIA · Aug 19, 2024 · 33754cb · 33754cb
1 parent eeb0202
commit 33754cb
Show file tree

Hide file tree

Showing 7 changed files with 237 additions and 206 deletions.
diff --git a/cub/benchmarks/bench/transform/babelstream.cu b/cub/benchmarks/bench/transform/babelstream.cu
@@ -72,9 +72,9 @@ auto array_size_powers = nvbench::range(16, 28, 4);
 template <typename OffsetT, typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename TransformOp>
 CUB_RUNTIME_FUNCTION static void bench_transform(
   nvbench::state& state,
-  OffsetT count,
   ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
   RandomAccessIteratorOut output,
+  OffsetT num_items,
   TransformOp transform_op)
 {
 #if !TUNE_BASE
@@ -84,12 +84,13 @@ CUB_RUNTIME_FUNCTION static void bench_transform(
 #endif
 
   state.exec(nvbench::exec_tag::no_batch, [&](const nvbench::launch& launch) {
-    cub::detail::transform::dispatch_t<false,
-                                       OffsetT,
-                                       ::cuda::std::tuple<RandomAccessIteratorsIn...>,
-                                       RandomAccessIteratorOut,
-                                       TransformOp,
-                                       policy_t>::dispatch(count, inputs, output, transform_op, launch.get_stream());
+    cub::detail::transform::dispatch_t<
+      false,
+      OffsetT,
+      ::cuda::std::tuple<RandomAccessIteratorsIn...>,
+      RandomAccessIteratorOut,
+      TransformOp,
+      policy_t>::dispatch(inputs, output, num_items, transform_op, launch.get_stream());
   });
 }
 
@@ -111,7 +112,7 @@ static void mul(nvbench::state& state, nvbench::type_list<T>)
   state.add_global_memory_writes<T>(n);
 
   const T scalar = startScalar;
-  bench_transform(state, n, ::cuda::std::tuple{c.begin()}, b.begin(), [=] __device__ __host__(const T& ci) {
+  bench_transform(state, ::cuda::std::tuple{c.begin()}, b.begin(), n, [=] __device__ __host__(const T& ci) {
     return ci * scalar;
   });
 }
@@ -133,7 +134,7 @@ static void add(nvbench::state& state, nvbench::type_list<T>)
   state.add_global_memory_reads<T>(2 * n);
   state.add_global_memory_writes<T>(n);
   bench_transform(
-    state, n, ::cuda::std::tuple{a.begin(), b.begin()}, c.begin(), [] _CCCL_DEVICE(const T& ai, const T& bi) -> T {
+    state, ::cuda::std::tuple{a.begin(), b.begin()}, c.begin(), n, [] _CCCL_DEVICE(const T& ai, const T& bi) -> T {
       return ai + bi;
     });
 }
@@ -156,7 +157,7 @@ static void triad(nvbench::state& state, nvbench::type_list<T>)
   state.add_global_memory_writes<T>(n);
   const T scalar = startScalar;
   bench_transform(
-    state, n, ::cuda::std::tuple{b.begin(), c.begin()}, a.begin(), [=] _CCCL_DEVICE(const T& bi, const T& ci) {
+    state, ::cuda::std::tuple{b.begin(), c.begin()}, a.begin(), n, [=] _CCCL_DEVICE(const T& bi, const T& ci) {
       return bi + scalar * ci;
     });
 }
@@ -180,9 +181,9 @@ static void nstream(nvbench::state& state, nvbench::type_list<T>)
   const T scalar = startScalar;
   bench_transform(
     state,
-    n,
     ::cuda::std::tuple{a.begin(), b.begin(), c.begin()},
     a.begin(),
+    n,
     [=] _CCCL_DEVICE(const T& ai, const T& bi, const T& ci) {
       return ai + bi + scalar * ci;
     });
@@ -207,7 +208,7 @@ static void heavy(nvbench::state& state)
   state.add_element_count(n);
   state.add_global_memory_reads<T>(n);
   state.add_global_memory_writes<T>(n);
-  bench_transform(state, n, ::cuda::std::tuple{in.begin()}, out.begin(), [=] _CCCL_DEVICE(T data) {
+  bench_transform(state, ::cuda::std::tuple{in.begin()}, out.begin(), n, [=] _CCCL_DEVICE(T data) {
     T reg[heavyness];
     reg[0] = data;
     for (int i = 1; i < heavyness; ++i)

diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh
@@ -22,58 +22,60 @@
 CUB_NAMESPACE_BEGIN
 
 //! DeviceTransform provides device-wide, parallel operations for transforming elements tuple-wise from multiple input
-//! streams into an output stream.
+//! sequences into an output sequence.
 struct DeviceTransform
 {
-  // Many input streams, one output stream
-
-  /// @rst
-  /// Overview
-  /// +++++++++++++++++++++++++++++++++++++++++++++
-  /// Transforms many input streams into one output stream, by applying a transformation operation on corresponding
-  /// input elements and writing the result to the corresponding output element. No guarantee is given on the identity
-  /// (i.e. address) of the objects passed to the call operator of the transformation operation.
-  ///
-  /// A Simple Example
-  /// +++++++++++++++++++++++++++++++++++++++++++++
-  ///
-  /// .. literalinclude:: ../../../cub/test/catch2_test_device_transform_api.cu
-  ///     :language: c++
-  ///     :dedent:
-  ///     :start-after: example-begin transform-many
-  ///     :end-before: example-end transform-many
-  ///
-  /// @endrst
-  ///
-  /// @param count The number of elements in each input stream.
-  /// @param inputs A tuple of iterators to the input streams where count elements are read from each. The iterators'
-  /// value types must be trivially relocatable.
-  /// @param output An iterator to the output stream where count results are written to.
-  /// @param transform_op An n-ary function object, where n is the number of input streams. The input iterators' value
-  /// types must be convertible to the parameters of the function object's call operator. The return type of the call
-  /// operator must be assignable to the dereferenced output iterator.
+  //! @rst
+  //! Overview
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //! Transforms many input sequences into one output sequence, by applying a transformation operation on corresponding
+  //! input elements and writing the result to the corresponding output element. No guarantee is given on the identity
+  //! (i.e. address) of the objects passed to the call operator of the transformation operation.
+  //!
+  //! A Simple Example
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_transform_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin transform-many
+  //!     :end-before: example-end transform-many
+  //!
+  //! @endrst
+  //!
+  //! @param inputs A tuple of iterators to the input sequences where num_items elements are read from each. The
+  //! iterators' value types must be trivially relocatable.
+  //! @param output An iterator to the output sequence where num_items results are written to.
+  //! @param num_items The number of elements in each input sequence.
+  //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
+  //! types must be convertible to the parameters of the function object's call operator. The return type of the call
+  //! operator must be assignable to the dereferenced output iterator.
+  //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   template <typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename TransformOp>
   CUB_RUNTIME_FUNCTION static cudaError_t Transform(
-    int count,
     ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
     RandomAccessIteratorOut output,
+    int num_items,
     TransformOp transform_op,
     cudaStream_t stream = nullptr)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Transform");
     return detail::transform::
       dispatch_t<false, int, ::cuda::std::tuple<RandomAccessIteratorsIn...>, RandomAccessIteratorOut, TransformOp>::
-        dispatch(count, ::cuda::std::move(inputs), ::cuda::std::move(output), ::cuda::std::move(transform_op), stream);
+        dispatch(
+          ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream);
   }
 
-  // temp storage overload
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+  // This overload has additional parameters to specify temporary storage. Provided for compatibility with other CUB
+  // APIs.
   template <typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename TransformOp>
   CUB_RUNTIME_FUNCTION static cudaError_t Transform(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
-    int count,
     ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
     RandomAccessIteratorOut output,
+    int num_items,
     TransformOp transform_op,
     cudaStream_t stream = nullptr)
   {
@@ -84,35 +86,50 @@ struct DeviceTransform
     }
 
     return Transform(
-      count, ::cuda::std::move(inputs), ::cuda::std::move(output), ::cuda::std::move(transform_op), stream);
+      ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream);
   }
+#endif // DOXYGEN_SHOULD_SKIP_THIS
 
-  // One input stream, one output stream
-
+  //! @rst
+  //! Transforms one input sequence into one output sequence, by applying a transformation operation on corresponding
+  //! input elements and writing the result to the corresponding output element. No guarantee is given on the identity
+  //! (i.e. address) of the objects passed to the call operator of the transformation operation.
+  //! @endrst
+  //!
+  //! @param input An iterator to the input sequence where num_items elements are read from. The iterator's value type
+  //! must be trivially relocatable.
+  //! @param output An iterator to the output sequence where num_items results are written to.
+  //! @param num_items The number of elements in each input sequence.
+  //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
+  //! types must be convertible to the parameters of the function object's call operator. The return type of the call
+  //! operator must be assignable to the dereferenced output iterator.
+  //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   template <typename RandomAccessIteratorIn, typename RandomAccessIteratorOut, typename TransformOp>
   CUB_RUNTIME_FUNCTION static cudaError_t Transform(
-    int count,
     RandomAccessIteratorIn input,
     RandomAccessIteratorOut output,
+    int num_items,
     TransformOp transform_op,
     cudaStream_t stream = nullptr)
   {
     return Transform(
-      count,
       ::cuda::std::make_tuple(::cuda::std::move(input)),
       ::cuda::std::move(output),
+      num_items,
       ::cuda::std::move(transform_op),
       stream);
   }
 
-  // temp storage overload
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+  // This overload has additional parameters to specify temporary storage. Provided for compatibility with other CUB
+  // APIs.
   template <typename RandomAccessIteratorIn, typename RandomAccessIteratorOut, typename TransformOp>
   CUB_RUNTIME_FUNCTION static cudaError_t Transform(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
-    int count,
     RandomAccessIteratorIn input,
     RandomAccessIteratorOut output,
+    int num_items,
     TransformOp transform_op,
     cudaStream_t stream = nullptr)
   {
@@ -123,53 +140,63 @@ struct DeviceTransform
     }
 
     return Transform(
-      count,
       ::cuda::std::make_tuple(::cuda::std::move(input)),
       ::cuda::std::move(output),
+      num_items,
       ::cuda::std::move(transform_op),
       stream);
   }
+#endif // DOXYGEN_SHOULD_SKIP_THIS
 
-  // Many input streams, one output stream, address stable
-
-  /// @rst
-  /// Overview
-  /// +++++++++++++++++++++++++++++++++++++++++++++
-  /// Like \ref cub::DeviceTransform::Transform, but the objects passed to the call operator of the transformation
-  /// operation are guaranteed to reside in the input streams and are never copied.
-  ///
-  /// A Simple Example
-  /// +++++++++++++++++++++++++++++++++++++++++++++
-  ///
-  /// .. literalinclude:: ../../../cub/test/catch2_test_device_transform_api.cu
-  ///     :language: c++
-  ///     :dedent:
-  ///     :start-after: example-begin transform-many-stable
-  ///     :end-before: example-end transform-many-stable
-  ///
-  /// @endrst
+  //! @rst
+  //! Overview
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //! Transforms many input sequences into one output sequence, by applying a transformation operation on corresponding
+  //! input elements and writing the result to the corresponding output element. The objects passed to the call operator
+  //! of the transformation operation are guaranteed to reside in the input sequences and are never copied.
+  //!
+  //! A Simple Example
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_transform_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin transform-many-stable
+  //!     :end-before: example-end transform-many-stable
+  //!
+  //! @endrst
+  //!
+  //! @param inputs A tuple of iterators to the input sequences where num_items elements are read from each. The
+  //! iterators' value types must be trivially relocatable.
+  //! @param output An iterator to the output sequence where num_items results are written to.
+  //! @param num_items The number of elements in each input sequence.
+  //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
+  //! types must be convertible to the parameters of the function object's call operator. The return type of the call
+  //! operator must be assignable to the dereferenced output iterator.
+  //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   template <typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename TransformOp>
   CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses(
-    int count,
     ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
     RandomAccessIteratorOut output,
+    int num_items,
     TransformOp transform_op,
     cudaStream_t stream = nullptr)
   {
-    CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Transform");
+    CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceTransform::TransformStableArgumentAddresses");
     return detail::transform::
       dispatch_t<true, int, ::cuda::std::tuple<RandomAccessIteratorsIn...>, RandomAccessIteratorOut, TransformOp>::
-        dispatch(count, ::cuda::std::move(inputs), ::cuda::std::move(output), ::cuda::std::move(transform_op), stream);
+        dispatch(
+          ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream);
   }
 
-  // temp storage overload
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
   template <typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename TransformOp>
   CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
-    int count,
     ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
     RandomAccessIteratorOut output,
+    int num_items,
     TransformOp transform_op,
     cudaStream_t stream = nullptr)
   {
@@ -180,35 +207,48 @@ struct DeviceTransform
     }
 
     return TransformStableArgumentAddresses(
-      count, ::cuda::std::move(inputs), ::cuda::std::move(output), ::cuda::std::move(transform_op), stream);
+      ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream);
   }
+#endif // DOXYGEN_SHOULD_SKIP_THIS
 
-  // One input stream, one output stream
-
+  //! @rst
+  //! Transforms one input sequence into one output sequence, by applying a transformation operation on corresponding
+  //! input elements and writing the result to the corresponding output element. The objects passed to the call operator
+  //! of the transformation operation are guaranteed to reside in the input sequences and are never copied.
+  //! @endrst
+  //!
+  //! @param input An iterator to the input sequence where num_items elements are read from. The iterator's value type
+  //! must be trivially relocatable.
+  //! @param output An iterator to the output sequence where num_items results are written to.
+  //! @param num_items The number of elements in each input sequence.
+  //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
+  //! types must be convertible to the parameters of the function object's call operator. The return type of the call
+  //! operator must be assignable to the dereferenced output iterator.
+  //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   template <typename RandomAccessIteratorIn, typename RandomAccessIteratorOut, typename TransformOp>
   CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses(
-    int count,
-    RandomAccessIteratorIn inputs,
+    RandomAccessIteratorIn input,
     RandomAccessIteratorOut output,
+    int num_items,
     TransformOp transform_op,
     cudaStream_t stream = nullptr)
   {
     return TransformStableArgumentAddresses(
-      count,
-      ::cuda::std::make_tuple(::cuda::std::move(inputs)),
+      ::cuda::std::make_tuple(::cuda::std::move(input)),
       ::cuda::std::move(output),
+      num_items,
       ::cuda::std::move(transform_op),
       stream);
   }
 
-  // temp storage overload
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
   template <typename RandomAccessIteratorIn, typename RandomAccessIteratorOut, typename TransformOp>
   CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
-    int count,
-    RandomAccessIteratorIn inputs,
+    RandomAccessIteratorIn input,
     RandomAccessIteratorOut output,
+    int num_items,
     TransformOp transform_op,
     cudaStream_t stream = nullptr)
   {
@@ -219,12 +259,13 @@ struct DeviceTransform
     }
 
     return TransformStableArgumentAddresses(
-      count,
-      ::cuda::std::make_tuple(::cuda::std::move(inputs)),
+      ::cuda::std::make_tuple(::cuda::std::move(input)),
       ::cuda::std::move(output),
+      num_items,
       ::cuda::std::move(transform_op),
       stream);
   }
+#endif // DOXYGEN_SHOULD_SKIP_THIS
 };
 
 CUB_NAMESPACE_END