Skip to content

Commit

Permalink
Apply CUB feedback from gevtushenko
Browse files Browse the repository at this point in the history
  • Loading branch information
bernhardmgruber committed Aug 19, 2024
1 parent eeb0202 commit 33754cb
Show file tree
Hide file tree
Showing 7 changed files with 237 additions and 206 deletions.
25 changes: 13 additions & 12 deletions cub/benchmarks/bench/transform/babelstream.cu
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,9 @@ auto array_size_powers = nvbench::range(16, 28, 4);
template <typename OffsetT, typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename TransformOp>
CUB_RUNTIME_FUNCTION static void bench_transform(
nvbench::state& state,
OffsetT count,
::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
RandomAccessIteratorOut output,
OffsetT num_items,
TransformOp transform_op)
{
#if !TUNE_BASE
Expand All @@ -84,12 +84,13 @@ CUB_RUNTIME_FUNCTION static void bench_transform(
#endif

state.exec(nvbench::exec_tag::no_batch, [&](const nvbench::launch& launch) {
cub::detail::transform::dispatch_t<false,
OffsetT,
::cuda::std::tuple<RandomAccessIteratorsIn...>,
RandomAccessIteratorOut,
TransformOp,
policy_t>::dispatch(count, inputs, output, transform_op, launch.get_stream());
cub::detail::transform::dispatch_t<
false,
OffsetT,
::cuda::std::tuple<RandomAccessIteratorsIn...>,
RandomAccessIteratorOut,
TransformOp,
policy_t>::dispatch(inputs, output, num_items, transform_op, launch.get_stream());
});
}

Expand All @@ -111,7 +112,7 @@ static void mul(nvbench::state& state, nvbench::type_list<T>)
state.add_global_memory_writes<T>(n);

const T scalar = startScalar;
bench_transform(state, n, ::cuda::std::tuple{c.begin()}, b.begin(), [=] __device__ __host__(const T& ci) {
bench_transform(state, ::cuda::std::tuple{c.begin()}, b.begin(), n, [=] __device__ __host__(const T& ci) {
return ci * scalar;
});
}
Expand All @@ -133,7 +134,7 @@ static void add(nvbench::state& state, nvbench::type_list<T>)
state.add_global_memory_reads<T>(2 * n);
state.add_global_memory_writes<T>(n);
bench_transform(
state, n, ::cuda::std::tuple{a.begin(), b.begin()}, c.begin(), [] _CCCL_DEVICE(const T& ai, const T& bi) -> T {
state, ::cuda::std::tuple{a.begin(), b.begin()}, c.begin(), n, [] _CCCL_DEVICE(const T& ai, const T& bi) -> T {
return ai + bi;
});
}
Expand All @@ -156,7 +157,7 @@ static void triad(nvbench::state& state, nvbench::type_list<T>)
state.add_global_memory_writes<T>(n);
const T scalar = startScalar;
bench_transform(
state, n, ::cuda::std::tuple{b.begin(), c.begin()}, a.begin(), [=] _CCCL_DEVICE(const T& bi, const T& ci) {
state, ::cuda::std::tuple{b.begin(), c.begin()}, a.begin(), n, [=] _CCCL_DEVICE(const T& bi, const T& ci) {
return bi + scalar * ci;
});
}
Expand All @@ -180,9 +181,9 @@ static void nstream(nvbench::state& state, nvbench::type_list<T>)
const T scalar = startScalar;
bench_transform(
state,
n,
::cuda::std::tuple{a.begin(), b.begin(), c.begin()},
a.begin(),
n,
[=] _CCCL_DEVICE(const T& ai, const T& bi, const T& ci) {
return ai + bi + scalar * ci;
});
Expand All @@ -207,7 +208,7 @@ static void heavy(nvbench::state& state)
state.add_element_count(n);
state.add_global_memory_reads<T>(n);
state.add_global_memory_writes<T>(n);
bench_transform(state, n, ::cuda::std::tuple{in.begin()}, out.begin(), [=] _CCCL_DEVICE(T data) {
bench_transform(state, ::cuda::std::tuple{in.begin()}, out.begin(), n, [=] _CCCL_DEVICE(T data) {
T reg[heavyness];
reg[0] = data;
for (int i = 1; i < heavyness; ++i)
Expand Down
191 changes: 116 additions & 75 deletions cub/cub/device/device_transform.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -22,58 +22,60 @@
CUB_NAMESPACE_BEGIN

//! DeviceTransform provides device-wide, parallel operations for transforming elements tuple-wise from multiple input
//! streams into an output stream.
//! sequences into an output sequence.
struct DeviceTransform
{
// Many input streams, one output stream

/// @rst
/// Overview
/// +++++++++++++++++++++++++++++++++++++++++++++
/// Transforms many input streams into one output stream, by applying a transformation operation on corresponding
/// input elements and writing the result to the corresponding output element. No guarantee is given on the identity
/// (i.e. address) of the objects passed to the call operator of the transformation operation.
///
/// A Simple Example
/// +++++++++++++++++++++++++++++++++++++++++++++
///
/// .. literalinclude:: ../../../cub/test/catch2_test_device_transform_api.cu
/// :language: c++
/// :dedent:
/// :start-after: example-begin transform-many
/// :end-before: example-end transform-many
///
/// @endrst
///
/// @param count The number of elements in each input stream.
/// @param inputs A tuple of iterators to the input streams where count elements are read from each. The iterators'
/// value types must be trivially relocatable.
/// @param output An iterator to the output stream where count results are written to.
/// @param transform_op An n-ary function object, where n is the number of input streams. The input iterators' value
/// types must be convertible to the parameters of the function object's call operator. The return type of the call
/// operator must be assignable to the dereferenced output iterator.
//! @rst
//! Overview
//! +++++++++++++++++++++++++++++++++++++++++++++
//! Transforms many input sequences into one output sequence, by applying a transformation operation on corresponding
//! input elements and writing the result to the corresponding output element. No guarantee is given on the identity
//! (i.e. address) of the objects passed to the call operator of the transformation operation.
//!
//! A Simple Example
//! +++++++++++++++++++++++++++++++++++++++++++++
//!
//! .. literalinclude:: ../../../cub/test/catch2_test_device_transform_api.cu
//! :language: c++
//! :dedent:
//! :start-after: example-begin transform-many
//! :end-before: example-end transform-many
//!
//! @endrst
//!
//! @param inputs A tuple of iterators to the input sequences where num_items elements are read from each. The
//! iterators' value types must be trivially relocatable.
//! @param output An iterator to the output sequence where num_items results are written to.
//! @param num_items The number of elements in each input sequence.
//! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
//! types must be convertible to the parameters of the function object's call operator. The return type of the call
//! operator must be assignable to the dereferenced output iterator.
//! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
template <typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename TransformOp>
CUB_RUNTIME_FUNCTION static cudaError_t Transform(
int count,
::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
RandomAccessIteratorOut output,
int num_items,
TransformOp transform_op,
cudaStream_t stream = nullptr)
{
CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Transform");
return detail::transform::
dispatch_t<false, int, ::cuda::std::tuple<RandomAccessIteratorsIn...>, RandomAccessIteratorOut, TransformOp>::
dispatch(count, ::cuda::std::move(inputs), ::cuda::std::move(output), ::cuda::std::move(transform_op), stream);
dispatch(
::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream);
}

// temp storage overload
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
// This overload has additional parameters to specify temporary storage. Provided for compatibility with other CUB
// APIs.
template <typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename TransformOp>
CUB_RUNTIME_FUNCTION static cudaError_t Transform(
void* d_temp_storage,
size_t& temp_storage_bytes,
int count,
::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
RandomAccessIteratorOut output,
int num_items,
TransformOp transform_op,
cudaStream_t stream = nullptr)
{
Expand All @@ -84,35 +86,50 @@ struct DeviceTransform
}

return Transform(
count, ::cuda::std::move(inputs), ::cuda::std::move(output), ::cuda::std::move(transform_op), stream);
::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream);
}
#endif // DOXYGEN_SHOULD_SKIP_THIS

// One input stream, one output stream

//! @rst
//! Transforms one input sequence into one output sequence, by applying a transformation operation on corresponding
//! input elements and writing the result to the corresponding output element. No guarantee is given on the identity
//! (i.e. address) of the objects passed to the call operator of the transformation operation.
//! @endrst
//!
//! @param input An iterator to the input sequence where num_items elements are read from. The iterator's value type
//! must be trivially relocatable.
//! @param output An iterator to the output sequence where num_items results are written to.
//! @param num_items The number of elements in each input sequence.
//! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
//! types must be convertible to the parameters of the function object's call operator. The return type of the call
//! operator must be assignable to the dereferenced output iterator.
//! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
template <typename RandomAccessIteratorIn, typename RandomAccessIteratorOut, typename TransformOp>
CUB_RUNTIME_FUNCTION static cudaError_t Transform(
int count,
RandomAccessIteratorIn input,
RandomAccessIteratorOut output,
int num_items,
TransformOp transform_op,
cudaStream_t stream = nullptr)
{
return Transform(
count,
::cuda::std::make_tuple(::cuda::std::move(input)),
::cuda::std::move(output),
num_items,
::cuda::std::move(transform_op),
stream);
}

// temp storage overload
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
// This overload has additional parameters to specify temporary storage. Provided for compatibility with other CUB
// APIs.
template <typename RandomAccessIteratorIn, typename RandomAccessIteratorOut, typename TransformOp>
CUB_RUNTIME_FUNCTION static cudaError_t Transform(
void* d_temp_storage,
size_t& temp_storage_bytes,
int count,
RandomAccessIteratorIn input,
RandomAccessIteratorOut output,
int num_items,
TransformOp transform_op,
cudaStream_t stream = nullptr)
{
Expand All @@ -123,53 +140,63 @@ struct DeviceTransform
}

return Transform(
count,
::cuda::std::make_tuple(::cuda::std::move(input)),
::cuda::std::move(output),
num_items,
::cuda::std::move(transform_op),
stream);
}
#endif // DOXYGEN_SHOULD_SKIP_THIS

// Many input streams, one output stream, address stable

/// @rst
/// Overview
/// +++++++++++++++++++++++++++++++++++++++++++++
/// Like \ref cub::DeviceTransform::Transform, but the objects passed to the call operator of the transformation
/// operation are guaranteed to reside in the input streams and are never copied.
///
/// A Simple Example
/// +++++++++++++++++++++++++++++++++++++++++++++
///
/// .. literalinclude:: ../../../cub/test/catch2_test_device_transform_api.cu
/// :language: c++
/// :dedent:
/// :start-after: example-begin transform-many-stable
/// :end-before: example-end transform-many-stable
///
/// @endrst
//! @rst
//! Overview
//! +++++++++++++++++++++++++++++++++++++++++++++
//! Transforms many input sequences into one output sequence, by applying a transformation operation on corresponding
//! input elements and writing the result to the corresponding output element. The objects passed to the call operator
//! of the transformation operation are guaranteed to reside in the input sequences and are never copied.
//!
//! A Simple Example
//! +++++++++++++++++++++++++++++++++++++++++++++
//!
//! .. literalinclude:: ../../../cub/test/catch2_test_device_transform_api.cu
//! :language: c++
//! :dedent:
//! :start-after: example-begin transform-many-stable
//! :end-before: example-end transform-many-stable
//!
//! @endrst
//!
//! @param inputs A tuple of iterators to the input sequences where num_items elements are read from each. The
//! iterators' value types must be trivially relocatable.
//! @param output An iterator to the output sequence where num_items results are written to.
//! @param num_items The number of elements in each input sequence.
//! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
//! types must be convertible to the parameters of the function object's call operator. The return type of the call
//! operator must be assignable to the dereferenced output iterator.
//! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
template <typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename TransformOp>
CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses(
int count,
::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
RandomAccessIteratorOut output,
int num_items,
TransformOp transform_op,
cudaStream_t stream = nullptr)
{
CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Transform");
CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceTransform::TransformStableArgumentAddresses");
return detail::transform::
dispatch_t<true, int, ::cuda::std::tuple<RandomAccessIteratorsIn...>, RandomAccessIteratorOut, TransformOp>::
dispatch(count, ::cuda::std::move(inputs), ::cuda::std::move(output), ::cuda::std::move(transform_op), stream);
dispatch(
::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream);
}

// temp storage overload
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
template <typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename TransformOp>
CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses(
void* d_temp_storage,
size_t& temp_storage_bytes,
int count,
::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
RandomAccessIteratorOut output,
int num_items,
TransformOp transform_op,
cudaStream_t stream = nullptr)
{
Expand All @@ -180,35 +207,48 @@ struct DeviceTransform
}

return TransformStableArgumentAddresses(
count, ::cuda::std::move(inputs), ::cuda::std::move(output), ::cuda::std::move(transform_op), stream);
::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream);
}
#endif // DOXYGEN_SHOULD_SKIP_THIS

// One input stream, one output stream

//! @rst
//! Transforms one input sequence into one output sequence, by applying a transformation operation on corresponding
//! input elements and writing the result to the corresponding output element. The objects passed to the call operator
//! of the transformation operation are guaranteed to reside in the input sequences and are never copied.
//! @endrst
//!
//! @param input An iterator to the input sequence where num_items elements are read from. The iterator's value type
//! must be trivially relocatable.
//! @param output An iterator to the output sequence where num_items results are written to.
//! @param num_items The number of elements in each input sequence.
//! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
//! types must be convertible to the parameters of the function object's call operator. The return type of the call
//! operator must be assignable to the dereferenced output iterator.
//! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
template <typename RandomAccessIteratorIn, typename RandomAccessIteratorOut, typename TransformOp>
CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses(
int count,
RandomAccessIteratorIn inputs,
RandomAccessIteratorIn input,
RandomAccessIteratorOut output,
int num_items,
TransformOp transform_op,
cudaStream_t stream = nullptr)
{
return TransformStableArgumentAddresses(
count,
::cuda::std::make_tuple(::cuda::std::move(inputs)),
::cuda::std::make_tuple(::cuda::std::move(input)),
::cuda::std::move(output),
num_items,
::cuda::std::move(transform_op),
stream);
}

// temp storage overload
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
template <typename RandomAccessIteratorIn, typename RandomAccessIteratorOut, typename TransformOp>
CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses(
void* d_temp_storage,
size_t& temp_storage_bytes,
int count,
RandomAccessIteratorIn inputs,
RandomAccessIteratorIn input,
RandomAccessIteratorOut output,
int num_items,
TransformOp transform_op,
cudaStream_t stream = nullptr)
{
Expand All @@ -219,12 +259,13 @@ struct DeviceTransform
}

return TransformStableArgumentAddresses(
count,
::cuda::std::make_tuple(::cuda::std::move(inputs)),
::cuda::std::make_tuple(::cuda::std::move(input)),
::cuda::std::move(output),
num_items,
::cuda::std::move(transform_op),
stream);
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
};

CUB_NAMESPACE_END
Loading

0 comments on commit 33754cb

Please sign in to comment.