Add cub::DeviceTransform (#2086)
Including benchmarks based on BabelStream

Co-authored-by: Georgii Evtushenko <[email protected]>
bernhardmgruber and gevtushenko committed Sep 8, 2024
1 parent 07fef97 commit 71b9f98
Showing 12 changed files with 2,049 additions and 2 deletions.
2 changes: 1 addition & 1 deletion cub/benchmarks/CMakeLists.txt
@@ -106,7 +106,7 @@ function(add_bench_dir bench_dir)
add_bench(bench_target ${tuning_name} "${bench_src}")
# for convenience, make tuning variant buildable by default
file(WRITE "${tuning_path}" "#pragma once\n#define TUNE_BASE 1\n")
target_compile_options(${bench_target} PRIVATE "--extended-lambda -include${tuning_path}")
target_compile_options(${bench_target} PRIVATE "--extended-lambda" "-include${tuning_path}")
else()
# benchmarking
register_cccl_benchmark("${bench_name}" "")
104 changes: 104 additions & 0 deletions cub/benchmarks/bench/transform/babelstream.h
@@ -0,0 +1,104 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

#include <cub/device/dispatch/dispatch_transform.cuh>
#include <cub/util_namespace.cuh>

#include <cuda/std/type_traits>

#include <stdexcept>

#include <nvbench_helper.cuh>

template <typename... RandomAccessIteratorsIn>
#if TUNE_BASE
using policy_hub_t = cub::detail::transform::policy_hub<false, ::cuda::std::tuple<RandomAccessIteratorsIn...>>;
#else
struct policy_hub_t
{
struct max_policy : cub::ChainedPolicy<350, max_policy, max_policy>
{
static constexpr int min_bif = cub::detail::transform::arch_to_min_bytes_in_flight(__CUDA_ARCH_LIST__);
static constexpr auto algorithm = static_cast<cub::detail::transform::Algorithm>(TUNE_ALGORITHM);
using algo_policy =
::cuda::std::_If<algorithm == cub::detail::transform::Algorithm::fallback_for,
cub::detail::transform::fallback_for_policy,
cub::detail::transform::async_copy_policy_t<TUNE_THREADS>>;
};
};
#endif

#ifdef TUNE_T
using element_types = nvbench::type_list<TUNE_T>;
#else
using element_types =
nvbench::type_list<std::int8_t,
std::int16_t,
float,
double
# ifdef NVBENCH_HELPER_HAS_I128
,
__int128
# endif
>;
#endif

// BabelStream uses 2^25, H200 can fit 2^31 int128s
// 2^20 chars / 2^16 int128 saturate V100 (min_bif = 12 * SM count = 80)
// 2^21 chars / 2^17 int128 saturate A100 (min_bif = 16 * SM count = 108)
// 2^23 chars / 2^19 int128 saturate H100/H200 HBM3 (min_bif = 32 or 48 * SM count = 132)
// inline auto array_size_powers = std::vector<nvbench::int64_t>{28};
inline auto array_size_powers = nvbench::range(16, 28, 4);

template <typename OffsetT,
typename... RandomAccessIteratorsIn,
typename RandomAccessIteratorOut,
typename TransformOp,
typename ExecTag = decltype(nvbench::exec_tag::no_batch)>
void bench_transform(
nvbench::state& state,
::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
RandomAccessIteratorOut output,
OffsetT num_items,
TransformOp transform_op,
ExecTag exec_tag = nvbench::exec_tag::no_batch)
{
state.exec(exec_tag, [&](const nvbench::launch& launch) {
cub::detail::transform::dispatch_t<
false,
OffsetT,
::cuda::std::tuple<RandomAccessIteratorsIn...>,
RandomAccessIteratorOut,
TransformOp,
policy_hub_t<RandomAccessIteratorsIn...>>::dispatch(inputs, output, num_items, transform_op, launch.get_stream());
});
}

// Modified from BabelStream to also work for integers
inline constexpr auto startA = 1; // BabelStream: 0.1
inline constexpr auto startB = 2; // BabelStream: 0.2
inline constexpr auto startC = 3; // BabelStream: 0.1
inline constexpr auto startScalar = 4; // BabelStream: 0.4

// TODO(bgruber): we should put those somewhere into libcu++:
// from C++ GSL
struct narrowing_error : std::runtime_error
{
narrowing_error()
: std::runtime_error("Narrowing error")
{}
};

// from C++ GSL
// implementation inspired by: https://github.com/microsoft/GSL/blob/main/include/gsl/narrow
template <typename DstT, typename SrcT, ::cuda::std::__enable_if_t<::cuda::std::is_arithmetic<SrcT>::value, int> = 0>
constexpr DstT narrow(SrcT value)
{
constexpr bool is_different_signedness = ::cuda::std::is_signed<SrcT>::value != ::cuda::std::is_signed<DstT>::value;
const auto converted = static_cast<DstT>(value);
if (static_cast<SrcT>(converted) != value || (is_different_signedness && ((converted < DstT{}) != (value < SrcT{}))))
{
throw narrowing_error{};
}
return converted;
}
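
As a quick illustration of the narrow helper above, here is a minimal host-side sketch; the numeric values are illustrative only, and it assumes the header introduced in this commit and its dependencies are on the include path:

#include "babelstream.h" // for narrow() and narrowing_error

#include <cstdint>
#include <iostream>

int main()
{
  // 1234 round-trips exactly through std::int16_t, so the converted value is returned.
  const std::int16_t ok = narrow<std::int16_t>(1234);

  try
  {
    // 70000 does not survive the round trip through std::int16_t, so narrow() throws.
    narrow<std::int16_t>(70000);
  }
  catch (const narrowing_error&)
  {
    std::cout << "narrowing_error: 70000 does not fit into std::int16_t\n";
  }
  std::cout << "ok = " << ok << "\n";
  return 0;
}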
46 changes: 46 additions & 0 deletions cub/benchmarks/bench/transform/babelstream1.cu
@@ -0,0 +1,46 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "babelstream.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

template <typename T, typename OffsetT>
static void mul(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
thrust::device_vector<T> b(n, startB);
thrust::device_vector<T> c(n, startC);

state.add_element_count(n);
state.add_global_memory_reads<T>(n);
state.add_global_memory_writes<T>(n);

const T scalar = startScalar;
bench_transform(state, ::cuda::std::tuple{c.begin()}, b.begin(), n, [=] _CCCL_DEVICE(const T& ci) {
return ci * scalar;
});
}

NVBENCH_BENCH_TYPES(mul, NVBENCH_TYPE_AXES(element_types, offset_types))
.set_name("mul")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", array_size_powers);
69 changes: 69 additions & 0 deletions cub/benchmarks/bench/transform/babelstream2.cu
@@ -0,0 +1,69 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "babelstream.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

template <typename T, typename OffsetT>
static void add(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
thrust::device_vector<T> a(n, startA);
thrust::device_vector<T> b(n, startB);
thrust::device_vector<T> c(n, startC);

state.add_element_count(n);
state.add_global_memory_reads<T>(2 * n);
state.add_global_memory_writes<T>(n);
bench_transform(
state, ::cuda::std::tuple{a.begin(), b.begin()}, c.begin(), n, [] _CCCL_DEVICE(const T& ai, const T& bi) -> T {
return ai + bi;
});
}

NVBENCH_BENCH_TYPES(add, NVBENCH_TYPE_AXES(element_types, offset_types))
.set_name("add")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", array_size_powers);

template <typename T, typename OffsetT>
static void triad(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
thrust::device_vector<T> a(n, startA);
thrust::device_vector<T> b(n, startB);
thrust::device_vector<T> c(n, startC);

state.add_element_count(n);
state.add_global_memory_reads<T>(2 * n);
state.add_global_memory_writes<T>(n);
const T scalar = startScalar;
bench_transform(
state, ::cuda::std::tuple{b.begin(), c.begin()}, a.begin(), n, [=] _CCCL_DEVICE(const T& bi, const T& ci) {
return bi + scalar * ci;
});
}

NVBENCH_BENCH_TYPES(triad, NVBENCH_TYPE_AXES(element_types, offset_types))
.set_name("triad")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", array_size_powers);
64 changes: 64 additions & 0 deletions cub/benchmarks/bench/transform/babelstream3.cu
@@ -0,0 +1,64 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "babelstream.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

template <typename T, typename OffsetT>
static void nstream(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
const auto overwrite = static_cast<bool>(state.get_int64("OverwriteInput"));
thrust::device_vector<T> a(n, startA);
thrust::device_vector<T> b(n, startB);
thrust::device_vector<T> c(n, startC);

// The BabelStream nstream overwrites one input array to avoid write-allocation of cache lines. However, this changes
// the data that is computed for each iteration and results in an unstable workload. Therefore, we added an axis to
// choose a different output array. Pass `-a OverwriteInput=0` to the benchmark to disable overwriting the input.
thrust::device_vector<T> d;
if (!overwrite)
{
d.resize(n);
}

state.add_element_count(n);
state.add_global_memory_reads<T>(3 * n);
state.add_global_memory_writes<T>(n);
const T scalar = startScalar;
bench_transform(
state,
::cuda::std::tuple{a.begin(), b.begin(), c.begin()},
overwrite ? a.begin() : d.begin(),
n,
[=] _CCCL_DEVICE(const T& ai, const T& bi, const T& ci) {
return ai + bi + scalar * ci;
},
nvbench::exec_tag::none); // Use batch mode for benchmarking since the workload changes. Not necessary when
// OverwriteInput=0, but doesn't hurt
}

NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(element_types, offset_types))
.set_name("nstream")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", array_size_powers)
.add_int64_axis("OverwriteInput", {1});
1 change: 1 addition & 0 deletions cub/cub/cub.cuh
@@ -76,6 +76,7 @@
#include <cub/device/device_segmented_sort.cuh>
#include <cub/device/device_select.cuh>
#include <cub/device/device_spmv.cuh>
#include <cub/device/device_transform.cuh>

// Grid
// #include <cub/grid/grid_barrier.cuh>
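
The public entry point added by this commit, cub::DeviceTransform, is not expanded in this view. A minimal usage sketch, assuming the Transform overload mirrors the argument order of the dispatch call in bench_transform above (tuple of input iterators, output iterator, item count, transform op), could look like this for an element-wise add of two device vectors:

#include <cub/device/device_transform.cuh>

#include <thrust/device_vector.h>

#include <cuda/std/functional>
#include <cuda/std/tuple>

int main()
{
  const int num_items = 1 << 20;
  thrust::device_vector<float> a(num_items, 1.0f);
  thrust::device_vector<float> b(num_items, 2.0f);
  thrust::device_vector<float> c(num_items);

  // c[i] = a[i] + b[i]; argument order assumed from the dispatch call used by the benchmarks.
  cub::DeviceTransform::Transform(
    ::cuda::std::make_tuple(a.begin(), b.begin()), c.begin(), num_items, ::cuda::std::plus<float>{});

  return 0;
}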