-
Notifications
You must be signed in to change notification settings - Fork 135
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Including benchmarks based on BabelStream. Co-authored-by: Georgii Evtushenko <[email protected]>
- Loading branch information
1 parent
07fef97
commit 71b9f98
Showing
12 changed files
with
2,049 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. | ||
// SPDX-License-Identifier: BSD-3-Clause | ||
|
||
#include <cub/device/dispatch/dispatch_transform.cuh> | ||
#include <cub/util_namespace.cuh> | ||
|
||
#include <cuda/std/type_traits> | ||
|
||
#include <stdexcept> | ||
|
||
#include <nvbench_helper.cuh> | ||
|
||
// Policy hub selecting the tuning policy for cub::detail::transform.
// In a TUNE_BASE build, CUB's shipped policy_hub is reused unchanged; otherwise a single
// chained policy is synthesized from the TUNE_* macros injected by the tuning driver.
template <typename... RandomAccessIteratorsIn>
#if TUNE_BASE
using policy_hub_t = cub::detail::transform::policy_hub<false, ::cuda::std::tuple<RandomAccessIteratorsIn...>>;
#else
struct policy_hub_t
{
  // One policy for the single architecture this tuning variant targets (the chain links to itself).
  struct max_policy : cub::ChainedPolicy<350, max_policy, max_policy>
  {
    // __CUDA_ARCH_LIST__ must expand to exactly one architecture here (enforced in the .cu files).
    static constexpr int min_bif = cub::detail::transform::arch_to_min_bytes_in_flight(__CUDA_ARCH_LIST__);
    static constexpr auto algorithm = static_cast<cub::detail::transform::Algorithm>(TUNE_ALGORITHM);
    // fallback_for uses its fixed policy; any other algorithm takes the tuned thread count.
    using algo_policy =
      ::cuda::std::_If<algorithm == cub::detail::transform::Algorithm::fallback_for,
                       cub::detail::transform::fallback_for_policy,
                       cub::detail::transform::async_copy_policy_t<TUNE_THREADS>>;
  };
};
#endif
|
||
// Element types to benchmark. A tuning run can pin one type via TUNE_T; otherwise we
// sweep a representative set of element sizes (1, 2, 4, 8 and, where available, 16 bytes).
#ifdef TUNE_T
using element_types = nvbench::type_list<TUNE_T>;
#else
using element_types =
  nvbench::type_list<std::int8_t,
                     std::int16_t,
                     float,
                     double
#  ifdef NVBENCH_HELPER_HAS_I128
                     ,
                     __int128
#  endif
                     >;
#endif

// Problem sizes as powers of two.
// BabelStream uses 2^25; an H200 can fit 2^31 int128s.
// 2^20 chars / 2^16 int128s saturate a V100 (min_bif = 12 * SM count = 80)
// 2^21 chars / 2^17 int128s saturate an A100 (min_bif = 16 * SM count = 108)
// 2^23 chars / 2^19 int128s saturate an H100/H200 HBM3 (min_bif = 32 or 48 * SM count = 132)
// inline auto array_size_powers = std::vector<nvbench::int64_t>{28};
inline auto array_size_powers = nvbench::range(16, 28, 4);
|
||
// Benchmarks one transform: output[i] = transform_op(inputs...[i]) for i in [0, num_items).
// Dispatches directly through cub::detail::transform::dispatch_t so that the policy_hub_t
// above (and thus the TUNE_* configuration) is honored instead of the public API's defaults.
// Measured with nvbench::exec_tag::no_batch by default; callers whose workload changes
// between iterations pass a different exec tag (e.g. nvbench::exec_tag::none).
template <typename OffsetT,
          typename... RandomAccessIteratorsIn,
          typename RandomAccessIteratorOut,
          typename TransformOp,
          typename ExecTag = decltype(nvbench::exec_tag::no_batch)>
void bench_transform(
  nvbench::state& state,
  ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
  RandomAccessIteratorOut output,
  OffsetT num_items,
  TransformOp transform_op,
  ExecTag exec_tag = nvbench::exec_tag::no_batch)
{
  state.exec(exec_tag, [&](const nvbench::launch& launch) {
    cub::detail::transform::dispatch_t<
      false,
      OffsetT,
      ::cuda::std::tuple<RandomAccessIteratorsIn...>,
      RandomAccessIteratorOut,
      TransformOp,
      policy_hub_t<RandomAccessIteratorsIn...>>::dispatch(inputs, output, num_items, transform_op, launch.get_stream());
  });
}
|
||
// Start values for the a/b/c arrays and the scalar factor.
// Modified from BabelStream to also work for integers (BabelStream uses fractional values).
inline constexpr auto startA = 1; // BabelStream: 0.1
inline constexpr auto startB = 2; // BabelStream: 0.2
inline constexpr auto startC = 3; // BabelStream: 0.1
inline constexpr auto startScalar = 4; // BabelStream: 0.4

// Thrown by narrow() below when a conversion would change the value.
// TODO(bgruber): we should put those somewhere into libcu++:
// from C++ GSL
struct narrowing_error : std::runtime_error
{
  narrowing_error()
      : std::runtime_error("Narrowing error")
  {}
};
|
||
// from C++ GSL
// implementation inspired by: https://github.com/microsoft/GSL/blob/main/include/gsl/narrow
// Converts `value` to DstT, throwing narrowing_error unless the value is exactly
// representable in DstT. Checked by converting and round-tripping back to SrcT.
template <typename DstT, typename SrcT, ::cuda::std::__enable_if_t<::cuda::std::is_arithmetic<SrcT>::value, int> = 0>
constexpr DstT narrow(SrcT value)
{
  constexpr bool is_different_signedness = ::cuda::std::is_signed<SrcT>::value != ::cuda::std::is_signed<DstT>::value;
  const auto converted = static_cast<DstT>(value);
  // The round-trip comparison detects value loss; the extra sign check catches
  // signed/unsigned conversions where the round-trip happens to compare equal
  // even though the sign flipped.
  if (static_cast<SrcT>(converted) != value || (is_different_signedness && ((converted < DstT{}) != (value < SrcT{}))))
  {
    throw narrowing_error{};
  }
  return converted;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. | ||
// SPDX-License-Identifier: BSD-3-Clause | ||
|
||
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
// TUNE_ALGORITHM == 1 selects the ublkcp kernel, which needs the bulk-copy hardware
// introduced with sm90. (The message previously said "algorithm 4", which did not
// match the TUNE_ALGORITHM == 1 condition or the 0:1:1 tuning range.)
#  if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
#    error "Cannot compile algorithm 1 (ublkcp) below sm90"
#  endif

#  if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
#    error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
#  endif
#endif
|
||
#include "babelstream.h"

#if !TUNE_BASE
// policy_hub_t in babelstream.h passes __CUDA_ARCH_LIST__ as a single value to
// arch_to_min_bytes_in_flight, so tuning builds must target exactly one architecture.
#  if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
#    error "This benchmark does not support being compiled for multiple architectures"
#  endif
#endif
|
||
template <typename T, typename OffsetT> | ||
static void mul(nvbench::state& state, nvbench::type_list<T, OffsetT>) | ||
{ | ||
const auto n = narrow<OffsetT>(state.get_int64("Elements{io}")); | ||
thrust::device_vector<T> b(n, startB); | ||
thrust::device_vector<T> c(n, startC); | ||
|
||
state.add_element_count(n); | ||
state.add_global_memory_reads<T>(n); | ||
state.add_global_memory_writes<T>(n); | ||
|
||
const T scalar = startScalar; | ||
bench_transform(state, ::cuda::std::tuple{c.begin()}, b.begin(), n, [=] _CCCL_DEVICE(const T& ci) { | ||
return ci * scalar; | ||
}); | ||
} | ||
|
||
NVBENCH_BENCH_TYPES(mul, NVBENCH_TYPE_AXES(element_types, offset_types)) | ||
.set_name("mul") | ||
.set_type_axes_names({"T{ct}", "OffsetT{ct}"}) | ||
.add_int64_power_of_two_axis("Elements{io}", array_size_powers); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. | ||
// SPDX-License-Identifier: BSD-3-Clause | ||
|
||
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
// TUNE_ALGORITHM == 1 selects the ublkcp kernel, which needs the bulk-copy hardware
// introduced with sm90. (The message previously said "algorithm 4", which did not
// match the TUNE_ALGORITHM == 1 condition or the 0:1:1 tuning range.)
#  if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
#    error "Cannot compile algorithm 1 (ublkcp) below sm90"
#  endif

#  if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
#    error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
#  endif
#endif
|
||
#include "babelstream.h"

#if !TUNE_BASE
// policy_hub_t in babelstream.h passes __CUDA_ARCH_LIST__ as a single value to
// arch_to_min_bytes_in_flight, so tuning builds must target exactly one architecture.
#  if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
#    error "This benchmark does not support being compiled for multiple architectures"
#  endif
#endif
|
||
template <typename T, typename OffsetT> | ||
static void add(nvbench::state& state, nvbench::type_list<T, OffsetT>) | ||
{ | ||
const auto n = narrow<OffsetT>(state.get_int64("Elements{io}")); | ||
thrust::device_vector<T> a(n, startA); | ||
thrust::device_vector<T> b(n, startB); | ||
thrust::device_vector<T> c(n, startC); | ||
|
||
state.add_element_count(n); | ||
state.add_global_memory_reads<T>(2 * n); | ||
state.add_global_memory_writes<T>(n); | ||
bench_transform( | ||
state, ::cuda::std::tuple{a.begin(), b.begin()}, c.begin(), n, [] _CCCL_DEVICE(const T& ai, const T& bi) -> T { | ||
return ai + bi; | ||
}); | ||
} | ||
|
||
NVBENCH_BENCH_TYPES(add, NVBENCH_TYPE_AXES(element_types, offset_types)) | ||
.set_name("add") | ||
.set_type_axes_names({"T{ct}", "OffsetT{ct}"}) | ||
.add_int64_power_of_two_axis("Elements{io}", array_size_powers); | ||
|
||
template <typename T, typename OffsetT> | ||
static void triad(nvbench::state& state, nvbench::type_list<T, OffsetT>) | ||
{ | ||
const auto n = narrow<OffsetT>(state.get_int64("Elements{io}")); | ||
thrust::device_vector<T> a(n, startA); | ||
thrust::device_vector<T> b(n, startB); | ||
thrust::device_vector<T> c(n, startC); | ||
|
||
state.add_element_count(n); | ||
state.add_global_memory_reads<T>(2 * n); | ||
state.add_global_memory_writes<T>(n); | ||
const T scalar = startScalar; | ||
bench_transform( | ||
state, ::cuda::std::tuple{b.begin(), c.begin()}, a.begin(), n, [=] _CCCL_DEVICE(const T& bi, const T& ci) { | ||
return bi + scalar * ci; | ||
}); | ||
} | ||
|
||
NVBENCH_BENCH_TYPES(triad, NVBENCH_TYPE_AXES(element_types, offset_types)) | ||
.set_name("triad") | ||
.set_type_axes_names({"T{ct}", "OffsetT{ct}"}) | ||
.add_int64_power_of_two_axis("Elements{io}", array_size_powers); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. | ||
// SPDX-License-Identifier: BSD-3-Clause | ||
|
||
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
// TUNE_ALGORITHM == 1 selects the ublkcp kernel, which needs the bulk-copy hardware
// introduced with sm90. (The message previously said "algorithm 4", which did not
// match the TUNE_ALGORITHM == 1 condition or the 0:1:1 tuning range.)
#  if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
#    error "Cannot compile algorithm 1 (ublkcp) below sm90"
#  endif

#  if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
#    error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
#  endif
#endif
|
||
#include "babelstream.h"

#if !TUNE_BASE
// policy_hub_t in babelstream.h passes __CUDA_ARCH_LIST__ as a single value to
// arch_to_min_bytes_in_flight, so tuning builds must target exactly one architecture.
#  if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
#    error "This benchmark does not support being compiled for multiple architectures"
#  endif
#endif
|
||
template <typename T, typename OffsetT> | ||
static void nstream(nvbench::state& state, nvbench::type_list<T, OffsetT>) | ||
{ | ||
const auto n = narrow<OffsetT>(state.get_int64("Elements{io}")); | ||
const auto overwrite = static_cast<bool>(state.get_int64("OverwriteInput")); | ||
thrust::device_vector<T> a(n, startA); | ||
thrust::device_vector<T> b(n, startB); | ||
thrust::device_vector<T> c(n, startC); | ||
|
||
// The BabelStream nstream overwrites one input array to avoid write-allocation of cache lines. However, this changes | ||
// the data that is computed for each iteration and results in an unstable workload. Therefore, we added an axis to | ||
// choose a different output array. Pass `-a OverwriteInput=0` to the benchmark to disable overwriting the input. | ||
thrust::device_vector<T> d; | ||
if (!overwrite) | ||
{ | ||
d.resize(n); | ||
} | ||
|
||
state.add_element_count(n); | ||
state.add_global_memory_reads<T>(3 * n); | ||
state.add_global_memory_writes<T>(n); | ||
const T scalar = startScalar; | ||
bench_transform( | ||
state, | ||
::cuda::std::tuple{a.begin(), b.begin(), c.begin()}, | ||
overwrite ? a.begin() : d.begin(), | ||
n, | ||
[=] _CCCL_DEVICE(const T& ai, const T& bi, const T& ci) { | ||
return ai + bi + scalar * ci; | ||
}, | ||
nvbench::exec_tag::none); // Use batch mode for benchmarking since the workload changes. Not necessary when | ||
// OverwriteInput=0, but doesn't hurt | ||
} | ||
|
||
NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(element_types, offset_types)) | ||
.set_name("nstream") | ||
.set_type_axes_names({"T{ct}", "OffsetT{ct}"}) | ||
.add_int64_power_of_two_axis("Elements{io}", array_size_powers) | ||
.add_int64_axis("OverwriteInput", {1}); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.