Skip to content

Commit

Permalink
Make thrust::transform use cub::DeviceTransform
Browse files Browse the repository at this point in the history
* Introduces address stability detection and opt-in in libcu++
* Mark BabelStream lambdas address oblivious
  • Loading branch information
bernhardmgruber committed Aug 19, 2024
1 parent 27ba2e4 commit eeb0202
Show file tree
Hide file tree
Showing 8 changed files with 289 additions and 28 deletions.
64 changes: 64 additions & 0 deletions libcudacxx/include/cuda/std/__type_traits/address_stability.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _LIBCUDACXX___TYPE_TRAITS_ADDRESS_STABILITY_H
#define _LIBCUDACXX___TYPE_TRAITS_ADDRESS_STABILITY_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/std/__type_traits/integral_constant.h>
#include <cuda/std/__type_traits/void_t.h>
#include <cuda/std/__utility/move.h>

_LIBCUDACXX_BEGIN_NAMESPACE_STD

// need a separate implementation trait because we SFINAE with a type parameter before the variadic pack
template <typename F, typename SFINAE, typename... Args>
struct __can_copy_arguments_impl : false_type
{};

template <typename F, typename... Args>
struct __can_copy_arguments_impl<F, void_t<decltype(F::can_copy_arguments)>, Args...>
{
static constexpr bool value = F::can_copy_arguments;
};

//! Trait telling whether a function object relies on the memory address of its arguments when called with the given set
//! of types. The nested value is true when the addresses of the arguments do not matter and arguments can be provided
//! from arbitrary copies of the respective sources.
template <typename F, typename... Args>
using can_copy_arguments = __can_copy_arguments_impl<F, void, Args...>;

//! Wrapper for a callable to mark it as allowing copied arguments
template <typename F>
struct copied_arguments_allowing_wrapper : F
{
using F::operator();
static constexpr bool can_copy_arguments = true;
};

//! Creates a new function object from an existing one, allowing its arguments to be copies of whatever source they come
//! from. This implies that the addresses of the arguments are irrelevant to the function object.
template <typename F>
_CCCL_HOST_DEVICE constexpr auto allow_copied_arguments(F f) -> copied_arguments_allowing_wrapper<F>
{
return copied_arguments_allowing_wrapper<F>{_CUDA_VSTD::move(f)};
}

_LIBCUDACXX_END_NAMESPACE_STD

#endif // _LIBCUDACXX___TYPE_TRAITS_ADDRESS_STABILITY_H
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,7 @@ namespace std
#include <cuda/std/__type_traits/add_pointer.h>
#include <cuda/std/__type_traits/add_rvalue_reference.h>
#include <cuda/std/__type_traits/add_volatile.h>
#include <cuda/std/__type_traits/address_stability.h>
#include <cuda/std/__type_traits/aligned_storage.h>
#include <cuda/std/__type_traits/aligned_union.h>
#include <cuda/std/__type_traits/alignment_of.h>
Expand Down
40 changes: 28 additions & 12 deletions thrust/benchmarks/bench/transform/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@
#include <thrust/transform.h>
#include <thrust/zip_function.h>

#include <cuda/functional>
#include <cuda/std/__type_traits/address_stability.h>

#include <nvbench_helper.cuh>

template <class InT, class OutT>
Expand Down Expand Up @@ -121,9 +124,10 @@ static void mul(nvbench::state& state, nvbench::type_list<T>)

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) {
const T scalar = startScalar;
thrust::transform(c.begin(), c.end(), b.begin(), [=] __device__ __host__(const T& ci) {
return ci * scalar;
});
thrust::transform(
c.begin(), c.end(), b.begin(), cuda::std::allow_copied_arguments([=] __device__ __host__(const T& ci) {
return ci * scalar;
}));
});
}

Expand All @@ -145,9 +149,14 @@ static void add(nvbench::state& state, nvbench::type_list<T>)
state.add_global_memory_writes<T>(n);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) {
thrust::transform(a.begin(), a.end(), b.begin(), c.begin(), [] __device__ __host__(const T& ai, const T& bi) {
return ai + bi;
});
thrust::transform(
a.begin(),
a.end(),
b.begin(),
c.begin(),
cuda::std::allow_copied_arguments([] _CCCL_DEVICE(const T& ai, const T& bi) -> T {
return ai + bi;
}));
});
}

Expand All @@ -170,9 +179,14 @@ static void triad(nvbench::state& state, nvbench::type_list<T>)

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) {
const T scalar = startScalar;
thrust::transform(b.begin(), b.end(), c.begin(), a.begin(), [=] __device__ __host__(const T& bi, const T& ci) {
return bi + scalar * ci;
});
thrust::transform(
b.begin(),
b.end(),
c.begin(),
a.begin(),
cuda::std::allow_copied_arguments([=] _CCCL_DEVICE(const T& bi, const T& ci) {
return bi + scalar * ci;
}));
});
}

Expand All @@ -199,9 +213,11 @@ static void nstream(nvbench::state& state, nvbench::type_list<T>)
thrust::make_zip_iterator(a.begin(), b.begin(), c.begin()),
thrust::make_zip_iterator(a.end(), b.end(), c.end()),
a.begin(),
thrust::make_zip_function([=] __device__ __host__(const T& ai, const T& bi, const T& ci) {
return ai + bi + scalar * ci;
}));

thrust::make_zip_function(
cuda::std::allow_copied_arguments([=] _CCCL_DEVICE(const T& ai, const T& bi, const T& ci) {
return ai + bi + scalar * ci;
})));
});
}

Expand Down
26 changes: 26 additions & 0 deletions thrust/testing/address_stability.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#include <cuda/std/__type_traits/address_stability.h>

#include <unittest/unittest.h>

// TODO(bgruber): move this test into libcu++

struct MyPlus
{
_CCCL_HOST_DEVICE auto operator()(int a, int b) const -> int
{
return a + b;
}
};

void TestAddressStability()
{
using ::cuda::std::allow_copied_arguments;
using ::cuda::std::can_copy_arguments;

static_assert(!can_copy_arguments<thrust::plus<int>, int, int>::value, "");
static_assert(can_copy_arguments<decltype(allow_copied_arguments(thrust::plus<int>{})), int, int>::value, "");

static_assert(!can_copy_arguments<MyPlus, int, int>::value, "");
static_assert(can_copy_arguments<decltype(allow_copied_arguments(MyPlus{})), int, int>::value, "");
}
DECLARE_UNITTEST(TestAddressStability);
73 changes: 73 additions & 0 deletions thrust/testing/cuda/transform.cu
Original file line number Diff line number Diff line change
Expand Up @@ -413,3 +413,76 @@ void TestTransformBinaryCudaStreams()
cudaStreamDestroy(s);
}
DECLARE_UNITTEST(TestTransformBinaryCudaStreams);

struct Sum
{
_CCCL_HOST_DEVICE auto
operator()(std::int8_t a, std::int16_t b, std::int32_t c, std::int64_t d, float e) const -> double
{
return a + b + c + d + e;
}
};

// we specialize zip_function for Sum, but do nothing in the call operator so the test below would fail
THRUST_NAMESPACE_BEGIN
template <>
class zip_function<Sum>
{
public:
_CCCL_HOST_DEVICE zip_function(Sum func)
: func(func)
{}

_CCCL_HOST_DEVICE Sum& underlying_function() const
{
return func;
}

template <typename Tuple>
_CCCL_HOST_DEVICE auto
operator()(Tuple&& t) const -> decltype(detail::zip_detail::apply(std::declval<Sum>(), THRUST_FWD(t)))
{
// not calling func, so we would get a wrong result if we were called
return {};
}

private:
mutable Sum func;
};
THRUST_NAMESPACE_END

// test that the cuda_cub backend of Thrust unwraps zip_iterators/zip_functions into their input streams
void TestTransformZipIteratorUnwrapping()
{
constexpr int num_items = 100;
thrust::device_vector<std::int8_t> a(num_items, 1);
thrust::device_vector<std::int16_t> b(num_items, 2);
thrust::device_vector<std::int32_t> c(num_items, 3);
thrust::device_vector<std::int64_t> d(num_items, 4);
thrust::device_vector<float> e(num_items, 5);

thrust::device_vector<double> result(num_items);
// SECTION("once")
{
const auto z = thrust::make_zip_iterator(a.begin(), b.begin(), c.begin(), d.begin(), e.begin());
thrust::transform(z, z + num_items, result.begin(), thrust::make_zip_function(Sum{}));

// compute reference and verify
thrust::device_vector<double> reference(num_items, 1 + 2 + 3 + 4 + 5);
ASSERT_EQUAL(reference, result);
}
// SECTION("trice")
{
const auto z = thrust::make_zip_iterator(
thrust::make_zip_iterator(thrust::make_zip_iterator(a.begin(), b.begin(), c.begin(), d.begin(), e.begin())));
thrust::transform(z,
z + num_items,
result.begin(),
thrust::make_zip_function(thrust::make_zip_function(thrust::make_zip_function(Sum{}))));

// compute reference and verify
thrust::device_vector<double> reference(num_items, 1 + 2 + 3 + 4 + 5);
ASSERT_EQUAL(reference, result);
}
}
DECLARE_UNITTEST(TestTransformZipIteratorUnwrapping);
1 change: 1 addition & 0 deletions thrust/testing/transform.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <thrust/pair.h>
#include <thrust/transform.h>
#include <thrust/tuple.h>
#include <thrust/zip_function.h>

#include <unittest/unittest.h>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@
THRUST_NAMESPACE_BEGIN
namespace cuda_cub
{
// Need a forward declaration here to work around a cyclic include, since "cuda/detail/transform.h" includes this header
template <class Derived, class InputIt, class OutputIt, class TransformOp>
OutputIt THRUST_FUNCTION
transform(execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, TransformOp transform_op);

namespace __copy
{
Expand Down
Loading

0 comments on commit eeb0202

Please sign in to comment.