Skip to content

Commit

Permalink
Use prefetch when there are no inputs
Browse files (browse the repository at this point in the history)
  • Loading branch information
bernhardmgruber committed Aug 19, 2024
1 parent c8b718a commit dc5530c
Showing 1 changed file with 13 additions and 13 deletions.
26 changes: 13 additions & 13 deletions cub/cub/device/dispatch/dispatch_transform.cuh
Original file line number | Diff line number | Diff line change
Expand Up @@ -542,19 +542,18 @@ struct policy_hub<RequiresStableAddress, ::cuda::std::tuple<RandomAccessIterator
::cuda::std::conjunction<THRUST_NS_QUALIFIER::is_trivially_relocatable<value_t<RandomAccessIteratorsIn>>...>::value;

static constexpr bool can_memcpy = all_contiguous && all_values_trivially_reloc;
// no_input_streams || !all_contiguous ? Algorithm::prefetch
// : !RequiresStableAddress && all_values_trivially_reloc
// ? ActivePolicy::alg_addr_unstable
// : ActivePolicy::alg_addr_stable;

// TODO(bgruber): consider a separate kernel for just filling

// below A100
// Policy tier 300 of the ChainedPolicy chain; it names itself as its own predecessor.
//
// Exposes three members:
//  - min_bif:     the value returned by arch_to_min_bif(300).
//  - algorithm:   the Algorithm enumerator chosen for this tier.
//  - algo_policy: the policy type matching that choice, always instantiated with 256.
//
// NOTE(review): this listing appears to be a rendered commit diff without +/- markers, so
// `algorithm` and `algo_policy` each show up in two versions below. Presumably the variant
// without `no_input_streams` is the removed line and the variant with it is the added one
// (the commit is titled "Use prefetch when there are no inputs") -- confirm against the
// actual file before treating this as compilable code.
struct policy300 : ChainedPolicy<300, policy300, policy300>
{
static constexpr int min_bif = arch_to_min_bif(300);
// TODO(bgruber): we don't need algo, because we can just detect the type of algo_policy
// Variant 1 (presumably pre-commit): prefetch only when RequiresStableAddress is set.
static constexpr auto algorithm = RequiresStableAddress ? Algorithm::prefetch : Algorithm::unrolled_staged;
// Variant 2 (presumably post-commit): prefetch when RequiresStableAddress or no_input_streams
// is set; otherwise unrolled_staged.
static constexpr auto algorithm =
RequiresStableAddress || no_input_streams ? Algorithm::prefetch : Algorithm::unrolled_staged;
// The next two `_If<` lines are the two variants of the same condition (without and with
// no_input_streams). When the condition holds the policy is prefetch_policy_t<256>; otherwise
// it is unrolled_policy_t<256, N>, with N computed by items_per_thread_from_occupancy from
// min_bif and loaded_bytes_per_iter.
using algo_policy =
::cuda::std::_If<RequiresStableAddress,
::cuda::std::_If<RequiresStableAddress || no_input_streams,
prefetch_policy_t<256>,
unrolled_policy_t<256, items_per_thread_from_occupancy(256, 8, min_bif, loaded_bytes_per_iter)>>;
};
Expand All @@ -567,20 +566,21 @@ struct policy_hub<RequiresStableAddress, ::cuda::std::tuple<RandomAccessIterator
static constexpr int min_bif = arch_to_min_bif(800);
// TODO(bgruber): we could use unrolled_staged if we cannot memcpy
static constexpr auto algorithm =
(RequiresStableAddress || !can_memcpy) ? Algorithm::prefetch : Algorithm::memcpy_async;
using algo_policy =
::cuda::std::_If<RequiresStableAddress || !can_memcpy, prefetch_policy_t<256>, async_copy_policy_t<256>>;
(RequiresStableAddress || !can_memcpy || no_input_streams) ? Algorithm::prefetch : Algorithm::memcpy_async;
using algo_policy = ::cuda::std::
_If<RequiresStableAddress || !can_memcpy || no_input_streams, prefetch_policy_t<256>, async_copy_policy_t<256>>;
};

// TODO(bgruber): should we add a tuning for 860? They should have items_per_thread_from_occupancy(256, 6, ...)

// H100 and H200
// Policy tier 900 of the ChainedPolicy chain, with policy800 as its predecessor.
//
// Exposes three members:
//  - min_bif:     the value returned by arch_to_min_bif(900).
//  - algorithm:   Algorithm::prefetch or Algorithm::ublkcp.
//  - algo_policy: prefetch_policy_t<256> or async_copy_policy_t<256>, using the same condition.
//
// NOTE(review): this listing appears to be a rendered commit diff without +/- markers, so
// every member shows up twice below. Presumably the first group (condition without
// `no_input_streams`) is the removed code and the second group (condition with it) is the
// added code -- confirm against the actual file before treating this as compilable code.
struct policy900 : ChainedPolicy<900, policy900, policy800>
{
// Group 1 (presumably pre-commit): prefetch when a stable address is required or memcpy is
// not possible; otherwise ublkcp / async_copy_policy_t.
static constexpr int min_bif = arch_to_min_bif(900);
static constexpr auto algorithm = (RequiresStableAddress || !can_memcpy) ? Algorithm::prefetch : Algorithm::ublkcp;
using algo_policy =
::cuda::std::_If<RequiresStableAddress || !can_memcpy, prefetch_policy_t<256>, async_copy_policy_t<256>>;
// Group 2 (presumably post-commit): same selection, but no_input_streams also forces prefetch.
static constexpr int min_bif = arch_to_min_bif(900);
static constexpr auto algorithm =
(RequiresStableAddress || !can_memcpy || no_input_streams) ? Algorithm::prefetch : Algorithm::ublkcp;
using algo_policy = ::cuda::std::
_If<RequiresStableAddress || !can_memcpy || no_input_streams, prefetch_policy_t<256>, async_copy_policy_t<256>>;
};

// max_policy aliases policy900, the highest-numbered policy tier visible in this excerpt.
using max_policy = policy900;
Expand Down

0 comments on commit dc5530c

Please sign in to comment.