Skip to content

Commit

Permalink
Use a constant for the amount of static SMEM (#2374)
Browse files Browse the repository at this point in the history
  • Loading branch information
bernhardmgruber committed Sep 6, 2024
1 parent fcf7c91 commit 07fef97
Show file tree
Hide file tree
Showing 8 changed files with 23 additions and 11 deletions.
3 changes: 2 additions & 1 deletion cub/benchmarks/bench/radix_sort/keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
******************************************************************************/

#include <cub/device/device_radix_sort.cuh>
#include <cub/util_arch.cuh>

#include <cuda/std/type_traits>

Expand Down Expand Up @@ -123,7 +124,7 @@ constexpr std::size_t max_temp_storage_size()
template <typename KeyT, typename ValueT, typename OffsetT>
constexpr bool fits_in_default_shared_memory()
{
return max_temp_storage_size<KeyT, ValueT, OffsetT>() < 48 * 1024;
return max_temp_storage_size<KeyT, ValueT, OffsetT>() < cub::detail::max_smem_per_block;
}
#else // TUNE_BASE
template <typename, typename, typename>
Expand Down
3 changes: 2 additions & 1 deletion cub/benchmarks/bench/radix_sort/pairs.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
******************************************************************************/

#include <cub/device/device_radix_sort.cuh>
#include <cub/util_arch.cuh>

#include <cuda/std/type_traits>

Expand Down Expand Up @@ -121,7 +122,7 @@ constexpr std::size_t max_temp_storage_size()
template <typename KeyT, typename ValueT, typename OffsetT>
constexpr bool fits_in_default_shared_memory()
{
return max_temp_storage_size<KeyT, ValueT, OffsetT>() < 48 * 1024;
return max_temp_storage_size<KeyT, ValueT, OffsetT>() < cub::detail::max_smem_per_block;
}
#else // TUNE_BASE
template <typename, typename, typename>
Expand Down
13 changes: 11 additions & 2 deletions cub/cub/util_arch.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -136,13 +136,21 @@ static_assert(CUB_MAX_DEVICES > 0, "CUB_MAX_DEVICES must be greater than 0.");
# define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(0)
# endif

namespace detail
{
// The maximum amount of static shared memory available per thread block
// Note that in contrast to dynamic shared memory, static shared memory is still limited to 48 KB
static constexpr ::cuda::std::size_t max_smem_per_block = 48 * 1024;
} // namespace detail

template <int NOMINAL_4B_BLOCK_THREADS, int NOMINAL_4B_ITEMS_PER_THREAD, typename T>
struct RegBoundScaling
{
enum
{
ITEMS_PER_THREAD = CUB_MAX(1, NOMINAL_4B_ITEMS_PER_THREAD * 4 / CUB_MAX(4, sizeof(T))),
BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32),
BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS,
((cub::detail::max_smem_per_block / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32),
};
};

Expand All @@ -153,7 +161,8 @@ struct MemBoundScaling
{
ITEMS_PER_THREAD =
CUB_MAX(1, CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T), NOMINAL_4B_ITEMS_PER_THREAD * 2)),
BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32),
BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS,
((cub::detail::max_smem_per_block / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32),
};
};

Expand Down
1 change: 1 addition & 0 deletions cub/cub/util_type.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
#endif // no system header

#include <cub/detail/uninitialized_copy.cuh>
#include <cub/util_deprecated.cuh>

#include <cuda/std/cstdint>
#include <cuda/std/limits>
Expand Down
5 changes: 1 addition & 4 deletions cub/cub/util_vsmem.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
# pragma system_header
#endif // no system header

#include <cub/util_arch.cuh>
#include <cub/util_device.cuh>
#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>
Expand All @@ -67,10 +68,6 @@ struct vsmem_t
void* gmem_ptr;
};

// The maximum amount of static shared memory available per thread block
// Note that in contrast to dynamic shared memory, static shared memory is still limited to 48 KB
static constexpr std::size_t max_smem_per_block = 48 * 1024;

/**
* @brief Class template that helps to prevent exceeding the available shared memory per thread block.
*
Expand Down
3 changes: 2 additions & 1 deletion cub/test/catch2_test_block_load.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include <cub/block/block_load.cuh>
#include <cub/iterator/cache_modified_input_iterator.cuh>
#include <cub/util_allocator.cuh>
#include <cub/util_arch.cuh>

#include "catch2_test_helper.h"

Expand Down Expand Up @@ -113,7 +114,7 @@ void block_load(InputIteratorT input, OutputIteratorT output, int num_items)
using input_t = cub::detail::value_t<InputIteratorT>;
using block_load_t = cub::BlockLoad<input_t, ThreadsInBlock, ItemsPerThread, LoadAlgorithm>;
using storage_t = typename block_load_t::TempStorage;
constexpr bool sufficient_resources = sizeof(storage_t) <= 1024 * 48;
constexpr bool sufficient_resources = sizeof(storage_t) <= cub::detail::max_smem_per_block;

kernel<InputIteratorT, OutputIteratorT, ItemsPerThread, ThreadsInBlock, LoadAlgorithm>
<<<1, ThreadsInBlock>>>(std::integral_constant<bool, sufficient_resources>{}, input, output, num_items);
Expand Down
3 changes: 2 additions & 1 deletion cub/test/catch2_test_block_store.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include <cub/iterator/cache_modified_output_iterator.cuh>
#include <cub/iterator/discard_output_iterator.cuh>
#include <cub/util_allocator.cuh>
#include <cub/util_arch.cuh>

#include "catch2_test_helper.h"

Expand Down Expand Up @@ -114,7 +115,7 @@ void block_store(InputIteratorT input, OutputIteratorT output, int num_items)
using input_t = cub::detail::value_t<InputIteratorT>;
using block_store_t = cub::BlockStore<input_t, ThreadsInBlock, ItemsPerThread, StoreAlgorithm>;
using storage_t = typename block_store_t::TempStorage;
constexpr bool sufficient_resources = sizeof(storage_t) <= 1024 * 48;
constexpr bool sufficient_resources = sizeof(storage_t) <= cub::detail::max_smem_per_block;

kernel<InputIteratorT, OutputIteratorT, ItemsPerThread, ThreadsInBlock, StoreAlgorithm>
<<<1, ThreadsInBlock>>>(std::integral_constant<bool, sufficient_resources>{}, input, output, num_items);
Expand Down
3 changes: 2 additions & 1 deletion cub/test/test_block_radix_rank.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include <cub/block/block_store.cuh>
#include <cub/block/radix_rank_sort_operations.cuh>
#include <cub/util_allocator.cuh>
#include <cub/util_vsmem.cuh>

#include <algorithm>
#include <iostream>
Expand Down Expand Up @@ -240,7 +241,7 @@ void Test()
cub::detail::block_radix_rank_t<RankAlgorithm, BlockThreads, RadixBits, Descending, ScanAlgorithm>;
using storage_t = typename block_radix_rank::TempStorage;

cub::Int2Type<(sizeof(storage_t) <= 48 * 1024)> fits_smem_capacity;
cub::Int2Type<(sizeof(storage_t) <= cub::detail::max_smem_per_block)> fits_smem_capacity;

TestValid<RankAlgorithm, BlockThreads, ItemsPerThread, RadixBits, ScanAlgorithm, Descending, Key>(fits_smem_capacity);
}
Expand Down

0 comments on commit 07fef97

Please sign in to comment.