Merge branch 'main' into increase_timeout_libcu++
alliepiper committed Sep 19, 2024
2 parents (f6de260 + ee94bb9), commit 89b5ac5
Showing 44 changed files with 389 additions and 141 deletions.
32 changes: 23 additions & 9 deletions .github/actions/workflow-build/build-workflow.py
@@ -334,6 +334,8 @@ def lookup_supported_stds(matrix_job):
     if 'project' in matrix_job:
         project = get_project(matrix_job['project'])
         stds = stds & set(project['stds'])
+    if len(stds) == 0:
+        raise Exception(error_message_with_matrix_job(matrix_job, "No supported stds found."))
     return sorted(list(stds))


@@ -626,18 +628,18 @@ def finalize_workflow_dispatch_groups(workflow_dispatch_groups_orig):
             matching_consumers = merged_consumers[producer_index]

             producer_name = producer['name']
-            print(f"::notice file=ci/matrix.yaml::Merging consumers for duplicate producer '{producer_name}' in '{group_name}'",
+            print(f"::notice::Merging consumers for duplicate producer '{producer_name}' in '{group_name}'",
                   file=sys.stderr)
             consumer_names = ", ".join([consumer['name'] for consumer in matching_consumers])
-            print(f"::notice file=ci/matrix.yaml::Original consumers: {consumer_names}", file=sys.stderr)
+            print(f"::notice::Original consumers: {consumer_names}", file=sys.stderr)
             consumer_names = ", ".join([consumer['name'] for consumer in consumers])
-            print(f"::notice file=ci/matrix.yaml::Duplicate consumers: {consumer_names}", file=sys.stderr)
+            print(f"::notice::Duplicate consumers: {consumer_names}", file=sys.stderr)
             # Merge if unique:
             for consumer in consumers:
                 if not dispatch_job_in_container(consumer, matching_consumers):
                     matching_consumers.append(consumer)
             consumer_names = ", ".join([consumer['name'] for consumer in matching_consumers])
-            print(f"::notice file=ci/matrix.yaml::Merged consumers: {consumer_names}", file=sys.stderr)
+            print(f"::notice::Merged consumers: {consumer_names}", file=sys.stderr)
         else:
             merged_producers.append(producer)
             merged_consumers.append(consumers)
@@ -653,7 +655,7 @@ def finalize_workflow_dispatch_groups(workflow_dispatch_groups_orig):
         unique_standalone_jobs = []
         for job_json in standalone_jobs:
             if dispatch_job_in_container(job_json, unique_standalone_jobs):
-                print(f"::notice file=ci/matrix.yaml::Removing duplicate standalone job '{job_json['name']}' in '{group_name}'",
+                print(f"::notice::Removing duplicate standalone job '{job_json['name']}' in '{group_name}'",
                       file=sys.stderr)
             else:
                 unique_standalone_jobs.append(job_json)
@@ -663,12 +665,12 @@ def finalize_workflow_dispatch_groups(workflow_dispatch_groups_orig):
         for two_stage_job in two_stage_jobs:
             for producer in two_stage_job['producers']:
                 if remove_dispatch_job_from_container(producer, unique_standalone_jobs):
-                    print(f"::notice file=ci/matrix.yaml::Removing standalone job '{producer['name']}' " +
+                    print(f"::notice::Removing standalone job '{producer['name']}' " +
                           f"as it appears as a producer in '{group_name}'",
                           file=sys.stderr)
             for consumer in two_stage_job['consumers']:
                 if remove_dispatch_job_from_container(producer, unique_standalone_jobs):
-                    print(f"::notice file=ci/matrix.yaml::Removing standalone job '{consumer['name']}' " +
+                    print(f"::notice::Removing standalone job '{consumer['name']}' " +
                           f"as it appears as a consumer in '{group_name}'",
                           file=sys.stderr)
         standalone_jobs = list(unique_standalone_jobs)
@@ -864,8 +866,20 @@ def set_derived_tags(matrix_job):
     gpu = get_gpu(matrix_job['gpu'])
     matrix_job['sm'] = gpu['sm']

-    if 'std' in matrix_job and matrix_job['std'] == 'all':
-        matrix_job['std'] = lookup_supported_stds(matrix_job)
+    if 'std' in matrix_job:
+        if matrix_job['std'] == 'all':
+            matrix_job['std'] = lookup_supported_stds(matrix_job)
+        elif matrix_job['std'] == 'min':
+            matrix_job['std'] = min(lookup_supported_stds(matrix_job))
+        elif matrix_job['std'] == 'max':
+            matrix_job['std'] = max(lookup_supported_stds(matrix_job))
+        elif matrix_job['std'] == 'minmax':
+            stds = lookup_supported_stds(matrix_job)
+            if len(stds) == 1:
+                matrix_job['std'] = stds[0]
+            else:
+                matrix_job['std'] = [min(stds), max(stds)]
+

     # Add all deps before applying project job maps:
     for job in matrix_job['jobs']:
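The new `std` shorthands above are easiest to see in isolation. The following standalone sketch mirrors the logic added to `lookup_supported_stds` and `set_derived_tags`; the supported-standards list `[11, 14, 17, 20]` is an assumed example value, not something taken from this commit:

```python
# Sketch of the new 'std' shorthand resolution. SUPPORTED_STDS stands in for
# the ctk/compiler/project intersection computed by lookup_supported_stds().
SUPPORTED_STDS = [11, 14, 17, 20]

def resolve_std(std):
    stds = sorted(SUPPORTED_STDS)
    if len(stds) == 0:
        # Mirrors the new guard added to lookup_supported_stds():
        raise Exception("No supported stds found.")
    if std == 'all':
        return stds
    elif std == 'min':
        return min(stds)
    elif std == 'max':
        return max(stds)
    elif std == 'minmax':
        return stds[0] if len(stds) == 1 else [min(stds), max(stds)]
    return std  # explicit values pass through unchanged

assert resolve_std('all') == [11, 14, 17, 20]
assert resolve_std('min') == 11
assert resolve_std('max') == 20
assert resolve_std('minmax') == [11, 20]
```

This is what lets the matrix below trade `std: 'all'` for `minmax`/`max` in pull-request jobs while nightly jobs keep full coverage.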
8 changes: 4 additions & 4 deletions README.md
@@ -3,9 +3,9 @@
|[Contributor Guide](https://github.com/NVIDIA/cccl/blob/main/CONTRIBUTING.md)|[Dev Containers](https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md)|[Discord](https://discord.gg/nvidiadeveloper)|[Godbolt](https://godbolt.org/z/x4G73af9a)|[GitHub Project](https://github.com/orgs/NVIDIA/projects/6)|[Documentation](https://nvidia.github.io/cccl)|
|-|-|-|-|-|-|

-# CUDA C++ Core Libraries (CCCL)
+# CUDA Core Compute Libraries (CCCL)

-Welcome to the CUDA C++ Core Libraries (CCCL) where our mission is to make CUDA C++ more delightful.
+Welcome to the CUDA Core Compute Libraries (CCCL) where our mission is to make CUDA more delightful.

This repository unifies three essential CUDA C++ libraries into a single, convenient repository:

@@ -19,7 +19,7 @@ For more information about the decision to unify these projects, see the [announ

## Overview

-The concept for the CUDA C++ Core Libraries (CCCL) grew organically out of the Thrust, CUB, and libcudacxx projects that were developed independently over the years with a similar goal: to provide high-quality, high-performance, and easy-to-use C++ abstractions for CUDA developers.
+The concept for the CUDA Core Compute Libraries (CCCL) grew organically out of the Thrust, CUB, and libcudacxx projects that were developed independently over the years with a similar goal: to provide high-quality, high-performance, and easy-to-use C++ abstractions for CUDA developers.
Naturally, there was a lot of overlap among the three projects, and it became clear the community would be better served by unifying them into a single repository.

- **Thrust** is the C++ parallel algorithms library which inspired the introduction of parallel algorithms to the C++ Standard Library. Thrust's high-level interface greatly enhances programmer productivity while enabling performance portability between GPUs and multicore CPUs via configurable backends that allow using multiple parallel programming frameworks (such as CUDA, TBB, and OpenMP).
@@ -413,7 +413,7 @@ For a detailed overview of the CI pipeline, see [ci-overview.md](ci-overview.md)
## Related Projects
-Projects that are related to CCCL's mission to make CUDA C++ more delightful:
+Projects that are related to CCCL's mission to make CUDA more delightful:
- [cuCollections](https://github.com/NVIDIA/cuCollections) - GPU accelerated data structures like hash tables
- [NVBench](https://github.com/NVIDIA/nvbench) - Benchmarking library tailored for CUDA applications
- [stdexec](https://github.com/nvidia/stdexec) - Reference implementation for Senders asynchronous programming model
57 changes: 36 additions & 21 deletions ci/matrix.yaml
@@ -13,11 +13,17 @@ workflows:
     # Old CTK
     - {jobs: ['build'], std: 'all', ctk: '11.1', cxx: ['gcc6', 'gcc7', 'gcc8', 'gcc9', 'clang9', 'msvc2017']}
     - {jobs: ['build'], std: 'all', ctk: '11.8', cxx: ['gcc11'], sm: '60;70;80;90'}
-    # Current CTK
+    # Current CTK build-only
     - {jobs: ['build'], std: 'all', cxx: ['gcc7', 'gcc8', 'gcc9', 'gcc10', 'gcc11', 'gcc12']}
     - {jobs: ['build'], std: 'all', cxx: ['clang9', 'clang10', 'clang11', 'clang12', 'clang13', 'clang14', 'clang15', 'clang16', 'clang17']}
     - {jobs: ['build'], std: 'all', cxx: ['intel', 'msvc2019']}
-    - {jobs: ['test'], std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022']}
+    # Current CTK testing:
+    - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'minmax', cxx: ['gcc']}
+    - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['clang', 'msvc']}
+    # Split up cub tests:
+    - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'minmax', cxx: ['gcc']}
+    - {jobs: ['test_lid1', 'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc']}
+    - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['clang', 'msvc']}
     # Modded builds:
     - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'}
     - {jobs: ['build'], std: 'all', cxx: ['gcc'], sm: '90a'}
@@ -36,33 +42,41 @@ workflows:
     - {jobs: ['build'], project: 'cudax', ctk: ['12.0' ], std: 17, cxx: ['gcc12'], sm: "90"}
     - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 17, cxx: ['gcc13'], sm: "90a"}
     - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc13', 'clang16'], cpu: 'arm64'}
-    - {jobs: ['test'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc12']}
-    - {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'all', cxx: ['clang14']}
-    - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang18']}
+    - {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'min', cxx: ['gcc12']}
+    - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc12']}
+    - {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'max', cxx: ['clang14']}
+    - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'max', cxx: ['clang18']}
     # Python jobs:
     - {jobs: ['test'], project: 'pycuda', ctk: ['12.5']}
     # cccl-infra:
     - {jobs: ['infra'], project: 'cccl', ctk: '11.1', cxx: ['gcc6', 'clang9']}
     - {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc', 'clang']}
-    # Edge-case jobs
-    - {jobs: ['limited'], project: 'cub', std: 17}

   nightly:
-    - {jobs: ['test'], ctk: '11.1', gpu: 'v100', sm: 'gpu', cxx: 'gcc6', std: [11]}
-    - {jobs: ['test'], ctk: '11.1', gpu: 't4', sm: 'gpu', cxx: 'clang9', std: [17]}
-    - {jobs: ['test'], ctk: '11.8', gpu: 'rtx2080', sm: 'gpu', cxx: 'gcc11', std: [17]}
-    - {jobs: ['test'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7', std: [14]}
-    - {jobs: ['test'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc13', std: 'all'}
-    - {jobs: ['test'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang9', std: [11]}
-    # H100 runners are currently flakey, only build since those use CPU-only runners:
-    - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc12', std: [11, 20]}
-    - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'clang18', std: [17]}
+    # Increased test coverage compared to nightlies:
+    - {jobs: ['test'], std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022']}
+    - {jobs: ['test'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc12']}
+    - {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'all', cxx: ['clang14']}
+    - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang18']}
+    # Edge-case jobs
+    - {jobs: ['limited'], project: 'cub', std: 17}

-    # nvrtc:
-    - {jobs: ['nvrtc'], ctk: 'curr', gpu: 't4', sm: 'gpu', cxx: 'gcc13', std: [20], project: ['libcudacxx']}
-    - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc13', std: [20], project: ['libcudacxx']}
-    - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc13', std: 'all', project: ['libcudacxx']}
-    - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc13', std: [11, 20], project: ['libcudacxx']}
+    # # These are waiting on the NVKS nodes:
+    # - {jobs: ['test'], ctk: '11.1', gpu: 'v100', sm: 'gpu', cxx: 'gcc6', std: [11]}
+    # - {jobs: ['test'], ctk: '11.1', gpu: 't4', sm: 'gpu', cxx: 'clang9', std: [17]}
+    # - {jobs: ['test'], ctk: '11.8', gpu: 'rtx2080', sm: 'gpu', cxx: 'gcc11', std: [17]}
+    # - {jobs: ['test'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7', std: [14]}
+    # - {jobs: ['test'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc13', std: 'all'}
+    # - {jobs: ['test'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang9', std: [11]}
+    # # H100 runners are currently flakey, only build since those use CPU-only runners:
+    # - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc12', std: [11, 20]}
+    # - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'clang18', std: [17]}
+    #
+    # # nvrtc:
+    # - {jobs: ['nvrtc'], ctk: 'curr', gpu: 't4', sm: 'gpu', cxx: 'gcc13', std: [20], project: ['libcudacxx']}
+    # - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc13', std: [20], project: ['libcudacxx']}
+    # - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc13', std: 'all', project: ['libcudacxx']}
+    # - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc13', std: [11, 20], project: ['libcudacxx']}

 # Any generated jobs that match the entries in `exclude` will be removed from the final matrix for all workflows.
 exclude:
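The `exclude` comment above is terse. As a rough sketch of the assumed semantics (simplified from build-workflow.py's actual tag matching), an exclude entry removes any generated job whose fields match all of the entry's fields:

```python
# Simplified illustration of exclude matching (assumed semantics, not the
# exact implementation): a job is dropped if every field of an exclude
# entry is present in the job with the same value.
exclude_entry = {'ctk': '11.1', 'std': 17}
job = {'jobs': ['test'], 'ctk': '11.1', 'std': 17, 'cxx': 'gcc6'}

is_excluded = all(job.get(key) == value for key, value in exclude_entry.items())
assert is_excluded  # this job would be removed from the final matrix
```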
@@ -256,6 +270,7 @@ tags:
   project: { default: ['libcudacxx', 'cub', 'thrust'] }
   # C++ standard
   # If set to 'all', all stds supported by the ctk/compilers/project are used.
+  # If set to 'min', 'max', or 'minmax', the minimum, maximum, or both stds are used.
   # If set, will be passed to script with `-std <std>`.
   std: { required: false }
   # GPU architecture
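To make the new `std` tag documentation concrete, here is a hypothetical matrix entry (not one from this commit) and the rewrite `set_derived_tags()` would apply to it, assuming the supported standards work out to `[11, 14, 17, 20]`:

```python
# Hypothetical matrix entry using the new shorthand (illustration only):
matrix_job = {'jobs': ['test'], 'project': 'thrust', 'std': 'minmax', 'cxx': 'gcc'}

# Assuming lookup_supported_stds(matrix_job) returns [11, 14, 17, 20],
# set_derived_tags() rewrites 'std' to only the two extremes:
#   {'jobs': ['test'], 'project': 'thrust', 'std': [11, 20], 'cxx': 'gcc'}
# so the job tests the oldest and newest dialects instead of all four.
```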
2 changes: 1 addition & 1 deletion cub/cub/agent/agent_radix_sort_histogram.cuh
@@ -242,7 +242,7 @@ struct AgentRadixSortHistogram
// Within a portion, avoid overflowing (u)int32 counters.
// Between portions, accumulate results in global memory.
constexpr OffsetT MAX_PORTION_SIZE = 1 << 30;
-OffsetT num_portions = cub::DivideAndRoundUp(num_items, MAX_PORTION_SIZE);
+OffsetT num_portions = ::cuda::ceil_div(num_items, MAX_PORTION_SIZE);
for (OffsetT portion = 0; portion < num_portions; ++portion)
{
// Reset the counters.
2 changes: 1 addition & 1 deletion cub/cub/device/device_for.cuh
@@ -154,7 +154,7 @@ private:

if (is_aligned<typename wrapped_op_t::vector_t>(unwrapped_first))
{ // Vectorize loads
-const OffsetT num_vec_items = cub::DivideAndRoundUp(num_items, wrapped_op_t::vec_size);
+const OffsetT num_vec_items = ::cuda::ceil_div(num_items, wrapped_op_t::vec_size);

return detail::for_each::dispatch_t<OffsetT, wrapped_op_t>::dispatch(
num_vec_items,
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
@@ -205,7 +205,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy
do
{
constexpr int tile_size = AdjacentDifferencePolicyT::ITEMS_PER_TILE;
-const int num_tiles = static_cast<int>(DivideAndRoundUp(num_items, tile_size));
+const int num_tiles = static_cast<int>(::cuda::ceil_div(num_items, tile_size));

std::size_t first_tile_previous_size = MayAlias * num_tiles * sizeof(InputT);

@@ -244,7 +244,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy
using AgentDifferenceInitT = AgentDifferenceInit<InputIteratorT, InputT, OffsetT, ReadLeft>;

constexpr int init_block_size = AgentDifferenceInitT::BLOCK_THREADS;
-const int init_grid_size = DivideAndRoundUp(num_tiles, init_block_size);
+const int init_grid_size = ::cuda::ceil_div(num_tiles, init_block_size);

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
_CubLog("Invoking DeviceAdjacentDifferenceInitKernel"
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
@@ -453,7 +453,7 @@ struct DispatchBatchMemcpy : SelectedPolicy
* ActivePolicyT::AgentSmallBufferPolicyT::BUFFERS_PER_THREAD;

// The number of thread blocks (or tiles) required to process all of the given buffers
-BlockOffsetT num_tiles = DivideAndRoundUp(num_buffers, TILE_SIZE);
+BlockOffsetT num_tiles = ::cuda::ceil_div(num_buffers, TILE_SIZE);

using BlevBufferSrcsOutT = ::cuda::std::_If<IsMemcpy, void*, cub::detail::value_t<InputBufferIt>>;
using BlevBufferDstOutT = ::cuda::std::_If<IsMemcpy, void*, cub::detail::value_t<OutputBufferIt>>;
@@ -528,7 +528,7 @@ struct DispatchBatchMemcpy : SelectedPolicy
BlevBufferTileOffsetsOutItT d_blev_block_offsets = blev_buffer_block_alloc.get();

// Kernels' grid sizes
-BlockOffsetT init_grid_size = DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS);
+BlockOffsetT init_grid_size = ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS);
BlockOffsetT batch_memcpy_grid_size = num_tiles;

// Kernels
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_for.cuh
@@ -99,7 +99,7 @@ struct dispatch_t : PolicyHubT
constexpr int items_per_thread = ActivePolicyT::for_policy_t::items_per_thread;

const auto tile_size = static_cast<OffsetT>(block_threads * items_per_thread);
-const auto num_tiles = cub::DivideAndRoundUp(num_items, tile_size);
+const auto num_tiles = ::cuda::ceil_div(num_items, tile_size);

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
_CubLog("Invoking detail::for_each::dynamic_kernel<<<%d, %d, 0, %lld>>>(), "
@@ -144,7 +144,7 @@ struct dispatch_t : PolicyHubT
constexpr int items_per_thread = ActivePolicyT::for_policy_t::items_per_thread;

const auto tile_size = static_cast<OffsetT>(block_threads * items_per_thread);
-const auto num_tiles = cub::DivideAndRoundUp(num_items, tile_size);
+const auto num_tiles = ::cuda::ceil_div(num_items, tile_size);

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
_CubLog("Invoking detail::for_each::static_kernel<<<%d, %d, 0, %lld>>>(), "
2 changes: 1 addition & 1 deletion cub/cub/device/dispatch/dispatch_histogram.cuh
@@ -361,7 +361,7 @@ struct dispatch_histogram

// Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy
int pixels_per_tile = block_threads * pixels_per_thread;
-int tiles_per_row = static_cast<int>(cub::DivideAndRoundUp(num_row_pixels, pixels_per_tile));
+int tiles_per_row = static_cast<int>(::cuda::ceil_div(num_row_pixels, pixels_per_tile));
int blocks_per_row = CUB_MIN(histogram_sweep_occupancy, tiles_per_row);
int blocks_per_col =
(blocks_per_row > 0) ? int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) : 0;
5 changes: 2 additions & 3 deletions cub/cub/device/dispatch/dispatch_merge.cuh
@@ -238,7 +238,7 @@ struct dispatch_t
typename choose_merge_agent<merge_policy_t, KeyIt1, ValueIt1, KeyIt2, ValueIt2, KeyIt3, ValueIt3, Offset, CompareOp>::
type;

-const auto num_tiles = cub::DivideAndRoundUp(num_items1 + num_items2, agent_t::policy::ITEMS_PER_TILE);
+const auto num_tiles = ::cuda::ceil_div(num_items1 + num_items2, agent_t::policy::ITEMS_PER_TILE);
void* allocations[2] = {nullptr, nullptr};
{
const std::size_t merge_partitions_size = (1 + num_tiles) * sizeof(Offset);
@@ -263,8 +263,7 @@ struct dispatch_t
{
const Offset num_partitions = num_tiles + 1;
constexpr int threads_per_partition_block = 256; // TODO(bgruber): no policy?
-const int partition_grid_size =
-  static_cast<int>(cub::DivideAndRoundUp(num_partitions, threads_per_partition_block));
+const int partition_grid_size = static_cast<int>(::cuda::ceil_div(num_partitions, threads_per_partition_block));

auto error = CubDebug(
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(