Merge branch 'main' into increase_timeout_libcu++
alliepiper committed Sep 19, 2024
2 parents (f6de260 + ee94bb9), commit 89b5ac5
Showing 44 changed files with 389 additions and 141 deletions.
32 changes: 23 additions & 9 deletions .github/actions/workflow-build/build-workflow.py
@@ -334,6 +334,8 @@ def lookup_supported_stds(matrix_job):
     if 'project' in matrix_job:
         project = get_project(matrix_job['project'])
         stds = stds & set(project['stds'])
+    if len(stds) == 0:
+        raise Exception(error_message_with_matrix_job(matrix_job, "No supported stds found."))
     return sorted(list(stds))


@@ -626,18 +628,18 @@ def finalize_workflow_dispatch_groups(workflow_dispatch_groups_orig):
             matching_consumers = merged_consumers[producer_index]

             producer_name = producer['name']
-            print(f"::notice file=ci/matrix.yaml::Merging consumers for duplicate producer '{producer_name}' in '{group_name}'",
+            print(f"::notice::Merging consumers for duplicate producer '{producer_name}' in '{group_name}'",
                   file=sys.stderr)
             consumer_names = ", ".join([consumer['name'] for consumer in matching_consumers])
-            print(f"::notice file=ci/matrix.yaml::Original consumers: {consumer_names}", file=sys.stderr)
+            print(f"::notice::Original consumers: {consumer_names}", file=sys.stderr)
             consumer_names = ", ".join([consumer['name'] for consumer in consumers])
-            print(f"::notice file=ci/matrix.yaml::Duplicate consumers: {consumer_names}", file=sys.stderr)
+            print(f"::notice::Duplicate consumers: {consumer_names}", file=sys.stderr)
             # Merge if unique:
             for consumer in consumers:
                 if not dispatch_job_in_container(consumer, matching_consumers):
                     matching_consumers.append(consumer)
             consumer_names = ", ".join([consumer['name'] for consumer in matching_consumers])
-            print(f"::notice file=ci/matrix.yaml::Merged consumers: {consumer_names}", file=sys.stderr)
+            print(f"::notice::Merged consumers: {consumer_names}", file=sys.stderr)
         else:
             merged_producers.append(producer)
             merged_consumers.append(consumers)
@@ -653,7 +655,7 @@ def finalize_workflow_dispatch_groups(workflow_dispatch_groups_orig):
         unique_standalone_jobs = []
         for job_json in standalone_jobs:
             if dispatch_job_in_container(job_json, unique_standalone_jobs):
-                print(f"::notice file=ci/matrix.yaml::Removing duplicate standalone job '{job_json['name']}' in '{group_name}'",
+                print(f"::notice::Removing duplicate standalone job '{job_json['name']}' in '{group_name}'",
                       file=sys.stderr)
             else:
                 unique_standalone_jobs.append(job_json)
@@ -663,12 +665,12 @@ def finalize_workflow_dispatch_groups(workflow_dispatch_groups_orig):
         for two_stage_job in two_stage_jobs:
             for producer in two_stage_job['producers']:
                 if remove_dispatch_job_from_container(producer, unique_standalone_jobs):
-                    print(f"::notice file=ci/matrix.yaml::Removing standalone job '{producer['name']}' " +
+                    print(f"::notice::Removing standalone job '{producer['name']}' " +
                           f"as it appears as a producer in '{group_name}'",
                           file=sys.stderr)
             for consumer in two_stage_job['consumers']:
                 if remove_dispatch_job_from_container(producer, unique_standalone_jobs):
-                    print(f"::notice file=ci/matrix.yaml::Removing standalone job '{consumer['name']}' " +
+                    print(f"::notice::Removing standalone job '{consumer['name']}' " +
                           f"as it appears as a consumer in '{group_name}'",
                           file=sys.stderr)
         standalone_jobs = list(unique_standalone_jobs)
@@ -864,8 +866,20 @@ def set_derived_tags(matrix_job):
     gpu = get_gpu(matrix_job['gpu'])
     matrix_job['sm'] = gpu['sm']

-    if 'std' in matrix_job and matrix_job['std'] == 'all':
-        matrix_job['std'] = lookup_supported_stds(matrix_job)
+    if 'std' in matrix_job:
+        if matrix_job['std'] == 'all':
+            matrix_job['std'] = lookup_supported_stds(matrix_job)
+        elif matrix_job['std'] == 'min':
+            matrix_job['std'] = min(lookup_supported_stds(matrix_job))
+        elif matrix_job['std'] == 'max':
+            matrix_job['std'] = max(lookup_supported_stds(matrix_job))
+        elif matrix_job['std'] == 'minmax':
+            stds = lookup_supported_stds(matrix_job)
+            if len(stds) == 1:
+                matrix_job['std'] = stds[0]
+            else:
+                matrix_job['std'] = [min(stds), max(stds)]
+

     # Add all deps before applying project job maps:
     for job in matrix_job['jobs']:
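The new `std` shorthands above are easiest to see in isolation. The following standalone sketch mirrors the logic added to `lookup_supported_stds` and `set_derived_tags`; the supported-standards list `[11, 14, 17, 20]` is an assumed example value, not something taken from this commit:

```python
# Sketch of the new 'std' shorthand resolution. SUPPORTED_STDS stands in for
# the ctk/compiler/project intersection computed by lookup_supported_stds().
SUPPORTED_STDS = [11, 14, 17, 20]

def resolve_std(std):
    stds = sorted(SUPPORTED_STDS)
    if len(stds) == 0:
        # Mirrors the new guard added to lookup_supported_stds():
        raise Exception("No supported stds found.")
    if std == 'all':
        return stds
    elif std == 'min':
        return min(stds)
    elif std == 'max':
        return max(stds)
    elif std == 'minmax':
        return stds[0] if len(stds) == 1 else [min(stds), max(stds)]
    return std  # explicit values pass through unchanged

assert resolve_std('all') == [11, 14, 17, 20]
assert resolve_std('min') == 11
assert resolve_std('max') == 20
assert resolve_std('minmax') == [11, 20]
```

This is what lets the matrix below trade `std: 'all'` for `minmax`/`max` in pull-request jobs while nightly jobs keep full coverage.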
8 changes: 4 additions & 4 deletions README.md
@@ -3,9 +3,9 @@
|[Contributor Guide](https://github.com/NVIDIA/cccl/blob/main/CONTRIBUTING.md)|[Dev Containers](https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md)|[Discord](https://discord.gg/nvidiadeveloper)|[Godbolt](https://godbolt.org/z/x4G73af9a)|[GitHub Project](https://github.com/orgs/NVIDIA/projects/6)|[Documentation](https://nvidia.github.io/cccl)|
|-|-|-|-|-|-|

-# CUDA C++ Core Libraries (CCCL)
+# CUDA Core Compute Libraries (CCCL)

-Welcome to the CUDA C++ Core Libraries (CCCL) where our mission is to make CUDA C++ more delightful.
+Welcome to the CUDA Core Compute Libraries (CCCL) where our mission is to make CUDA more delightful.

This repository unifies three essential CUDA C++ libraries into a single, convenient repository:

@@ -19,7 +19,7 @@ For more information about the decision to unify these projects, see the [announ

## Overview

-The concept for the CUDA C++ Core Libraries (CCCL) grew organically out of the Thrust, CUB, and libcudacxx projects that were developed independently over the years with a similar goal: to provide high-quality, high-performance, and easy-to-use C++ abstractions for CUDA developers.
+The concept for the CUDA Core Compute Libraries (CCCL) grew organically out of the Thrust, CUB, and libcudacxx projects that were developed independently over the years with a similar goal: to provide high-quality, high-performance, and easy-to-use C++ abstractions for CUDA developers.
Naturally, there was a lot of overlap among the three projects, and it became clear the community would be better served by unifying them into a single repository.

- **Thrust** is the C++ parallel algorithms library which inspired the introduction of parallel algorithms to the C++ Standard Library. Thrust's high-level interface greatly enhances programmer productivity while enabling performance portability between GPUs and multicore CPUs via configurable backends that allow using multiple parallel programming frameworks (such as CUDA, TBB, and OpenMP).
@@ -413,7 +413,7 @@ For a detailed overview of the CI pipeline, see [ci-overview.md](ci-overview.md)
## Related Projects
-Projects that are related to CCCL's mission to make CUDA C++ more delightful:
+Projects that are related to CCCL's mission to make CUDA more delightful:
- [cuCollections](https://github.com/NVIDIA/cuCollections) - GPU accelerated data structures like hash tables
- [NVBench](https://github.com/NVIDIA/nvbench) - Benchmarking library tailored for CUDA applications
- [stdexec](https://github.com/nvidia/stdexec) - Reference implementation for Senders asynchronous programming model
57 changes: 36 additions & 21 deletions ci/matrix.yaml
@@ -13,11 +13,17 @@ workflows:
     # Old CTK
     - {jobs: ['build'], std: 'all', ctk: '11.1', cxx: ['gcc6', 'gcc7', 'gcc8', 'gcc9', 'clang9', 'msvc2017']}
     - {jobs: ['build'], std: 'all', ctk: '11.8', cxx: ['gcc11'], sm: '60;70;80;90'}
-    # Current CTK
+    # Current CTK build-only
     - {jobs: ['build'], std: 'all', cxx: ['gcc7', 'gcc8', 'gcc9', 'gcc10', 'gcc11', 'gcc12']}
     - {jobs: ['build'], std: 'all', cxx: ['clang9', 'clang10', 'clang11', 'clang12', 'clang13', 'clang14', 'clang15', 'clang16', 'clang17']}
     - {jobs: ['build'], std: 'all', cxx: ['intel', 'msvc2019']}
-    - {jobs: ['test'], std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022']}
+    # Current CTK testing:
+    - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'minmax', cxx: ['gcc']}
+    - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['clang', 'msvc']}
+    # Split up cub tests:
+    - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'minmax', cxx: ['gcc']}
+    - {jobs: ['test_lid1', 'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc']}
+    - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['clang', 'msvc']}
     # Modded builds:
     - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'}
     - {jobs: ['build'], std: 'all', cxx: ['gcc'], sm: '90a'}
@@ -36,33 +42,41 @@ workflows:
     - {jobs: ['build'], project: 'cudax', ctk: ['12.0' ], std: 17, cxx: ['gcc12'], sm: "90"}
     - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 17, cxx: ['gcc13'], sm: "90a"}
     - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc13', 'clang16'], cpu: 'arm64'}
-    - {jobs: ['test'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc12']}
-    - {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'all', cxx: ['clang14']}
-    - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang18']}
+    - {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'min', cxx: ['gcc12']}
+    - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc12']}
+    - {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'max', cxx: ['clang14']}
+    - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'max', cxx: ['clang18']}
     # Python jobs:
     - {jobs: ['test'], project: 'pycuda', ctk: ['12.5']}
     # cccl-infra:
     - {jobs: ['infra'], project: 'cccl', ctk: '11.1', cxx: ['gcc6', 'clang9']}
     - {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc', 'clang']}
-    # Edge-case jobs
-    - {jobs: ['limited'], project: 'cub', std: 17}

   nightly:
-    - {jobs: ['test'], ctk: '11.1', gpu: 'v100', sm: 'gpu', cxx: 'gcc6', std: [11]}
-    - {jobs: ['test'], ctk: '11.1', gpu: 't4', sm: 'gpu', cxx: 'clang9', std: [17]}
-    - {jobs: ['test'], ctk: '11.8', gpu: 'rtx2080', sm: 'gpu', cxx: 'gcc11', std: [17]}
-    - {jobs: ['test'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7', std: [14]}
-    - {jobs: ['test'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc13', std: 'all'}
-    - {jobs: ['test'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang9', std: [11]}
-    # H100 runners are currently flakey, only build since those use CPU-only runners:
-    - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc12', std: [11, 20]}
-    - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'clang18', std: [17]}
+    # Increased test coverage compared to nightlies:
+    - {jobs: ['test'], std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022']}
+    - {jobs: ['test'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc12']}
+    - {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'all', cxx: ['clang14']}
+    - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang18']}
+    # Edge-case jobs
+    - {jobs: ['limited'], project: 'cub', std: 17}

-    # nvrtc:
-    - {jobs: ['nvrtc'], ctk: 'curr', gpu: 't4', sm: 'gpu', cxx: 'gcc13', std: [20], project: ['libcudacxx']}
-    - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc13', std: [20], project: ['libcudacxx']}
-    - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc13', std: 'all', project: ['libcudacxx']}
-    - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc13', std: [11, 20], project: ['libcudacxx']}
+    # # These are waiting on the NVKS nodes:
+    # - {jobs: ['test'], ctk: '11.1', gpu: 'v100', sm: 'gpu', cxx: 'gcc6', std: [11]}
+    # - {jobs: ['test'], ctk: '11.1', gpu: 't4', sm: 'gpu', cxx: 'clang9', std: [17]}
+    # - {jobs: ['test'], ctk: '11.8', gpu: 'rtx2080', sm: 'gpu', cxx: 'gcc11', std: [17]}
+    # - {jobs: ['test'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7', std: [14]}
+    # - {jobs: ['test'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc13', std: 'all'}
+    # - {jobs: ['test'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang9', std: [11]}
+    # # H100 runners are currently flakey, only build since those use CPU-only runners:
+    # - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc12', std: [11, 20]}
+    # - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'clang18', std: [17]}
+    #
+    # # nvrtc:
+    # - {jobs: ['nvrtc'], ctk: 'curr', gpu: 't4', sm: 'gpu', cxx: 'gcc13', std: [20], project: ['libcudacxx']}
+    # - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc13', std: [20], project: ['libcudacxx']}
+    # - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc13', std: 'all', project: ['libcudacxx']}
+    # - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc13', std: [11, 20], project: ['libcudacxx']}

 # Any generated jobs that match the entries in `exclude` will be removed from the final matrix for all workflows.
 exclude:
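The `exclude` comment above is terse. As a rough sketch of the assumed semantics (simplified from build-workflow.py's actual tag matching), an exclude entry removes any generated job whose fields match all of the entry's fields:

```python
# Simplified illustration of exclude matching (assumed semantics, not the
# exact implementation): a job is dropped if every field of an exclude
# entry is present in the job with the same value.
exclude_entry = {'ctk': '11.1', 'std': 17}
job = {'jobs': ['test'], 'ctk': '11.1', 'std': 17, 'cxx': 'gcc6'}

is_excluded = all(job.get(key) == value for key, value in exclude_entry.items())
assert is_excluded  # this job would be removed from the final matrix
```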
@@ -256,6 +270,7 @@ tags:
   project: { default: ['libcudacxx', 'cub', 'thrust'] }
   # C++ standard
   # If set to 'all', all stds supported by the ctk/compilers/project are used.
+  # If set to 'min', 'max', or 'minmax', the minimum, maximum, or both stds are used.
   # If set, will be passed to script with `-std <std>`.
   std: { required: false }
   # GPU architecture
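To make the new `std` tag documentation concrete, here is a hypothetical matrix entry (not one from this commit) and the rewrite `set_derived_tags()` would apply to it, assuming the supported standards work out to `[11, 14, 17, 20]`:

```python
# Hypothetical matrix entry using the new shorthand (illustration only):
matrix_job = {'jobs': ['test'], 'project': 'thrust', 'std': 'minmax', 'cxx': 'gcc'}

# Assuming lookup_supported_stds(matrix_job) returns [11, 14, 17, 20],
# set_derived_tags() rewrites 'std' to only the two extremes:
#   {'jobs': ['test'], 'project': 'thrust', 'std': [11, 20], 'cxx': 'gcc'}
# so the job tests the oldest and newest dialects instead of all four.
```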
2 changes: 1 addition & 1 deletion cub/cub/agent/agent_radix_sort_histogram.cuh
@@ -242,7 +242,7 @@ struct AgentRadixSortHistogram
// Within a portion, avoid overflowing (u)int32 counters.
// Between portions, accumulate results in global memory.
constexpr OffsetT MAX_PORTION_SIZE = 1 << 30;
-OffsetT num_portions = cub::DivideAndRoundUp(num_items, MAX_PORTION_SIZE);
+OffsetT num_portions = ::cuda::ceil_div(num_items, MAX_PORTION_SIZE);
for (OffsetT portion = 0; portion < num_portions; ++portion)
{
// Reset the counters.
2 changes: 1 addition & 1 deletion cub/cub/device/device_for.cuh
@@ -154,7 +154,7 @@ private:

if (is_aligned<typename wrapped_op_t::vector_t>(unwrapped_first))
{ // Vectorize loads
-const OffsetT num_vec_items = cub::DivideAndRoundUp(num_items, wrapped_op_t::vec_size);
+const OffsetT num_vec_items = ::cuda::ceil_div(num_items, wrapped_op_t::vec_size);

return detail::for_each::dispatch_t<OffsetT, wrapped_op_t>::dispatch(
num_vec_items,
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
@@ -205,7 +205,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy
do
{
constexpr int tile_size = AdjacentDifferencePolicyT::ITEMS_PER_TILE;
-const int num_tiles = static_cast<int>(DivideAndRoundUp(num_items, tile_size));
+const int num_tiles = static_cast<int>(::cuda::ceil_div(num_items, tile_size));

std::size_t first_tile_previous_size = MayAlias * num_tiles * sizeof(InputT);

@@ -244,7 +244,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy
using AgentDifferenceInitT = AgentDifferenceInit<InputIteratorT, InputT, OffsetT, ReadLeft>;

constexpr int init_block_size = AgentDifferenceInitT::BLOCK_THREADS;
-const int init_grid_size = DivideAndRoundUp(num_tiles, init_block_size);
+const int init_grid_size = ::cuda::ceil_div(num_tiles, init_block_size);

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
_CubLog("Invoking DeviceAdjacentDifferenceInitKernel"
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
@@ -453,7 +453,7 @@ struct DispatchBatchMemcpy : SelectedPolicy
* ActivePolicyT::AgentSmallBufferPolicyT::BUFFERS_PER_THREAD;

// The number of thread blocks (or tiles) required to process all of the given buffers
-BlockOffsetT num_tiles = DivideAndRoundUp(num_buffers, TILE_SIZE);
+BlockOffsetT num_tiles = ::cuda::ceil_div(num_buffers, TILE_SIZE);

using BlevBufferSrcsOutT = ::cuda::std::_If<IsMemcpy, void*, cub::detail::value_t<InputBufferIt>>;
using BlevBufferDstOutT = ::cuda::std::_If<IsMemcpy, void*, cub::detail::value_t<OutputBufferIt>>;
@@ -528,7 +528,7 @@ struct DispatchBatchMemcpy : SelectedPolicy
BlevBufferTileOffsetsOutItT d_blev_block_offsets = blev_buffer_block_alloc.get();

// Kernels' grid sizes
-BlockOffsetT init_grid_size = DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS);
+BlockOffsetT init_grid_size = ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS);
BlockOffsetT batch_memcpy_grid_size = num_tiles;

// Kernels
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_for.cuh
@@ -99,7 +99,7 @@ struct dispatch_t : PolicyHubT
constexpr int items_per_thread = ActivePolicyT::for_policy_t::items_per_thread;

const auto tile_size = static_cast<OffsetT>(block_threads * items_per_thread);
-const auto num_tiles = cub::DivideAndRoundUp(num_items, tile_size);
+const auto num_tiles = ::cuda::ceil_div(num_items, tile_size);

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
_CubLog("Invoking detail::for_each::dynamic_kernel<<<%d, %d, 0, %lld>>>(), "
@@ -144,7 +144,7 @@ struct dispatch_t : PolicyHubT
constexpr int items_per_thread = ActivePolicyT::for_policy_t::items_per_thread;

const auto tile_size = static_cast<OffsetT>(block_threads * items_per_thread);
-const auto num_tiles = cub::DivideAndRoundUp(num_items, tile_size);
+const auto num_tiles = ::cuda::ceil_div(num_items, tile_size);

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
_CubLog("Invoking detail::for_each::static_kernel<<<%d, %d, 0, %lld>>>(), "
2 changes: 1 addition & 1 deletion cub/cub/device/dispatch/dispatch_histogram.cuh
@@ -361,7 +361,7 @@ struct dispatch_histogram

// Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy
int pixels_per_tile = block_threads * pixels_per_thread;
-int tiles_per_row = static_cast<int>(cub::DivideAndRoundUp(num_row_pixels, pixels_per_tile));
+int tiles_per_row = static_cast<int>(::cuda::ceil_div(num_row_pixels, pixels_per_tile));
int blocks_per_row = CUB_MIN(histogram_sweep_occupancy, tiles_per_row);
int blocks_per_col =
(blocks_per_row > 0) ? int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) : 0;
5 changes: 2 additions & 3 deletions cub/cub/device/dispatch/dispatch_merge.cuh
@@ -238,7 +238,7 @@ struct dispatch_t
typename choose_merge_agent<merge_policy_t, KeyIt1, ValueIt1, KeyIt2, ValueIt2, KeyIt3, ValueIt3, Offset, CompareOp>::
type;

-const auto num_tiles = cub::DivideAndRoundUp(num_items1 + num_items2, agent_t::policy::ITEMS_PER_TILE);
+const auto num_tiles = ::cuda::ceil_div(num_items1 + num_items2, agent_t::policy::ITEMS_PER_TILE);
void* allocations[2] = {nullptr, nullptr};
{
const std::size_t merge_partitions_size = (1 + num_tiles) * sizeof(Offset);
@@ -263,8 +263,7 @@ struct dispatch_t
{
const Offset num_partitions = num_tiles + 1;
constexpr int threads_per_partition_block = 256; // TODO(bgruber): no policy?
-const int partition_grid_size =
-  static_cast<int>(cub::DivideAndRoundUp(num_partitions, threads_per_partition_block));
+const int partition_grid_size = static_cast<int>(::cuda::ceil_div(num_partitions, threads_per_partition_block));

auto error = CubDebug(
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(