Skip to content

Commit

Permalink
Reduce PR testing matrix. (#2436)
Browse files Browse the repository at this point in the history
* Remove file annotation from verbose matrix warnings.

* Allow 'min', 'max', 'minmax' values for matrix `std` tags.

* Error when no supported stds available.

* Reduce PR testing matrix.

1. Temporarily remove current nightly matrix pending NVKS bringup.
2. Move current per-PR matrix to nightly.
3. Reduce the number of jobs in the PR matrix while maintaining decent coverage.

Before: (total jobs: 437)
|  320 | `linux-amd64-cpu16`
|   66 | `linux-amd64-gpu-v100-latest-1`
|   28 | `linux-arm64-cpu16`
|   23 | `windows-amd64-cpu16`

After: (total jobs: 183)
|  126 | `linux-amd64-cpu16`
|   26 | `linux-amd64-gpu-v100-latest-1`
|   21 | `windows-amd64-cpu16`
|   10 | `linux-arm64-cpu16`

* Restore old build matrix.
  • Loading branch information
alliepiper committed Sep 19, 2024
1 parent b07f036 commit ee94bb9
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 30 deletions.
32 changes: 23 additions & 9 deletions .github/actions/workflow-build/build-workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,8 @@ def lookup_supported_stds(matrix_job):
if 'project' in matrix_job:
project = get_project(matrix_job['project'])
stds = stds & set(project['stds'])
if len(stds) == 0:
raise Exception(error_message_with_matrix_job(matrix_job, "No supported stds found."))
return sorted(list(stds))


Expand Down Expand Up @@ -626,18 +628,18 @@ def finalize_workflow_dispatch_groups(workflow_dispatch_groups_orig):
matching_consumers = merged_consumers[producer_index]

producer_name = producer['name']
print(f"::notice file=ci/matrix.yaml::Merging consumers for duplicate producer '{producer_name}' in '{group_name}'",
print(f"::notice::Merging consumers for duplicate producer '{producer_name}' in '{group_name}'",
file=sys.stderr)
consumer_names = ", ".join([consumer['name'] for consumer in matching_consumers])
print(f"::notice file=ci/matrix.yaml::Original consumers: {consumer_names}", file=sys.stderr)
print(f"::notice::Original consumers: {consumer_names}", file=sys.stderr)
consumer_names = ", ".join([consumer['name'] for consumer in consumers])
print(f"::notice file=ci/matrix.yaml::Duplicate consumers: {consumer_names}", file=sys.stderr)
print(f"::notice::Duplicate consumers: {consumer_names}", file=sys.stderr)
# Merge if unique:
for consumer in consumers:
if not dispatch_job_in_container(consumer, matching_consumers):
matching_consumers.append(consumer)
consumer_names = ", ".join([consumer['name'] for consumer in matching_consumers])
print(f"::notice file=ci/matrix.yaml::Merged consumers: {consumer_names}", file=sys.stderr)
print(f"::notice::Merged consumers: {consumer_names}", file=sys.stderr)
else:
merged_producers.append(producer)
merged_consumers.append(consumers)
Expand All @@ -653,7 +655,7 @@ def finalize_workflow_dispatch_groups(workflow_dispatch_groups_orig):
unique_standalone_jobs = []
for job_json in standalone_jobs:
if dispatch_job_in_container(job_json, unique_standalone_jobs):
print(f"::notice file=ci/matrix.yaml::Removing duplicate standalone job '{job_json['name']}' in '{group_name}'",
print(f"::notice::Removing duplicate standalone job '{job_json['name']}' in '{group_name}'",
file=sys.stderr)
else:
unique_standalone_jobs.append(job_json)
Expand All @@ -663,12 +665,12 @@ def finalize_workflow_dispatch_groups(workflow_dispatch_groups_orig):
for two_stage_job in two_stage_jobs:
for producer in two_stage_job['producers']:
if remove_dispatch_job_from_container(producer, unique_standalone_jobs):
print(f"::notice file=ci/matrix.yaml::Removing standalone job '{producer['name']}' " +
print(f"::notice::Removing standalone job '{producer['name']}' " +
f"as it appears as a producer in '{group_name}'",
file=sys.stderr)
for consumer in two_stage_job['consumers']:
if remove_dispatch_job_from_container(producer, unique_standalone_jobs):
print(f"::notice file=ci/matrix.yaml::Removing standalone job '{consumer['name']}' " +
print(f"::notice::Removing standalone job '{consumer['name']}' " +
f"as it appears as a consumer in '{group_name}'",
file=sys.stderr)
standalone_jobs = list(unique_standalone_jobs)
Expand Down Expand Up @@ -864,8 +866,20 @@ def set_derived_tags(matrix_job):
gpu = get_gpu(matrix_job['gpu'])
matrix_job['sm'] = gpu['sm']

if 'std' in matrix_job and matrix_job['std'] == 'all':
matrix_job['std'] = lookup_supported_stds(matrix_job)
if 'std' in matrix_job:
if matrix_job['std'] == 'all':
matrix_job['std'] = lookup_supported_stds(matrix_job)
elif matrix_job['std'] == 'min':
matrix_job['std'] = min(lookup_supported_stds(matrix_job))
elif matrix_job['std'] == 'max':
matrix_job['std'] = max(lookup_supported_stds(matrix_job))
elif matrix_job['std'] == 'minmax':
stds = lookup_supported_stds(matrix_job)
if len(stds) == 1:
matrix_job['std'] = stds[0]
else:
matrix_job['std'] = [min(stds), max(stds)]


# Add all deps before applying project job maps:
for job in matrix_job['jobs']:
Expand Down
57 changes: 36 additions & 21 deletions ci/matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,17 @@ workflows:
# Old CTK
- {jobs: ['build'], std: 'all', ctk: '11.1', cxx: ['gcc6', 'gcc7', 'gcc8', 'gcc9', 'clang9', 'msvc2017']}
- {jobs: ['build'], std: 'all', ctk: '11.8', cxx: ['gcc11'], sm: '60;70;80;90'}
# Current CTK
# Current CTK build-only
- {jobs: ['build'], std: 'all', cxx: ['gcc7', 'gcc8', 'gcc9', 'gcc10', 'gcc11', 'gcc12']}
- {jobs: ['build'], std: 'all', cxx: ['clang9', 'clang10', 'clang11', 'clang12', 'clang13', 'clang14', 'clang15', 'clang16', 'clang17']}
- {jobs: ['build'], std: 'all', cxx: ['intel', 'msvc2019']}
- {jobs: ['test'], std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022']}
# Current CTK testing:
- {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'minmax', cxx: ['gcc']}
- {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['clang', 'msvc']}
# Split up cub tests:
- {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'minmax', cxx: ['gcc']}
- {jobs: ['test_lid1', 'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc']}
- {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['clang', 'msvc']}
# Modded builds:
- {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'}
- {jobs: ['build'], std: 'all', cxx: ['gcc'], sm: '90a'}
Expand All @@ -36,33 +42,41 @@ workflows:
- {jobs: ['build'], project: 'cudax', ctk: ['12.0' ], std: 17, cxx: ['gcc12'], sm: "90"}
- {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 17, cxx: ['gcc13'], sm: "90a"}
- {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc13', 'clang16'], cpu: 'arm64'}
- {jobs: ['test'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc12']}
- {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'all', cxx: ['clang14']}
- {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang18']}
- {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'min', cxx: ['gcc12']}
- {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc12']}
- {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'max', cxx: ['clang14']}
- {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'max', cxx: ['clang18']}
# Python jobs:
- {jobs: ['test'], project: 'pycuda', ctk: ['12.5']}
# cccl-infra:
- {jobs: ['infra'], project: 'cccl', ctk: '11.1', cxx: ['gcc6', 'clang9']}
- {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc', 'clang']}
# Edge-case jobs
- {jobs: ['limited'], project: 'cub', std: 17}

nightly:
- {jobs: ['test'], ctk: '11.1', gpu: 'v100', sm: 'gpu', cxx: 'gcc6', std: [11]}
- {jobs: ['test'], ctk: '11.1', gpu: 't4', sm: 'gpu', cxx: 'clang9', std: [17]}
- {jobs: ['test'], ctk: '11.8', gpu: 'rtx2080', sm: 'gpu', cxx: 'gcc11', std: [17]}
- {jobs: ['test'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7', std: [14]}
- {jobs: ['test'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc13', std: 'all'}
- {jobs: ['test'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang9', std: [11]}
# H100 runners are currently flakey, only build since those use CPU-only runners:
- {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc12', std: [11, 20]}
- {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'clang18', std: [17]}
# Increased test coverage compared to nightlies:
- {jobs: ['test'], std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022']}
- {jobs: ['test'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc12']}
- {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'all', cxx: ['clang14']}
- {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang18']}
# Edge-case jobs
- {jobs: ['limited'], project: 'cub', std: 17}

# nvrtc:
- {jobs: ['nvrtc'], ctk: 'curr', gpu: 't4', sm: 'gpu', cxx: 'gcc13', std: [20], project: ['libcudacxx']}
- {jobs: ['nvrtc'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc13', std: [20], project: ['libcudacxx']}
- {jobs: ['nvrtc'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc13', std: 'all', project: ['libcudacxx']}
- {jobs: ['nvrtc'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc13', std: [11, 20], project: ['libcudacxx']}
# # These are waiting on the NVKS nodes:
# - {jobs: ['test'], ctk: '11.1', gpu: 'v100', sm: 'gpu', cxx: 'gcc6', std: [11]}
# - {jobs: ['test'], ctk: '11.1', gpu: 't4', sm: 'gpu', cxx: 'clang9', std: [17]}
# - {jobs: ['test'], ctk: '11.8', gpu: 'rtx2080', sm: 'gpu', cxx: 'gcc11', std: [17]}
# - {jobs: ['test'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7', std: [14]}
# - {jobs: ['test'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc13', std: 'all'}
# - {jobs: ['test'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang9', std: [11]}
# # H100 runners are currently flakey, only build since those use CPU-only runners:
# - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc12', std: [11, 20]}
# - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'clang18', std: [17]}
#
# # nvrtc:
# - {jobs: ['nvrtc'], ctk: 'curr', gpu: 't4', sm: 'gpu', cxx: 'gcc13', std: [20], project: ['libcudacxx']}
# - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc13', std: [20], project: ['libcudacxx']}
# - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc13', std: 'all', project: ['libcudacxx']}
# - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc13', std: [11, 20], project: ['libcudacxx']}

# Any generated jobs that match the entries in `exclude` will be removed from the final matrix for all workflows.
exclude:
Expand Down Expand Up @@ -256,6 +270,7 @@ tags:
project: { default: ['libcudacxx', 'cub', 'thrust'] }
# C++ standard
# If set to 'all', all stds supported by the ctk/compilers/project are used.
# If set to 'min', 'max', or 'minmax', the minimum, maximum, or both stds are used.
# If set, will be passed to script with `-std <std>`.
std: { required: false }
# GPU architecture
Expand Down

0 comments on commit ee94bb9

Please sign in to comment.