Skip to content

Commit

Permalink
Experimental Python cooperative algorithms (#1973)
Browse files Browse the repository at this point in the history
* Python exposure of cooperative algorithms

* Update inspect changes to be aware of pycudax.

This is a bit more complex since this is the first
subproject that doesn't live in a directory with the same name as the project.

* Print working directory when running CI commands.

* Update CI for pycudax.

* Update module name in CI

This reverts commit 058bb5f2725a70fa8b4cd8ff39b84e0c4c53c2b0.

* [pre-commit.ci] auto code formatting

* Remove accidental directory

* Fix Thrust pair docs

* Fix pkg resource usage

---------

Co-authored-by: Allison Piper <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
3 people committed Jul 11, 2024
1 parent deedb00 commit 6b95f43
Show file tree
Hide file tree
Showing 43 changed files with 3,156 additions and 27 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,7 @@ CMakeUserPresets.json
/ci/rapids/.conda
/ci/rapids/.log
/ci/rapids/.repos
*.egg-info/
*.pyc
__pycache__
*.pyd
43 changes: 33 additions & 10 deletions ci/inspect_changes.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ subprojects=(
cub
thrust
cudax
pycuda
)

# ...and their dependencies:
Expand All @@ -35,6 +36,7 @@ declare -A dependencies=(
[cub]="cccl libcudacxx thrust"
[thrust]="cccl libcudacxx cub"
[cudax]="cccl libcudacxx"
[pycuda]="cccl libcudacxx cub thrust cudax"
)

declare -A project_names=(
Expand All @@ -43,21 +45,34 @@ declare -A project_names=(
[cub]="CUB"
[thrust]="Thrust"
[cudax]="CUDA Experimental"
[pycuda]="pycuda"
)

# By default, the project directory is assumed to be the same as the subproject name,
# but can be overridden here. The `cccl` project is special, and checks for files outside
# of any subproject directory.
declare -A project_dirs=(
[pycuda]="python/cuda"
)

# Usage checks:
for subproject in "${subprojects[@]}"; do
# Check that the subproject directory exists
if [ "$subproject" != "cccl" ] && [ ! -d "$subproject" ]; then
echo "Error: Subproject directory '$subproject' does not exist."
exit 1
if [ "$subproject" != "cccl" ]; then
subproject_dir=${project_dirs[$subproject]:-$subproject}
if [ ! -d "$subproject_dir" ]; then
echo "Error: Subproject '$subproject' directory '$subproject_dir' does not exist."
exit 1
fi
fi

# If the subproject has dependencies, check that they exist (except for "cccl")
# If the subproject has dependencies, check that they are valid
for dependency in ${dependencies[$subproject]}; do
if [ "$dependency" != "cccl" ] && [ ! -d "$dependency" ]; then
echo "Error: Dependency directory '$dependency' for subproject '$subproject' does not exist."
exit 1
if [ "$dependency" != "cccl" ]; then
if [[ ! " ${subprojects[@]} " =~ " ${dependency} " ]]; then
echo "Error: Dependency '$dependency' for subproject '$subproject' does not exist."
exit 1
fi
fi
done
done
Expand All @@ -83,8 +98,17 @@ dirty_files() {
# Return 1 if any files outside of the subproject directories have changed
inspect_cccl() {
exclusions_grep_expr=$(
exclusions=("${subprojects[@]}")
declare -a exclusions
for subproject in "${subprojects[@]}"; do
if [[ ${subproject} == "cccl" ]]; then
continue
fi
exclusions+=("${project_dirs[$subproject]:-$subproject}")
done
# Manual exclusions:
exclusions+=("docs")
IFS="|"
echo "^(${exclusions[*]})/"
)
Expand Down Expand Up @@ -160,7 +184,6 @@ main() {
echo "::endgroup::"
echo


echo "<details><summary><h3>👃 Inspect Changes</h3></summary>" | tee_to_step_summary
echo | tee_to_step_summary

Expand All @@ -181,7 +204,7 @@ main() {
continue
fi

inspect_subdir $subproject
inspect_subdir ${project_dirs[$subproject]:-$subproject}
local dirty=$?
declare ${subproject^^}_DIRTY=${dirty}
checkmark="$(get_checkmark ${dirty})"
Expand Down
11 changes: 9 additions & 2 deletions ci/matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,12 @@ workflows:
- {jobs: ['test'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc12']}
- {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'all', cxx: ['clang14']}
- {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang16']}
# Python jobs:
- {jobs: ['test'], project: 'pycuda'}
# cccl-infra:
- {jobs: ['infra'], project: 'cccl', ctk: '11.1', cxx: ['gcc6', 'clang9']}
- {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc', 'clang']}

nightly:
# libcudacxx build fails, CUB tests fail:
- {jobs: ['build'], ctk: '11.1', gpu: 'v100', sm: 'gpu', cxx: 'gcc6', std: [11], project: ['cub']}
Expand Down Expand Up @@ -176,8 +179,9 @@ host_compilers:
# windows: `ci/<spec[prefix]>_<project>.sh <spec[args]>`
jobs:
# General:
build: { gpu: false }
test: { gpu: true, needs: 'build' }
build: { gpu: false }
test: { gpu: true, needs: 'build' }
test_nobuild: { gpu: true, name: 'Test', invoke: { prefix: 'test' } }

# CCCL:
infra: { gpu: true } # example project launches a kernel
Expand Down Expand Up @@ -228,6 +232,9 @@ projects:
job_map: { test: ['test_cpu', 'test_gpu'] }
cudax:
stds: [17, 20]
pycuda:
name: "cuda (python)"
job_map: { build: [], test: ['test_nobuild'] }

# testing -> Runner with GPU is in a nv-gh-runners testing pool
gpus:
Expand Down
1 change: 1 addition & 0 deletions ci/pretty_printing.sh
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ function run_command() {
local status

begin_group "$group_name"
echo "Working directory: $(pwd)"
echo "Running command: ${command[*]}"
set +e
local start_time=$(date +%s)
Expand Down
21 changes: 21 additions & 0 deletions ci/test_pycuda.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash
# CI entry point that installs and tests the Python cooperative-algorithms
# package (python/cuda). Relies on helpers provided by build_common.sh:
# print_environment_details, fail_if_no_gpu, run_command, print_time_summary,
# and the BUILD_DIR variable.

set -euo pipefail

# Pull in shared CI helpers relative to this script's own location.
source "$(dirname "$0")/build_common.sh"

print_environment_details

# The pytest suite launches kernels, so bail out early on GPU-less runners.
fail_if_no_gpu

# Install target for pip; prepend it to PYTHONPATH so the freshly installed
# package wins over any system copy. ${PYTHONPATH:-} keeps 'set -u' happy
# when PYTHONPATH is unset.
readonly prefix="${BUILD_DIR}/python/"
export PYTHONPATH="${prefix}:${PYTHONPATH:-}"

# NOTE(review): assumes the script runs from a directory that is a sibling of
# the repo's python/ tree (e.g. a build dir) — confirm against CI invocation.
pushd ../python/cuda >/dev/null

run_command "⚙️ Pip install cuda" pip install --force-reinstall --target "${prefix}" .[test]
run_command "🚀 Pytest cuda" python -m pytest -v ./tests

popd >/dev/null

print_time_summary
53 changes: 39 additions & 14 deletions cub/cub/block/block_radix_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -145,26 +145,51 @@ CUB_NAMESPACE_BEGIN
//! are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
//! where each thread owns 4 consecutive items.
//!
//! .. code-block:: c++
//! .. tab-set-code::
//!
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
//! .. code-block:: c++
//!
//! __global__ void ExampleKernel(...)
//! {
//! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each
//! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
//!
//! // Allocate shared memory for BlockRadixSort
//! __shared__ typename BlockRadixSort::TempStorage temp_storage;
//! __global__ void kernel(...)
//! {
//! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each
//! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
//!
//! // Obtain a segment of consecutive items that are blocked across threads
//! int thread_keys[4];
//! ...
//! // Allocate shared memory for BlockRadixSort
//! __shared__ typename BlockRadixSort::TempStorage temp_storage;
//!
//! // Collectively sort the keys
//! BlockRadixSort(temp_storage).Sort(thread_keys);
//! // Obtain a segment of consecutive items that are blocked across threads
//! int thread_keys[4];
//! ...
//!
//! ...
//! // Collectively sort the keys
//! BlockRadixSort(temp_storage).Sort(thread_keys);
//!
//! ...
//!
//! .. code-block:: python
//!
//! import cuda.cooperative.experimental as cudax
//! from pynvjitlink import patch
//! patch.patch_numba_linker(lto=True)
//!
//! # Specialize radix sort for a 1D block of 128 threads owning 4 integer items each
//! block_radix_sort = cudax.block.radix_sort_keys(numba.int32, 128, 4)
//! temp_storage_bytes = block_radix_sort.temp_storage_bytes
//!
//! @cuda.jit(link=block_radix_sort.files)
//! def kernel():
//! # Allocate shared memory for radix sort
//! temp_storage = cuda.shared.array(shape=temp_storage_bytes, dtype='uint8')
//!
//! # Obtain a segment of consecutive items that are blocked across threads
//! thread_keys = cuda.local.array(shape=items_per_thread, dtype=numba.int32)
//! # ...
//!
//! # Collectively sort the keys
//! block_radix_sort(temp_storage, thread_keys)
//! # ...
//!
//! Suppose the set of input ``thread_keys`` across the block of threads is
//! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
Expand Down
19 changes: 19 additions & 0 deletions docs/pycuda/index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
.. _pycudax-module:

CUDA
==================================================

.. warning::
Python exposure of cooperative algorithms is in public beta.
The API is subject to change without notice.

.. automodule:: cuda.cooperative.experimental.warp
:members:
:undoc-members:
:imported-members:


.. automodule:: cuda.cooperative.experimental.block
:members:
:undoc-members:
:imported-members:
26 changes: 25 additions & 1 deletion docs/repo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,14 @@ sphinx_exclude_patterns = [
"VERSION.md",
]

project_build_order = [ "libcudacxx", "cub", "thrust", "cccl" ]
project_build_order = [ "libcudacxx", "cub", "thrust", "cccl", "pycuda" ]

# deps can be used to link to other projects' documentation
deps = [
[ "libcudacxx", "_build/docs/libcudacxx/latest" ],
[ "cub", "_build/docs/cub/latest" ],
[ "thrust", "_build/docs/thrust/latest" ],
[ "pycuda", "_build/docs/pycuda/latest" ],
]

[repo_docs.projects.libcudacxx]
Expand Down Expand Up @@ -278,3 +279,26 @@ doxygen_conf_extra = """
FULL_PATH_NAMES = YES
STRIP_FROM_PATH = ../../thrust
"""

[repo_docs.projects.pycuda]
name = "pycuda"
docs_root = "pycuda"
logo = "../img/logo.png"

repo_url = "https://github.com/NVIDIA/cccl/python/cuda"
social_media_set = ""
social_media = [
[ "github", "https://github.com/NVIDIA/cccl" ],
]

autodoc.mock_imports = [
"numba",
"pynvjitlink",
"cuda.nvrtc",
"llvmlite"
]

enhanced_search_enabled = true
python_paths = [
"${root}/../python/cuda"
]
3 changes: 3 additions & 0 deletions python/cuda/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
cuda/_include
env
*egg-info
1 change: 1 addition & 0 deletions python/cuda/MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
recursive-include cuda/_include *
4 changes: 4 additions & 0 deletions python/cuda/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
```bash
pip3 install -e .[test]
pytest -v ./tests/device/
```
5 changes: 5 additions & 0 deletions python/cuda/cuda/cooperative/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

import cuda.cooperative.experimental
6 changes: 6 additions & 0 deletions python/cuda/cuda/cooperative/experimental/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

from cuda.cooperative.experimental import block, warp
from cuda.cooperative.experimental._types import StatefulFunction
46 changes: 46 additions & 0 deletions python/cuda/cuda/cooperative/experimental/_caching.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

import functools
import hashlib
import json
import os
import pickle

_ENABLE_CACHE = 'CCCL_ENABLE_CACHE' in os.environ
if _ENABLE_CACHE:
_CACHE_LOCATION = os.path.join(os.path.expanduser("~"), ".cache", "cccl")
if not os.path.exists(_CACHE_LOCATION):
os.makedirs(_CACHE_LOCATION)

# We use
# json.dumps to serialize args/kwargs to a string (sort_keys makes the
#   serialization independent of keyword-argument order), and
# hashlib to compute the hash.
def json_hash(*args, **kwargs):
    """Return a stable SHA-1 hex digest identifying ``(args, kwargs)``.

    Arguments must be JSON-serializable. ``sort_keys=True`` guarantees that
    calls differing only in keyword order (``f(a=1, b=2)`` vs ``f(b=2, a=1)``)
    produce the same digest, so they hit the same cache entry.
    """
    hasher = hashlib.sha1()
    hasher.update(json.dumps([args, kwargs], sort_keys=True).encode('utf-8'))
    return hasher.hexdigest()

def disk_cache(func):
    """Decorator that memoizes ``func``'s result on disk.

    When caching is enabled (``CCCL_ENABLE_CACHE`` in the environment), the
    arguments are hashed via :func:`json_hash` — so they must be
    JSON-serializable — and the pickled result is stored in
    ``_CACHE_LOCATION`` under that hash. When disabled, ``func`` is called
    directly with no overhead.

    NOTE(review): results are loaded with ``pickle``; the cache directory
    must not be writable by untrusted users, since unpickling attacker-
    controlled files executes arbitrary code.
    """

    # functools.wraps preserves func's __name__/__doc__ on the wrapper.
    @functools.wraps(func)
    def cacher(*args, **kwargs):
        if not _ENABLE_CACHE:
            return func(*args, **kwargs)
        # One cache entry per distinct (args, kwargs) hash.
        entry = os.path.join(_CACHE_LOCATION, json_hash(*args, **kwargs))
        if os.path.isfile(entry):
            # Cache hit: deserialize and return the stored result.
            with open(entry, 'rb') as f:
                return pickle.load(f)
        # Cache miss: compute, persist, return.
        out = func(*args, **kwargs)
        with open(entry, 'wb') as f:
            pickle.dump(out, f)
        return out

    return cacher
Loading

0 comments on commit 6b95f43

Please sign in to comment.