Skip to content

Commit

Permalink
Experimental Python cooperative algorithms (#1973)
Browse files Browse the repository at this point in the history
* Python exposure of cooperative algorithms

* Update inspect changes to be aware of pycudax.

This is a bit more complex since this is the first
subproject that doesn't live in a directory with the same name as the project.

* Print working directory when running CI commands.

* Update CI for pycudax.

* Update module name in CI

This reverts commit 058bb5f2725a70fa8b4cd8ff39b84e0c4c53c2b0.

* [pre-commit.ci] auto code formatting

* Remove accidental directory

* Fix Thrust pair docs

* Fix pkg resource usage

---------

Co-authored-by: Allison Piper <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
3 people committed Jul 11, 2024
1 parent deedb00 commit 6b95f43
Show file tree
Hide file tree
Showing 43 changed files with 3,156 additions and 27 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,7 @@ CMakeUserPresets.json
/ci/rapids/.conda
/ci/rapids/.log
/ci/rapids/.repos
*.egg-info/
*.pyc
__pycache__
*.pyd
43 changes: 33 additions & 10 deletions ci/inspect_changes.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ subprojects=(
cub
thrust
cudax
pycuda
)

# ...and their dependencies:
Expand All @@ -35,6 +36,7 @@ declare -A dependencies=(
[cub]="cccl libcudacxx thrust"
[thrust]="cccl libcudacxx cub"
[cudax]="cccl libcudacxx"
[pycuda]="cccl libcudacxx cub thrust cudax"
)

declare -A project_names=(
Expand All @@ -43,21 +45,34 @@ declare -A project_names=(
[cub]="CUB"
[thrust]="Thrust"
[cudax]="CUDA Experimental"
[pycuda]="pycuda"
)

# By default, the project directory is assumed to be the same as the subproject name,
# but can be overridden here. The `cccl` project is special, and checks for files outside
# of any subproject directory.
declare -A project_dirs=(
[pycuda]="python/cuda"
)

# Usage checks:
for subproject in "${subprojects[@]}"; do
# Check that the subproject directory exists
if [ "$subproject" != "cccl" ] && [ ! -d "$subproject" ]; then
echo "Error: Subproject directory '$subproject' does not exist."
exit 1
if [ "$subproject" != "cccl" ]; then
subproject_dir=${project_dirs[$subproject]:-$subproject}
if [ ! -d "$subproject_dir" ]; then
echo "Error: Subproject '$subproject' directory '$subproject_dir' does not exist."
exit 1
fi
fi

# If the subproject has dependencies, check that they exist (except for "cccl")
# If the subproject has dependencies, check that they are valid
for dependency in ${dependencies[$subproject]}; do
if [ "$dependency" != "cccl" ] && [ ! -d "$dependency" ]; then
echo "Error: Dependency directory '$dependency' for subproject '$subproject' does not exist."
exit 1
if [ "$dependency" != "cccl" ]; then
if [[ ! " ${subprojects[@]} " =~ " ${dependency} " ]]; then
echo "Error: Dependency '$dependency' for subproject '$subproject' does not exist."
exit 1
fi
fi
done
done
Expand All @@ -83,8 +98,17 @@ dirty_files() {
# Return 1 if any files outside of the subproject directories have changed
inspect_cccl() {
exclusions_grep_expr=$(
exclusions=("${subprojects[@]}")
declare -a exclusions
for subproject in "${subprojects[@]}"; do
if [[ ${subproject} == "cccl" ]]; then
continue
fi
exclusions+=("${project_dirs[$subproject]:-$subproject}")
done
# Manual exclusions:
exclusions+=("docs")
IFS="|"
echo "^(${exclusions[*]})/"
)
Expand Down Expand Up @@ -160,7 +184,6 @@ main() {
echo "::endgroup::"
echo


echo "<details><summary><h3>👃 Inspect Changes</h3></summary>" | tee_to_step_summary
echo | tee_to_step_summary

Expand All @@ -181,7 +204,7 @@ main() {
continue
fi

inspect_subdir $subproject
inspect_subdir ${project_dirs[$subproject]:-$subproject}
local dirty=$?
declare ${subproject^^}_DIRTY=${dirty}
checkmark="$(get_checkmark ${dirty})"
Expand Down
11 changes: 9 additions & 2 deletions ci/matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,12 @@ workflows:
- {jobs: ['test'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc12']}
- {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'all', cxx: ['clang14']}
- {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang16']}
# Python jobs:
- {jobs: ['test'], project: 'pycuda'}
# cccl-infra:
- {jobs: ['infra'], project: 'cccl', ctk: '11.1', cxx: ['gcc6', 'clang9']}
- {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc', 'clang']}

nightly:
# libcudacxx build fails, CUB tests fail:
- {jobs: ['build'], ctk: '11.1', gpu: 'v100', sm: 'gpu', cxx: 'gcc6', std: [11], project: ['cub']}
Expand Down Expand Up @@ -176,8 +179,9 @@ host_compilers:
# windows: `ci/<spec[prefix]>_<project>.sh <spec[args]>`
jobs:
# General:
build: { gpu: false }
test: { gpu: true, needs: 'build' }
build: { gpu: false }
test: { gpu: true, needs: 'build' }
test_nobuild: { gpu: true, name: 'Test', invoke: { prefix: 'test' } }

# CCCL:
infra: { gpu: true } # example project launches a kernel
Expand Down Expand Up @@ -228,6 +232,9 @@ projects:
job_map: { test: ['test_cpu', 'test_gpu'] }
cudax:
stds: [17, 20]
pycuda:
name: "cuda (python)"
job_map: { build: [], test: ['test_nobuild'] }

# testing -> Runner with GPU is in a nv-gh-runners testing pool
gpus:
Expand Down
1 change: 1 addition & 0 deletions ci/pretty_printing.sh
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ function run_command() {
local status

begin_group "$group_name"
echo "Working directory: $(pwd)"
echo "Running command: ${command[*]}"
set +e
local start_time=$(date +%s)
Expand Down
21 changes: 21 additions & 0 deletions ci/test_pycuda.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash
# CI entry point that installs and tests the Python cooperative-algorithms
# package (python/cuda). Relies on helpers provided by build_common.sh:
# print_environment_details, fail_if_no_gpu, run_command, print_time_summary,
# and the BUILD_DIR variable.

set -euo pipefail

# Pull in shared CI helpers relative to this script's own location.
source "$(dirname "$0")/build_common.sh"

print_environment_details

# The pytest suite launches kernels, so bail out early on GPU-less runners.
fail_if_no_gpu

# Install target for pip; prepend it to PYTHONPATH so the freshly installed
# package wins over any system copy. ${PYTHONPATH:-} keeps 'set -u' happy
# when PYTHONPATH is unset.
readonly prefix="${BUILD_DIR}/python/"
export PYTHONPATH="${prefix}:${PYTHONPATH:-}"

# NOTE(review): assumes the script runs from a directory that is a sibling of
# the repo's python/ tree (e.g. a build dir) — confirm against CI invocation.
pushd ../python/cuda >/dev/null

run_command "⚙️ Pip install cuda" pip install --force-reinstall --target "${prefix}" .[test]
run_command "🚀 Pytest cuda" python -m pytest -v ./tests

popd >/dev/null

print_time_summary
53 changes: 39 additions & 14 deletions cub/cub/block/block_radix_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -145,26 +145,51 @@ CUB_NAMESPACE_BEGIN
//! are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
//! where each thread owns 4 consecutive items.
//!
//! .. code-block:: c++
//! .. tab-set-code::
//!
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
//! .. code-block:: c++
//!
//! __global__ void ExampleKernel(...)
//! {
//! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each
//! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
//!
//! // Allocate shared memory for BlockRadixSort
//! __shared__ typename BlockRadixSort::TempStorage temp_storage;
//! __global__ void kernel(...)
//! {
//! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each
//! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
//!
//! // Obtain a segment of consecutive items that are blocked across threads
//! int thread_keys[4];
//! ...
//! // Allocate shared memory for BlockRadixSort
//! __shared__ typename BlockRadixSort::TempStorage temp_storage;
//!
//! // Collectively sort the keys
//! BlockRadixSort(temp_storage).Sort(thread_keys);
//! // Obtain a segment of consecutive items that are blocked across threads
//! int thread_keys[4];
//! ...
//!
//! ...
//! // Collectively sort the keys
//! BlockRadixSort(temp_storage).Sort(thread_keys);
//!
//! ...
//!
//! .. code-block:: python
//!
//! import cuda.cooperative.experimental as cudax
//! from pynvjitlink import patch
//! patch.patch_numba_linker(lto=True)
//!
//! # Specialize radix sort for a 1D block of 128 threads owning 4 integer items each
//! block_radix_sort = cudax.block.radix_sort_keys(numba.int32, 128, 4)
//! temp_storage_bytes = block_radix_sort.temp_storage_bytes
//!
//! @cuda.jit(link=block_radix_sort.files)
//! def kernel():
//! # Allocate shared memory for radix sort
//! temp_storage = cuda.shared.array(shape=temp_storage_bytes, dtype='uint8')
//!
//! # Obtain a segment of consecutive items that are blocked across threads
//! thread_keys = cuda.local.array(shape=items_per_thread, dtype=numba.int32)
//! # ...
//!
//! # Collectively sort the keys
//! block_radix_sort(temp_storage, thread_keys)
//! # ...
//!
//! Suppose the set of input ``thread_keys`` across the block of threads is
//! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
Expand Down
19 changes: 19 additions & 0 deletions docs/pycuda/index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
.. _pycudax-module:

CUDA
==================================================

.. warning::
Python exposure of cooperative algorithms is in public beta.
The API is subject to change without notice.

.. automodule:: cuda.cooperative.experimental.warp
:members:
:undoc-members:
:imported-members:


.. automodule:: cuda.cooperative.experimental.block
:members:
:undoc-members:
:imported-members:
26 changes: 25 additions & 1 deletion docs/repo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,14 @@ sphinx_exclude_patterns = [
"VERSION.md",
]

project_build_order = [ "libcudacxx", "cub", "thrust", "cccl" ]
project_build_order = [ "libcudacxx", "cub", "thrust", "cccl", "pycuda" ]

# deps can be used to link to other projects' documentation
deps = [
[ "libcudacxx", "_build/docs/libcudacxx/latest" ],
[ "cub", "_build/docs/cub/latest" ],
[ "thrust", "_build/docs/thrust/latest" ],
[ "pycuda", "_build/docs/pycuda/latest" ],
]

[repo_docs.projects.libcudacxx]
Expand Down Expand Up @@ -278,3 +279,26 @@ doxygen_conf_extra = """
FULL_PATH_NAMES = YES
STRIP_FROM_PATH = ../../thrust
"""

[repo_docs.projects.pycuda]
name = "pycuda"
docs_root = "pycuda"
logo = "../img/logo.png"

repo_url = "https://github.com/NVIDIA/cccl/python/cuda"
social_media_set = ""
social_media = [
[ "github", "https://github.com/NVIDIA/cccl" ],
]

autodoc.mock_imports = [
"numba",
"pynvjitlink",
"cuda.nvrtc",
"llvmlite"
]

enhanced_search_enabled = true
python_paths = [
"${root}/../python/cuda"
]
3 changes: 3 additions & 0 deletions python/cuda/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
cuda/_include
env
*egg-info
1 change: 1 addition & 0 deletions python/cuda/MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
recursive-include cuda/_include *
4 changes: 4 additions & 0 deletions python/cuda/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
```bash
pip3 install -e .[test]
pytest -v ./tests/device/
```
5 changes: 5 additions & 0 deletions python/cuda/cuda/cooperative/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

import cuda.cooperative.experimental
6 changes: 6 additions & 0 deletions python/cuda/cuda/cooperative/experimental/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

from cuda.cooperative.experimental import block, warp
from cuda.cooperative.experimental._types import StatefulFunction
46 changes: 46 additions & 0 deletions python/cuda/cuda/cooperative/experimental/_caching.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

import functools
import hashlib
import json
import os
import pickle

_ENABLE_CACHE = 'CCCL_ENABLE_CACHE' in os.environ
if _ENABLE_CACHE:
_CACHE_LOCATION = os.path.join(os.path.expanduser("~"), ".cache", "cccl")
if not os.path.exists(_CACHE_LOCATION):
os.makedirs(_CACHE_LOCATION)

# We use
# json.dumps to serialize args/kwargs to a string (sort_keys makes the
#   serialization independent of keyword-argument order), and
# hashlib to compute the hash.
def json_hash(*args, **kwargs):
    """Return a stable SHA-1 hex digest identifying ``(args, kwargs)``.

    Arguments must be JSON-serializable. ``sort_keys=True`` guarantees that
    calls differing only in keyword order (``f(a=1, b=2)`` vs ``f(b=2, a=1)``)
    produce the same digest, so they hit the same cache entry.
    """
    hasher = hashlib.sha1()
    hasher.update(json.dumps([args, kwargs], sort_keys=True).encode('utf-8'))
    return hasher.hexdigest()

def disk_cache(func):
    """Decorator that memoizes ``func``'s result on disk.

    When caching is enabled (``CCCL_ENABLE_CACHE`` in the environment), the
    arguments are hashed via :func:`json_hash` — so they must be
    JSON-serializable — and the pickled result is stored in
    ``_CACHE_LOCATION`` under that hash. When disabled, ``func`` is called
    directly with no overhead.

    NOTE(review): results are loaded with ``pickle``; the cache directory
    must not be writable by untrusted users, since unpickling attacker-
    controlled files executes arbitrary code.
    """

    # functools.wraps preserves func's __name__/__doc__ on the wrapper.
    @functools.wraps(func)
    def cacher(*args, **kwargs):
        if not _ENABLE_CACHE:
            return func(*args, **kwargs)
        # One cache entry per distinct (args, kwargs) hash.
        entry = os.path.join(_CACHE_LOCATION, json_hash(*args, **kwargs))
        if os.path.isfile(entry):
            # Cache hit: deserialize and return the stored result.
            with open(entry, 'rb') as f:
                return pickle.load(f)
        # Cache miss: compute, persist, return.
        out = func(*args, **kwargs)
        with open(entry, 'wb') as f:
            pickle.dump(out, f)
        return out

    return cacher
Loading

0 comments on commit 6b95f43

Please sign in to comment.