diff --git a/.clang-format b/.clang-format index 3cd3f6da33..6cba3dca4b 100644 --- a/.clang-format +++ b/.clang-format @@ -25,6 +25,7 @@ AlwaysBreakTemplateDeclarations: Yes AttributeMacros: [ '_CCCL_ALIGNAS_TYPE', '_CCCL_ALIGNAS', + '_CCCL_ALWAYS_INLINE', '_CCCL_CONSTEXPR_CXX14', '_CCCL_CONSTEXPR_CXX17', '_CCCL_CONSTEXPR_CXX20', @@ -32,46 +33,27 @@ AttributeMacros: [ '_CCCL_DEVICE', '_CCCL_FALLTHROUGH', '_CCCL_FORCEINLINE', + '_CCCL_HIDE_FROM_ABI', '_CCCL_HOST_DEVICE', '_CCCL_HOST', '_CCCL_NO_UNIQUE_ADDRESS', '_CCCL_NODISCARD_FRIEND', '_CCCL_NODISCARD', '_CCCL_NORETURN', + '_CCCL_TYPE_VISIBILITY_DEFAULT', '_CCCL_VISIBILITY_HIDDEN', 'CUB_RUNTIME_FUNCTION', 'CUB_DETAIL_KERNEL_ATTRIBUTES', 'THRUST_RUNTIME_FUNCTION', 'THRUST_DETAIL_KERNEL_ATTRIBUTES', '_LIBCUDACXX_ALIGNOF', - '_LIBCUDACXX_ALWAYS_INLINE', '_LIBCUDACXX_AVAILABILITY_THROW_BAD_VARIANT_ACCESS', - '_LIBCUDACXX_CONSTINIT', '_LIBCUDACXX_DEPRECATED_IN_CXX11', '_LIBCUDACXX_DEPRECATED_IN_CXX14', '_LIBCUDACXX_DEPRECATED_IN_CXX17', '_LIBCUDACXX_DEPRECATED_IN_CXX20', '_LIBCUDACXX_DEPRECATED', - '_LIBCUDACXX_DISABLE_EXTENTSION_WARNING', - '_LIBCUDACXX_EXCLUDE_FROM_EXPLICIT_INSTANTIATION', - '_LIBCUDACXX_EXPORTED_FROM_ABI', - '_LIBCUDACXX_EXTERN_TEMPLATE_TYPE_VIS', - '_LIBCUDACXX_HIDDEN', - '_LIBCUDACXX_HIDE_FROM_ABI_AFTER_V1', '_LIBCUDACXX_HIDE_FROM_ABI', - '_LIBCUDACXX_INLINE_VISIBILITY', - '_LIBCUDACXX_INTERNAL_LINKAGE', - '_LIBCUDACXX_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS', - '_LIBCUDACXX_NO_DESTROY', - '_LIBCUDACXX_NO_SANITIZE', - '_LIBCUDACXX_NOALIAS', - '_LIBCUDACXX_OVERRIDABLE_FUNC_VIS', - '_LIBCUDACXX_STANDALONE_DEBUG', - '_LIBCUDACXX_TEMPLATE_DATA_VIS', - '_LIBCUDACXX_TEMPLATE_VIS', - '_LIBCUDACXX_THREAD_SAFETY_ANNOTATION', - '_LIBCUDACXX_USING_IF_EXISTS', - '_LIBCUDACXX_WEAK', ] BinPackArguments: false BinPackParameters: false @@ -108,6 +90,9 @@ IfMacros: [ IndentWrappedFunctionNames: false IncludeBlocks: Regroup IncludeCategories: + - Regex: '^' + Priority: 0x7FFFFFFF + SortPriority: 0x7FFFFFFF - Regex: '^<(cuda/std/detail/__config|cub/config.cuh|thrust/detail/config.h|thrust/system/cuda/config.h)' Priority: 0 SortPriority: 0 diff --git a/.devcontainer/cuda12.5-gcc10/devcontainer.json b/.devcontainer/cuda12.5-gcc10/devcontainer.json deleted file mode 100644 index 61459a25fc..0000000000 --- a/.devcontainer/cuda12.5-gcc10/devcontainer.json +++ /dev/null @@ -1,53 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-gcc10-cuda12.5", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-gcc10", - "CCCL_CUDA_VERSION": "12.5", - "CCCL_HOST_COMPILER": "gcc", - "CCCL_HOST_COMPILER_VERSION": "10", - "CCCL_BUILD_INFIX": "cuda12.5-gcc10" - }, - "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - 
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda12.5-gcc10" -} diff --git a/.devcontainer/cuda12.5-gcc12/devcontainer.json b/.devcontainer/cuda12.6-gcc10/devcontainer.json similarity index 91% rename from .devcontainer/cuda12.5-gcc12/devcontainer.json rename to .devcontainer/cuda12.6-gcc10/devcontainer.json index 1d16b6aa61..1d49b0ebc5 100644 --- a/.devcontainer/cuda12.5-gcc12/devcontainer.json +++ b/.devcontainer/cuda12.6-gcc10/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-gcc12-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc10-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-gcc12", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-gcc10", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "gcc", - "CCCL_HOST_COMPILER_VERSION": "12", - "CCCL_BUILD_INFIX": "cuda12.5-gcc12" + "CCCL_HOST_COMPILER_VERSION": "10", + "CCCL_BUILD_INFIX": "cuda12.6-gcc10" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-gcc12" + "name": "cuda12.6-gcc10" } diff --git a/.devcontainer/cuda12.5-gcc11/devcontainer.json b/.devcontainer/cuda12.6-gcc11/devcontainer.json similarity index 90% rename from .devcontainer/cuda12.5-gcc11/devcontainer.json rename to .devcontainer/cuda12.6-gcc11/devcontainer.json index 184de8734c..ba1e0ea891 100644 --- a/.devcontainer/cuda12.5-gcc11/devcontainer.json +++ b/.devcontainer/cuda12.6-gcc11/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-gcc11-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc11-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-gcc11", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-gcc11", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "11", - "CCCL_BUILD_INFIX": "cuda12.5-gcc11" + "CCCL_BUILD_INFIX": "cuda12.6-gcc11" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 
+49,5 @@ } } }, - "name": "cuda12.5-gcc11" + "name": "cuda12.6-gcc11" } diff --git a/.devcontainer/cuda12.6-gcc12/devcontainer.json b/.devcontainer/cuda12.6-gcc12/devcontainer.json new file mode 100644 index 0000000000..d25796f6cc --- /dev/null +++ b/.devcontainer/cuda12.6-gcc12/devcontainer.json @@ -0,0 +1,53 @@ +{ + "shutdownAction": "stopContainer", + "image": "rapidsai/devcontainers:24.10-cpp-gcc12-cuda12.6", + "hostRequirements": { + "gpu": "optional" + }, + "initializeCommand": [ + "/bin/bash", + "-c", + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" + ], + "containerEnv": { + "SCCACHE_REGION": "us-east-2", + "SCCACHE_BUCKET": "rapids-sccache-devs", + "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", + "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", + "DEVCONTAINER_NAME": "cuda12.6-gcc12", + "CCCL_CUDA_VERSION": "12.6", + "CCCL_HOST_COMPILER": "gcc", + "CCCL_HOST_COMPILER_VERSION": "12", + "CCCL_BUILD_INFIX": "cuda12.6-gcc12" + }, + "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" + ], + "customizations": { + "vscode": { + "extensions": [ + "llvm-vs-code-extensions.vscode-clangd", + "xaver.clang-format", + "nvidia.nsight-vscode-edition", + "ms-vscode.cmake-tools" + ], + "settings": { + "editor.defaultFormatter": "xaver.clang-format", + "editor.formatOnSave": true, + "clang-format.executable": "/usr/bin/clang-format", + "clangd.arguments": [ + "--compile-commands-dir=${workspaceFolder}" + ], + "files.eol": "\n", + "files.trimTrailingWhitespace": true + } + } + }, + "name": "cuda12.6-gcc12" +} diff --git a/.devcontainer/cuda12.6-gcc13/devcontainer.json b/.devcontainer/cuda12.6-gcc13/devcontainer.json new file mode 100644 index 0000000000..666f0e6621 --- /dev/null +++ b/.devcontainer/cuda12.6-gcc13/devcontainer.json @@ -0,0 +1,53 @@ +{ + "shutdownAction": "stopContainer", + "image": "rapidsai/devcontainers:24.10-cpp-gcc13-cuda12.6", + "hostRequirements": { + "gpu": "optional" + }, + "initializeCommand": [ + "/bin/bash", + "-c", + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" + ], + "containerEnv": { + "SCCACHE_REGION": "us-east-2", + "SCCACHE_BUCKET": "rapids-sccache-devs", + "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", + "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", + "DEVCONTAINER_NAME": "cuda12.6-gcc13", + "CCCL_CUDA_VERSION": "12.6", + "CCCL_HOST_COMPILER": "gcc", + "CCCL_HOST_COMPILER_VERSION": "13", + "CCCL_BUILD_INFIX": "cuda12.6-gcc13" + }, + "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", + 
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" + ], + "customizations": { + "vscode": { + "extensions": [ + "llvm-vs-code-extensions.vscode-clangd", + "xaver.clang-format", + "nvidia.nsight-vscode-edition", + "ms-vscode.cmake-tools" + ], + "settings": { + "editor.defaultFormatter": "xaver.clang-format", + "editor.formatOnSave": true, + "clang-format.executable": "/usr/bin/clang-format", + "clangd.arguments": [ + "--compile-commands-dir=${workspaceFolder}" + ], + "files.eol": "\n", + "files.trimTrailingWhitespace": true + } + } + }, + "name": "cuda12.6-gcc13" +} diff --git a/.devcontainer/cuda12.5-gcc9/devcontainer.json b/.devcontainer/cuda12.6-gcc7/devcontainer.json similarity index 91% rename from .devcontainer/cuda12.5-gcc9/devcontainer.json rename to .devcontainer/cuda12.6-gcc7/devcontainer.json index 333c11b3cc..0ca9492cd3 100644 --- a/.devcontainer/cuda12.5-gcc9/devcontainer.json +++ b/.devcontainer/cuda12.6-gcc7/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-gcc9-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc7-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-gcc9", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-gcc7", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "gcc", - "CCCL_HOST_COMPILER_VERSION": "9", - "CCCL_BUILD_INFIX": "cuda12.5-gcc9" + "CCCL_HOST_COMPILER_VERSION": "7", + "CCCL_BUILD_INFIX": "cuda12.6-gcc7" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-gcc9" + "name": "cuda12.6-gcc7" } diff --git a/.devcontainer/cuda12.5-gcc8/devcontainer.json b/.devcontainer/cuda12.6-gcc8/devcontainer.json similarity index 90% rename from .devcontainer/cuda12.5-gcc8/devcontainer.json rename to .devcontainer/cuda12.6-gcc8/devcontainer.json index 10b44d31f1..8e3aacd04d 100644 --- a/.devcontainer/cuda12.5-gcc8/devcontainer.json +++ b/.devcontainer/cuda12.6-gcc8/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-gcc8-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc8-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-gcc8", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-gcc8", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "8", - "CCCL_BUILD_INFIX": "cuda12.5-gcc8" + "CCCL_BUILD_INFIX": "cuda12.6-gcc8" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": 
"source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-gcc8" + "name": "cuda12.6-gcc8" } diff --git a/.devcontainer/cuda12.5-gcc7/devcontainer.json b/.devcontainer/cuda12.6-gcc9/devcontainer.json similarity index 88% rename from .devcontainer/cuda12.5-gcc7/devcontainer.json rename to .devcontainer/cuda12.6-gcc9/devcontainer.json index 9d5d356ad5..4c30069c48 100644 --- a/.devcontainer/cuda12.5-gcc7/devcontainer.json +++ b/.devcontainer/cuda12.6-gcc9/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-gcc7-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc9-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-gcc7", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-gcc9", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "gcc", - "CCCL_HOST_COMPILER_VERSION": "7", - "CCCL_BUILD_INFIX": "cuda12.5-gcc7" + "CCCL_HOST_COMPILER_VERSION": "9", + "CCCL_BUILD_INFIX": "cuda12.6-gcc9" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-gcc7" + "name": "cuda12.6-gcc9" } diff --git a/.devcontainer/cuda12.5-llvm12/devcontainer.json b/.devcontainer/cuda12.6-llvm10/devcontainer.json similarity index 91% rename from .devcontainer/cuda12.5-llvm12/devcontainer.json rename to .devcontainer/cuda12.6-llvm10/devcontainer.json index e1cbc4ecb7..59b03b60d4 100644 --- a/.devcontainer/cuda12.5-llvm12/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm10/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-llvm12-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm10-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-llvm12", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-llvm10", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "12", - "CCCL_BUILD_INFIX": "cuda12.5-llvm12" + "CCCL_HOST_COMPILER_VERSION": "10", + "CCCL_BUILD_INFIX": "cuda12.6-llvm10" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-llvm12" + "name": "cuda12.6-llvm10" } diff --git a/.devcontainer/cuda12.5-llvm13/devcontainer.json b/.devcontainer/cuda12.6-llvm11/devcontainer.json similarity index 91% rename from .devcontainer/cuda12.5-llvm13/devcontainer.json rename to .devcontainer/cuda12.6-llvm11/devcontainer.json index 6fbbf56b79..8907106550 100644 --- a/.devcontainer/cuda12.5-llvm13/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm11/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-llvm13-cuda12.5", + "image": 
"rapidsai/devcontainers:24.10-cpp-llvm11-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-llvm13", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-llvm11", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "13", - "CCCL_BUILD_INFIX": "cuda12.5-llvm13" + "CCCL_HOST_COMPILER_VERSION": "11", + "CCCL_BUILD_INFIX": "cuda12.6-llvm11" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-llvm13" + "name": "cuda12.6-llvm11" } diff --git a/.devcontainer/cuda12.5-llvm14/devcontainer.json b/.devcontainer/cuda12.6-llvm12/devcontainer.json similarity index 91% rename from .devcontainer/cuda12.5-llvm14/devcontainer.json rename to .devcontainer/cuda12.6-llvm12/devcontainer.json index b8528e989f..522fd7fb80 100644 --- a/.devcontainer/cuda12.5-llvm14/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm12/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-llvm14-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm12-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-llvm14", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-llvm12", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "14", - "CCCL_BUILD_INFIX": "cuda12.5-llvm14" + "CCCL_HOST_COMPILER_VERSION": "12", + "CCCL_BUILD_INFIX": "cuda12.6-llvm12" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-llvm14" + "name": "cuda12.6-llvm12" } diff --git a/.devcontainer/cuda12.5-llvm15/devcontainer.json b/.devcontainer/cuda12.6-llvm13/devcontainer.json similarity index 91% rename from .devcontainer/cuda12.5-llvm15/devcontainer.json rename to .devcontainer/cuda12.6-llvm13/devcontainer.json index 768d3163ee..bc9f36b98b 100644 --- a/.devcontainer/cuda12.5-llvm15/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm13/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-llvm15-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm13-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-llvm15", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-llvm13", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "15", - "CCCL_BUILD_INFIX": "cuda12.5-llvm15" + "CCCL_HOST_COMPILER_VERSION": "13", + "CCCL_BUILD_INFIX": "cuda12.6-llvm13" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": 
"source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-llvm15" + "name": "cuda12.6-llvm13" } diff --git a/.devcontainer/cuda12.5-llvm16/devcontainer.json b/.devcontainer/cuda12.6-llvm14/devcontainer.json similarity index 91% rename from .devcontainer/cuda12.5-llvm16/devcontainer.json rename to .devcontainer/cuda12.6-llvm14/devcontainer.json index 8ba700fa4e..bf43444cb5 100644 --- a/.devcontainer/cuda12.5-llvm16/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm14/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-llvm16-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm14-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-llvm16", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-llvm14", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "16", - "CCCL_BUILD_INFIX": "cuda12.5-llvm16" + "CCCL_HOST_COMPILER_VERSION": "14", + "CCCL_BUILD_INFIX": "cuda12.6-llvm14" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-llvm16" + "name": "cuda12.6-llvm14" } diff --git a/.devcontainer/cuda12.5-llvm17/devcontainer.json b/.devcontainer/cuda12.6-llvm15/devcontainer.json similarity index 91% rename from .devcontainer/cuda12.5-llvm17/devcontainer.json rename to .devcontainer/cuda12.6-llvm15/devcontainer.json index 0de5689fdc..a6228c43a0 100644 --- a/.devcontainer/cuda12.5-llvm17/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm15/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-llvm17-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm15-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-llvm17", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-llvm15", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "17", - "CCCL_BUILD_INFIX": "cuda12.5-llvm17" + "CCCL_HOST_COMPILER_VERSION": "15", + "CCCL_BUILD_INFIX": "cuda12.6-llvm15" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-llvm17" + "name": "cuda12.6-llvm15" } diff --git a/.devcontainer/cuda12.5-llvm11/devcontainer.json b/.devcontainer/cuda12.6-llvm16/devcontainer.json similarity index 88% rename from .devcontainer/cuda12.5-llvm11/devcontainer.json rename to .devcontainer/cuda12.6-llvm16/devcontainer.json index a216720e5d..e0c8fd3212 100644 --- a/.devcontainer/cuda12.5-llvm11/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm16/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-llvm11-cuda12.5", + "image": 
"rapidsai/devcontainers:24.10-cpp-llvm16-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-llvm11", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-llvm16", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "11", - "CCCL_BUILD_INFIX": "cuda12.5-llvm11" + "CCCL_HOST_COMPILER_VERSION": "16", + "CCCL_BUILD_INFIX": "cuda12.6-llvm16" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-llvm11" + "name": "cuda12.6-llvm16" } diff --git a/.devcontainer/cuda12.6-llvm17/devcontainer.json b/.devcontainer/cuda12.6-llvm17/devcontainer.json new file mode 100644 index 0000000000..1920aa035d --- /dev/null +++ b/.devcontainer/cuda12.6-llvm17/devcontainer.json @@ -0,0 +1,53 @@ +{ + "shutdownAction": "stopContainer", + "image": "rapidsai/devcontainers:24.10-cpp-llvm17-cuda12.6", + "hostRequirements": { + "gpu": "optional" + }, + "initializeCommand": [ + "/bin/bash", + "-c", + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" + ], + "containerEnv": { + "SCCACHE_REGION": "us-east-2", + "SCCACHE_BUCKET": "rapids-sccache-devs", + "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", + "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", + "DEVCONTAINER_NAME": "cuda12.6-llvm17", + "CCCL_CUDA_VERSION": "12.6", + "CCCL_HOST_COMPILER": "llvm", + "CCCL_HOST_COMPILER_VERSION": "17", + "CCCL_BUILD_INFIX": "cuda12.6-llvm17" + }, + "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" + ], + "customizations": { + "vscode": { + "extensions": [ + "llvm-vs-code-extensions.vscode-clangd", + "xaver.clang-format", + "nvidia.nsight-vscode-edition", + "ms-vscode.cmake-tools" + ], + "settings": { + "editor.defaultFormatter": "xaver.clang-format", + "editor.formatOnSave": true, + "clang-format.executable": "/usr/bin/clang-format", + "clangd.arguments": [ + "--compile-commands-dir=${workspaceFolder}" + ], + "files.eol": "\n", + "files.trimTrailingWhitespace": true + } + } + }, + "name": "cuda12.6-llvm17" +} diff --git a/.devcontainer/cuda12.5-llvm10/devcontainer.json b/.devcontainer/cuda12.6-llvm18/devcontainer.json similarity index 88% rename from .devcontainer/cuda12.5-llvm10/devcontainer.json rename to .devcontainer/cuda12.6-llvm18/devcontainer.json index 8e3e19d4fc..80e92119f7 100644 --- a/.devcontainer/cuda12.5-llvm10/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm18/devcontainer.json @@ -1,6 
+1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-llvm10-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm18-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-llvm10", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-llvm18", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "10", - "CCCL_BUILD_INFIX": "cuda12.5-llvm10" + "CCCL_HOST_COMPILER_VERSION": "18", + "CCCL_BUILD_INFIX": "cuda12.6-llvm18" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-llvm10" + "name": "cuda12.6-llvm18" } diff --git a/.devcontainer/cuda12.5-llvm9/devcontainer.json b/.devcontainer/cuda12.6-llvm9/devcontainer.json similarity index 90% rename from .devcontainer/cuda12.5-llvm9/devcontainer.json rename to .devcontainer/cuda12.6-llvm9/devcontainer.json index d34ae01844..6ef30d1657 100644 --- a/.devcontainer/cuda12.5-llvm9/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm9/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-llvm9-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm9-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-llvm9", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-llvm9", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "9", - "CCCL_BUILD_INFIX": "cuda12.5-llvm9" + "CCCL_BUILD_INFIX": "cuda12.6-llvm9" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-llvm9" + "name": "cuda12.6-llvm9" } diff --git a/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json b/.devcontainer/cuda12.6-oneapi2023.2.0/devcontainer.json similarity index 91% rename from .devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json rename to .devcontainer/cuda12.6-oneapi2023.2.0/devcontainer.json index a530527cac..a7c7e3cdff 100644 --- a/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json +++ b/.devcontainer/cuda12.6-oneapi2023.2.0/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-oneapi2023.2.0-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-oneapi2023.2.0-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-oneapi2023.2.0", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-oneapi2023.2.0", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "oneapi", "CCCL_HOST_COMPILER_VERSION": "2023.2.0", - "CCCL_BUILD_INFIX": "cuda12.5-oneapi2023.2.0" + "CCCL_BUILD_INFIX": "cuda12.6-oneapi2023.2.0" }, 
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-oneapi2023.2.0" + "name": "cuda12.6-oneapi2023.2.0" } diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 0f3fbb36f5..666f0e6621 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-gcc13-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc13-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-gcc13", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-gcc13", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "13", - "CCCL_BUILD_INFIX": "cuda12.5-gcc13" + "CCCL_BUILD_INFIX": "cuda12.6-gcc13" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-gcc13" + "name": "cuda12.6-gcc13" } diff --git a/.github/actions/docs-build/action.yml b/.github/actions/docs-build/action.yml index 8b997f4741..db7f323174 100644 --- a/.github/actions/docs-build/action.yml +++ b/.github/actions/docs-build/action.yml @@ -38,6 +38,8 @@ runs: cp -rf ./docs/_build/docs/cudax/latest/* _site/cudax mkdir _site/cuda_cooperative cp -rf ./docs/_build/docs/cuda_cooperative/latest/* _site/cuda_cooperative + mkdir _site/cuda_parallel + cp -rf ./docs/_build/docs/cuda_parallel/latest/* _site/cuda_parallel ./docs/scrape_docs.bash ./_site # Update docs as workflow artifact: diff --git a/.github/actions/workflow-build/build-workflow.py b/.github/actions/workflow-build/build-workflow.py index a3b216e3fd..cd2aad01d9 100755 --- a/.github/actions/workflow-build/build-workflow.py +++ b/.github/actions/workflow-build/build-workflow.py @@ -580,6 +580,14 @@ def remove_dispatch_job_from_container(job, container): return False +def index_of_dispatch_job_in_container(job, container): + "Find the index of a dispatch job in a container, using compare_dispatch_jobs." + for idx, job2 in enumerate(container): + if compare_dispatch_jobs(job, job2): + return idx + return None + + def finalize_workflow_dispatch_groups(workflow_dispatch_groups_orig): workflow_dispatch_groups = copy.deepcopy(workflow_dispatch_groups_orig) @@ -614,7 +622,7 @@ def finalize_workflow_dispatch_groups(workflow_dispatch_groups_orig): producer = producers[0] if dispatch_job_in_container(producer, merged_producers): - producer_index = merged_producers.index(producers) + producer_index = index_of_dispatch_job_in_container(producer, merged_producers) matching_consumers = merged_consumers[producer_index] producer_name = producer['name'] diff --git a/CMakeLists.txt b/CMakeLists.txt index 198727dc5d..bddb6f1a73 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,6 +38,7 @@ option(CCCL_ENABLE_THRUST "Enable the Thrust developer build." ${CCCL_TOPLEVEL_P option(CCCL_ENABLE_TESTING "Enable CUDA C++ Core Library tests." ${CCCL_TOPLEVEL_PROJECT}) option(CCCL_ENABLE_EXAMPLES "Enable CUDA C++ Core Library examples." 
${CCCL_TOPLEVEL_PROJECT}) option(CCCL_ENABLE_BENCHMARKS "Enable CUDA C++ Core Library benchmarks." OFF) +option(CCCL_ENABLE_C "Enable CUDA C Core Library." OFF) option(CCCL_ENABLE_UNSTABLE "Enable targets and developer build options for unstable projects." OFF) @@ -45,6 +46,11 @@ if (CCCL_ENABLE_UNSTABLE) option(CCCL_ENABLE_CUDAX "Enable the CUDA Experimental developer build." ON) endif() +option(CCCL_DISABLE_EXCEPTIONS "Disable use of exceptions within CCCL libraries." OFF) +if (CCCL_DISABLE_EXCEPTIONS) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCCCL_DISABLE_EXCEPTIONS") +endif() + include(CTest) enable_testing() @@ -77,6 +83,10 @@ if (CCCL_ENABLE_UNSTABLE) add_subdirectory(cudax) endif() +if (CCCL_ENABLE_C) + add_subdirectory(c) +endif() + if (CCCL_ENABLE_TESTING) add_subdirectory(test) endif() diff --git a/CMakePresets.json b/CMakePresets.json index 10bdd83539..ecc9b22761 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -38,6 +38,7 @@ "CCCL_ENABLE_TESTING": true, "CCCL_ENABLE_EXAMPLES": true, "CCCL_ENABLE_BENCHMARKS": true, + "CCCL_ENABLE_C": true, "LIBCUDACXX_ENABLE_LIBCUDACXX_TESTS": true, "CUB_ENABLE_TESTING": true, "CUB_ENABLE_EXAMPLES": true, diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b28e0b9ea2..f2088f4338 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -3,6 +3,9 @@ Thank you for your interest in contributing to the CUDA C++ Core Libraries (CCCL)! +Looking for ideas for your first contribution? Check out: ![GitHub Issues or Pull Requests by label](https://img.shields.io/github/issues/nvidia/cccl/good%20first%20issue) + + ## Getting Started 1. **Fork & Clone the Repository**: diff --git a/benchmarks/scripts/cccl/bench/bench.py b/benchmarks/scripts/cccl/bench/bench.py index f93f2eff57..049dcbb601 100644 --- a/benchmarks/scripts/cccl/bench/bench.py +++ b/benchmarks/scripts/cccl/bench/bench.py @@ -19,7 +19,7 @@ def first_val(my_dict): first_value = values[0] if not all(value == first_value for value in values): - raise ValueError('All values in the dictionary are not equal') + raise ValueError('All values in the dictionary are not equal. 
First value: {} All values: {}'.format(first_value, values)) return first_value @@ -648,11 +648,11 @@ def do_run(self, ct_point, rt_values, timeout, is_search=True): p.wait(timeout=timeout) elapsed = time.time() - begin - logger.info("finished benchmark {} with {} ({}) in {}s".format(self.label(), ct_point, p.returncode, elapsed)) + logger.info("finished benchmark {} with {} ({}) in {:.3f}s".format(self.label(), ct_point, p.returncode, elapsed)) return BenchResult(result_path, p.returncode, elapsed) except subprocess.TimeoutExpired: - logger.info("benchmark {} with {} reached timeout of {}s".format(self.label(), ct_point, timeout)) + logger.info("benchmark {} with {} reached timeout of {:.3f}s".format(self.label(), ct_point, timeout)) os.killpg(os.getpgid(p.pid), signal.SIGTERM) return BenchResult(None, 42, float('inf')) diff --git a/benchmarks/scripts/cccl/bench/cmake.py b/benchmarks/scripts/cccl/bench/cmake.py index 095531a005..4340c999c6 100644 --- a/benchmarks/scripts/cccl/bench/cmake.py +++ b/benchmarks/scripts/cccl/bench/cmake.py @@ -80,7 +80,7 @@ def do_build(self, bench, timeout): stderr=subprocess.DEVNULL) p.wait(timeout=timeout) elapsed = time.time() - begin - logger.info("finished build for {} ({}) in {}s".format(bench.label(), p.returncode, elapsed)) + logger.info("finished build for {} (exit code: {}) in {:.3f}s".format(bench.label(), p.returncode, elapsed)) return Build(p.returncode, elapsed) except subprocess.TimeoutExpired: diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt new file mode 100644 index 0000000000..3e3783903b --- /dev/null +++ b/c/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 3.30) + +project(cccl.c LANGUAGES CUDA CXX) + +add_library(cccl.c SHARED src/reduce.cu) +set_property(TARGET cccl.c PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET cccl.c PROPERTY CXX_STANDARD 20) +set_property(TARGET cccl.c PROPERTY CUDA_STANDARD 20) + +find_package(CUDAToolkit REQUIRED) + +# TODO Use static versions of cudart, nvrtc, and nvJitLink +target_link_libraries(cccl.c PRIVATE CUDA::cudart + CUDA::nvrtc + CUDA::nvJitLink + CUDA::cuda_driver) +target_compile_definitions(cccl.c PRIVATE NVRTC_GET_TYPE_NAME=1 CCCL_C_EXPERIMENTAL=1) +target_include_directories(cccl.c PUBLIC "include") + +add_subdirectory(test) diff --git a/c/include/cccl/reduce.h b/c/include/cccl/reduce.h new file mode 100644 index 0000000000..5047625a85 --- /dev/null +++ b/c/include/cccl/reduce.h @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#ifndef CCCL_C_EXPERIMENTAL +# warning "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this warning." +#else // ^^^ !CCCL_C_EXPERIMENTAL ^^^ / vvv CCCL_C_EXPERIMENTAL vvv + +# include <cuda.h> + +# include <cccl/types.h> + +struct cccl_device_reduce_build_result_t +{ + int cc; + void* cubin; + size_t cubin_size; + CUlibrary library; + CUkernel single_tile_kernel; + CUkernel single_tile_second_kernel; + CUkernel reduction_kernel; +}; + +// TODO return a union of nvtx/cuda/nvrtc errors or a string?
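+// Usage sketch (assumed workflow, mirroring the standard CUB two-phase pattern used in reduce.cu below): call cccl_device_reduce_build once per operator/type combination; call cccl_device_reduce with d_temp_storage == nullptr to query temp_storage_bytes; allocate that many device bytes; call cccl_device_reduce again to run the reduction; finally release the JIT-compiled kernels with cccl_device_reduce_cleanup.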
+extern "C" CCCL_C_API CUresult cccl_device_reduce_build( + cccl_device_reduce_build_result_t* build, + cccl_iterator_t d_in, + cccl_iterator_t d_out, + cccl_op_t op, + cccl_value_t init, + int cc_major, + int cc_minor, + const char* cub_path, + const char* thrust_path, + const char* libcudacxx_path, + const char* ctk_path) noexcept; + +extern "C" CCCL_C_API CUresult cccl_device_reduce( + cccl_device_reduce_build_result_t build, + void* d_temp_storage, + size_t* temp_storage_bytes, + cccl_iterator_t d_in, + cccl_iterator_t d_out, + unsigned long long num_items, + cccl_op_t op, + cccl_value_t init, + CUstream stream) noexcept; + +extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr); + +#endif // CCCL_C_EXPERIMENTAL diff --git a/c/include/cccl/types.h b/c/include/cccl/types.h new file mode 100644 index 0000000000..781b9f9ea6 --- /dev/null +++ b/c/include/cccl/types.h @@ -0,0 +1,85 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#ifndef CCCL_C_EXPERIMENTAL +# warning "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this warning." +#else // ^^^ !CCCL_C_EXPERIMENTAL ^^^ / vvv CCCL_C_EXPERIMENTAL vvv + +# if defined(_WIN32) +# define CCCL_C_API __declspec(dllexport) +# else +# define CCCL_C_API __attribute__((visibility("default"))) +# endif + +enum class cccl_type_enum +{ + INT8 = 0, + INT16 = 1, + INT32 = 2, + INT64 = 3, + UINT8 = 4, + UINT16 = 5, + UINT32 = 6, + UINT64 = 7, + FLOAT32 = 8, + FLOAT64 = 9, + STORAGE = 10 +}; + +struct cccl_type_info +{ + int size; + int alignment; + cccl_type_enum type; +}; + +enum class cccl_op_kind_t +{ + stateless = 0, + stateful = 1 +}; + +struct cccl_op_t +{ + cccl_op_kind_t type; + const char* name; + const char* ltoir; + int ltoir_size; + int size; + int alignment; + void* state; +}; + +enum class cccl_iterator_kind_t +{ + pointer = 0, + iterator = 1 +}; + +struct cccl_value_t +{ + cccl_type_info type; + void* state; +}; + +struct cccl_iterator_t +{ + int size; + int alignment; + cccl_iterator_kind_t type; + cccl_op_t advance; + cccl_op_t dereference; + cccl_type_info value_type; + void* state; +}; + +#endif // CCCL_C_EXPERIMENTAL diff --git a/c/src/reduce.cu b/c/src/reduce.cu new file mode 100644 index 0000000000..4badcd1ff0 --- /dev/null +++ b/c/src/reduce.cu @@ -0,0 +1,864 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include <cub/detail/choose_offset.cuh> +#include <cub/device/dispatch/dispatch_reduce.cuh> +#include <cub/grid/grid_even_share.cuh> + +#include <cuda/std/cstdint> +#include <cuda/std/functional> + +#include <format> +#include <iostream> +#include <memory> + +#include <cccl/reduce.h> +#include <nvJitLink.h> +#include <nvrtc.h> + +void check(nvrtcResult result) +{ + if (result != NVRTC_SUCCESS) + { + throw std::runtime_error(std::string("NVRTC error: ") + nvrtcGetErrorString(result)); + } +} + +void check(CUresult result) +{ + if (result != CUDA_SUCCESS) + { + const char* str = nullptr; + cuGetErrorString(result, &str); + throw std::runtime_error(std::string("CUDA error: ") + str); + } +} + +void check(nvJitLinkResult result) +{ + if (result != NVJITLINK_SUCCESS) + { + throw std::runtime_error(std::string("nvJitLink error: ") + std::to_string(result)); + } +} + +struct op_wrapper; +struct device_reduce_policy; +using TransformOpT = ::cuda::std::__identity; +using OffsetT = unsigned long long; +static_assert(std::is_same_v<cub::detail::choose_offset_t<OffsetT>, OffsetT>, "OffsetT must be size_t"); + +struct nothing_t +{}; + +struct runtime_tuning_policy +{ + int block_size; + int items_per_thread; + int vector_load_length; +}; + +struct storage_t; +struct input_iterator_state_t; +struct output_iterator_t; + +char const* cccl_type_enum_to_string(cccl_type_enum type) +{ + switch (type) + { + case cccl_type_enum::INT8: + return "::cuda::std::int8_t"; + case cccl_type_enum::INT16: + return "::cuda::std::int16_t"; + case cccl_type_enum::INT32: + return "::cuda::std::int32_t"; + case cccl_type_enum::INT64: + return "::cuda::std::int64_t"; + case cccl_type_enum::UINT8: + return "::cuda::std::uint8_t"; + case cccl_type_enum::UINT16: + return "::cuda::std::uint16_t"; + case cccl_type_enum::UINT32: + return "::cuda::std::uint32_t"; + case cccl_type_enum::UINT64: + return "::cuda::std::uint64_t"; + case cccl_type_enum::FLOAT32: + return "float"; + case cccl_type_enum::FLOAT64: + return "double"; + case cccl_type_enum::STORAGE: + return "storage_t"; + } + return "unknown"; +} + +std::string cccl_type_enum_to_name(cccl_type_enum type, bool is_pointer = false) +{ + std::string result; + + if (is_pointer) + { + switch (type) + { + case cccl_type_enum::INT8: + + check(nvrtcGetTypeName<::cuda::std::int8_t*>(&result)); + break; + case cccl_type_enum::INT16: + check(nvrtcGetTypeName<::cuda::std::int16_t*>(&result)); + break; + case cccl_type_enum::INT32: + check(nvrtcGetTypeName<::cuda::std::int32_t*>(&result)); + break; + case cccl_type_enum::INT64: + check(nvrtcGetTypeName<::cuda::std::int64_t*>(&result)); + break; + case cccl_type_enum::UINT8: + check(nvrtcGetTypeName<::cuda::std::uint8_t*>(&result)); + break; + case cccl_type_enum::UINT16: + check(nvrtcGetTypeName<::cuda::std::uint16_t*>(&result)); + break; + case cccl_type_enum::UINT32: + check(nvrtcGetTypeName<::cuda::std::uint32_t*>(&result)); + break; + case cccl_type_enum::UINT64: + check(nvrtcGetTypeName<::cuda::std::uint64_t*>(&result)); + break; + case cccl_type_enum::FLOAT32: + check(nvrtcGetTypeName<float*>(&result)); + break; + case cccl_type_enum::FLOAT64: + check(nvrtcGetTypeName<double*>(&result)); + break; + case cccl_type_enum::STORAGE: + check(nvrtcGetTypeName<storage_t*>(&result)); + break; + } + } + else + { + switch (type) + { + case cccl_type_enum::INT8: + check(nvrtcGetTypeName<::cuda::std::int8_t>(&result)); + break; + case cccl_type_enum::INT16: + check(nvrtcGetTypeName<::cuda::std::int16_t>(&result)); + break; + case cccl_type_enum::INT32: + check(nvrtcGetTypeName<::cuda::std::int32_t>(&result)); + break; + case cccl_type_enum::INT64: + check(nvrtcGetTypeName<::cuda::std::int64_t>(&result)); + break; + case
cccl_type_enum::UINT8: + check(nvrtcGetTypeName<::cuda::std::uint8_t>(&result)); + break; + case cccl_type_enum::UINT16: + check(nvrtcGetTypeName<::cuda::std::uint16_t>(&result)); + break; + case cccl_type_enum::UINT32: + check(nvrtcGetTypeName<::cuda::std::uint32_t>(&result)); + break; + case cccl_type_enum::UINT64: + check(nvrtcGetTypeName<::cuda::std::uint64_t>(&result)); + break; + case cccl_type_enum::FLOAT32: + check(nvrtcGetTypeName<float>(&result)); + break; + case cccl_type_enum::FLOAT64: + check(nvrtcGetTypeName<double>(&result)); + break; + case cccl_type_enum::STORAGE: + check(nvrtcGetTypeName<storage_t>(&result)); + break; + } + } + + return result; +} + +struct reduce_tuning_t +{ + int cc; + int block_size; + int items_per_thread; + int vector_load_length; +}; + +template <int N> +reduce_tuning_t find_tuning(int cc, const reduce_tuning_t (&tunings)[N]) +{ + for (const reduce_tuning_t& tuning : tunings) + { + if (cc >= tuning.cc) + { + return tuning; + } + } + + return tunings[N - 1]; +} + +runtime_tuning_policy get_policy(int cc, cccl_type_info accumulator_type, cccl_type_info input_type) +{ + reduce_tuning_t chain[] = {{60, 256, 16, 4}, {35, 256, 20, 4}}; + + auto [_, block_size, items_per_thread, vector_load_length] = find_tuning(cc, chain); + + // Implement part of MemBoundScaling + items_per_thread = CUB_MAX(1, CUB_MIN(items_per_thread * 4 / accumulator_type.size, items_per_thread * 2)); + block_size = CUB_MIN(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32); + + return {block_size, items_per_thread, vector_load_length}; +} + +cccl_type_info get_accumulator_type(cccl_op_t op, cccl_iterator_t input_it, cccl_value_t init) +{ + // TODO Should be decltype(op(init, *input_it)) but haven't implemented type arithmetic yet + // so switching back to the old accumulator type logic for now + return init.type; +} + +cudaError_t InvokeSingleTile( + void* d_temp_storage, + std::size_t& temp_storage_bytes, + cccl_iterator_t d_in, + cccl_iterator_t d_out, + unsigned long long num_items, + cccl_op_t op, + cccl_value_t init, + int cc, + CUkernel single_tile_kernel, + CUstream stream) +{ + const runtime_tuning_policy policy = get_policy(cc, d_in.value_type, d_in.value_type); + + cudaError error = cudaSuccess; + do + { + if (d_temp_storage == nullptr) + { + temp_storage_bytes = 1; + break; + } + + nothing_t nothing{}; + TransformOpT transform_op{}; + void* op_state = op.type == cccl_op_kind_t::stateless ? &nothing : op.state; + void* in_ptr = d_in.type == cccl_iterator_kind_t::pointer ? &d_in.state : d_in.state; + void* out_ptr = d_out.type == cccl_iterator_kind_t::pointer ?
&d_out.state : d_out.state; + void* args[] = {in_ptr, out_ptr, &num_items, op_state, init.state, &transform_op}; + + check(cuLaunchKernel((CUfunction) single_tile_kernel, 1, 1, 1, policy.block_size, 1, 1, 0, stream, args, 0)); + + // Check for failure to launch + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) + { + break; + } + } while (0); + + return error; +} + +cudaError_t InvokePasses( + void* d_temp_storage, + std::size_t& temp_storage_bytes, + cccl_iterator_t d_in, + cccl_iterator_t d_out, + unsigned long long num_items, + cccl_op_t op, + cccl_value_t init, + int cc, + CUkernel reduce_kernel, + CUkernel single_tile_kernel, + CUdevice device, + CUstream stream) +{ + const cccl_type_info accum_t = get_accumulator_type(op, d_in, init); + const runtime_tuning_policy policy = get_policy(cc, accum_t, d_in.value_type); + + cudaError error = cudaSuccess; + do + { + void* in_ptr = d_in.type == cccl_iterator_kind_t::pointer ? &d_in.state : d_in.state; + void* out_ptr = d_out.type == cccl_iterator_kind_t::pointer ? &d_out.state : d_out.state; + + // Get SM count + int sm_count; + check(cuDeviceGetAttribute(&sm_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device)); + + // Init regular kernel configuration + const auto tile_size = policy.block_size * policy.items_per_thread; + + // Older drivers have issues handling CUkernel in the occupancy queries, get the CUfunction instead. + // Assumes that the current device is properly set, it needs to be set for the occupancy queries anyway + CUfunction reduce_kernel_fn; + check(cuKernelGetFunction(&reduce_kernel_fn, reduce_kernel)); + + int sm_occupancy = 1; + check(cuOccupancyMaxActiveBlocksPerMultiprocessor(&sm_occupancy, reduce_kernel_fn, policy.block_size, 0)); + + int reduce_device_occupancy = sm_occupancy * sm_count; + + // Even-share work distribution + int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(0); + cub::GridEvenShare<OffsetT> even_share; + even_share.DispatchInit(num_items, max_blocks, tile_size); + + // Temporary storage allocation requirements + void* allocations[1] = {}; + size_t allocation_sizes[1] = { + max_blocks * static_cast<std::size_t>(d_in.value_type.size) // bytes needed for privatized block reductions + }; + + // Alias the temporary allocations from the single storage blob (or + // compute the necessary size of the blob) + error = CubDebug(cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (cudaSuccess != error) + { + break; + } + + if (d_temp_storage == nullptr) + { + // Return if the caller is simply requesting the size of the storage + // allocation + return cudaSuccess; + } + + // Get grid size for device_reduce_sweep_kernel + OffsetT reduce_grid_size = even_share.grid_size; + + // Invoke DeviceReduceKernel + // reduce_kernel<<<reduce_grid_size, policy.block_size, 0, stream>>>( + // d_in, d_block_reductions, num_items, even_share, ReductionOpT{}, TransformOpT{}); + + nothing_t nothing{}; + void* op_state = op.type == cccl_op_kind_t::stateless ?
&nothing : op.state; + + TransformOpT transform_op{}; + void* reduce_args[] = {in_ptr, &allocations[0], &num_items, &even_share, op_state, &transform_op}; + + check(cuLaunchKernel( + (CUfunction) reduce_kernel, reduce_grid_size, 1, 1, policy.block_size, 1, 1, 0, stream, reduce_args, 0)); + + // Check for failure to launch + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) + { + break; + } + + // single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS>>>( + // d_block_reductions, d_out, reduce_grid_size, ReductionOpT{}, 0, TransformOpT{}); + + void* single_tile_kernel_args[] = {&allocations[0], out_ptr, &reduce_grid_size, op_state, init.state, &transform_op}; + + check(cuLaunchKernel( + (CUfunction) single_tile_kernel, 1, 1, 1, policy.block_size, 1, 1, 0, stream, single_tile_kernel_args, 0)); + + // Check for failure to launch + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) + { + break; + } + } while (0); + + return error; +} + +cudaError_t Invoke( + void* d_temp_storage, + std::size_t& temp_storage_bytes, + cccl_iterator_t d_in, + cccl_iterator_t d_out, + OffsetT num_items, + cccl_op_t op, + cccl_value_t init, + int cc, + CUkernel single_tile_kernel, + CUkernel single_tile_second_kernel, + CUkernel reduce_kernel, + CUdevice device, + CUstream stream) +{ + const cccl_type_info accum_t = get_accumulator_type(op, d_in, init); + runtime_tuning_policy policy = get_policy(cc, accum_t, d_in.value_type); + + // Force kernel code-generation in all compiler passes + if (num_items <= (policy.block_size * policy.items_per_thread)) + { + // Small, single tile size + return InvokeSingleTile( + d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, op, init, cc, single_tile_kernel, stream); + } + else + { + // Multi-tile pass + return InvokePasses( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + op, + init, + cc, + reduce_kernel, + single_tile_second_kernel, + device, + stream); + } +} + +std::string get_input_iterator_name() +{ + std::string iterator_t; + check(nvrtcGetTypeName<input_iterator_state_t>(&iterator_t)); + return iterator_t; +} + +std::string get_output_iterator_name() +{ + std::string iterator_t; + check(nvrtcGetTypeName<output_iterator_t>(&iterator_t)); + return iterator_t; +} + +std::string get_single_tile_kernel_name( + cccl_iterator_t input_it, cccl_iterator_t output_it, cccl_op_t op, cccl_value_t init, bool is_second_kernel) +{ + std::string chained_policy_t; + check(nvrtcGetTypeName<device_reduce_policy>(&chained_policy_t)); + + const cccl_type_info accum_t = get_accumulator_type(op, input_it, init); + const std::string accum_cpp_t = cccl_type_enum_to_name(accum_t.type); + const std::string input_iterator_t = + is_second_kernel ? cccl_type_enum_to_name(accum_t.type, true) + : input_it.type == cccl_iterator_kind_t::pointer // + ? cccl_type_enum_to_name(input_it.value_type.type, true) // + : get_input_iterator_name(); + const std::string output_iterator_t = + output_it.type == cccl_iterator_kind_t::pointer // + ?
cccl_type_enum_to_name(output_it.value_type.type, true) // + : get_output_iterator_name(); + const std::string init_t = cccl_type_enum_to_name(init.type.type); + + std::string offset_t; + check(nvrtcGetTypeName<OffsetT>(&offset_t)); + + std::string reduction_op_t; + check(nvrtcGetTypeName<op_wrapper>(&reduction_op_t)); + + return std::format( + "cub::DeviceReduceSingleTileKernel<{0}, {1}, {2}, {3}, {4}, {5}, {6}>", + chained_policy_t, + input_iterator_t, + output_iterator_t, + offset_t, + reduction_op_t, + init_t, + accum_cpp_t); +} + +std::string get_device_reduce_kernel_name(cccl_op_t op, cccl_iterator_t input_it, cccl_value_t init) +{ + std::string chained_policy_t; + check(nvrtcGetTypeName<device_reduce_policy>(&chained_policy_t)); + + const std::string input_iterator_t = + input_it.type == cccl_iterator_kind_t::pointer // + ? cccl_type_enum_to_name(input_it.value_type.type, true) // + : get_input_iterator_name(); + + const std::string accum_t = cccl_type_enum_to_name(get_accumulator_type(op, input_it, init).type); + + std::string offset_t; + check(nvrtcGetTypeName<OffsetT>(&offset_t)); + + std::string reduction_op_t; + check(nvrtcGetTypeName<op_wrapper>(&reduction_op_t)); + + std::string transform_op_t; + check(nvrtcGetTypeName<TransformOpT>(&transform_op_t)); + + return std::format( + "cub::DeviceReduceKernel<{0}, {1}, {2}, {3}, {4}, {5}>", + chained_policy_t, + input_iterator_t, + offset_t, + reduction_op_t, + accum_t, + transform_op_t); +} + +bool try_push_context() +{ + CUcontext context = nullptr; + + check(cuCtxGetCurrent(&context)); + + if (context == nullptr) + { + const int default_device = 0; + check(cuDevicePrimaryCtxRetain(&context, default_device)); + check(cuCtxPushCurrent(context)); + + return true; + } + + return false; +} + +extern "C" CCCL_C_API CUresult cccl_device_reduce_build( + cccl_device_reduce_build_result_t* build, + cccl_iterator_t input_it, + cccl_iterator_t output_it, + cccl_op_t op, + cccl_value_t init, + int cc_major, + int cc_minor, + const char* cub_path, + const char* thrust_path, + const char* libcudacxx_path, + const char* ctk_path) noexcept +{ + CUresult error = CUDA_SUCCESS; + + try + { + nvrtcProgram prog{}; + const char* name = "test"; + + const int cc = cc_major * 10 + cc_minor; + const cccl_type_info accum_t = get_accumulator_type(op, input_it, init); + const std::string accum_cpp = cccl_type_enum_to_string(accum_t.type); + const runtime_tuning_policy policy = get_policy(cc, accum_t, input_it.value_type); + const std::string input_it_value_t = cccl_type_enum_to_string(input_it.value_type.type); + const std::string offset_t = cccl_type_enum_to_string(cccl_type_enum::UINT64); + + const std::string input_iterator_src = + input_it.type == cccl_iterator_kind_t::pointer + ?
std::string{} + : std::format( + "extern \"C\" __device__ {3} {4}(const void *self_ptr);\n" + "extern \"C\" __device__ void {5}(void *self_ptr, {0} offset);\n" + "struct __align__({2}) input_iterator_state_t {{\n;" + " using iterator_category = cuda::std::random_access_iterator_tag;\n" + " using value_type = {3};\n" + " using difference_type = {0};\n" + " using pointer = {3}*;\n" + " using reference = {3}&;\n" + " __device__ value_type operator*() const {{ return {4}(this); }}\n" + " __device__ input_iterator_state_t& operator+=(difference_type diff) {{\n" + " {5}(this, diff);\n" + " return *this;\n" + " }}\n" + " __device__ value_type operator[](difference_type diff) const {{\n" + " return *(*this + diff);\n" + " }}\n" + " __device__ input_iterator_state_t operator+(difference_type diff) const {{\n" + " input_iterator_state_t result = *this;\n" + " result += diff;\n" + " return result;\n" + " }}\n" + " char data[{1}];\n" + "}};\n", + offset_t, // 0 + input_it.size, // 1 + input_it.alignment, // 2 + input_it_value_t, // 3 + input_it.dereference.name, // 4 + input_it.advance.name); // 5 + + const std::string output_iterator_src = + output_it.type == cccl_iterator_kind_t::pointer + ? std::string{} + : std::format( + "extern \"C\" __device__ void {2}(const void *self_ptr, {1} x);\n" + "extern \"C\" __device__ void {3}(void *self_ptr, {0} offset);\n" + "struct __align__({5}) output_iterator_state_t{{\n" + " char data[{4}];\n" + "}};\n" + "struct output_iterator_proxy_t {{\n" + " __device__ output_iterator_proxy_t operator=({1} x) {{\n" + " {2}(&state, x);\n" + " return *this;\n" + " }}\n" + " output_iterator_state_t state;\n" + "}};\n" + "struct output_iterator_t {{\n" + " using iterator_category = cuda::std::random_access_iterator_tag;\n" + " using difference_type = {0};\n" + " using value_type = void;\n" + " using pointer = output_iterator_proxy_t*;\n" + " using reference = output_iterator_proxy_t;\n" + " __device__ output_iterator_proxy_t operator*() const {{ return {{state}}; }}\n" + " __device__ output_iterator_t& operator+=(difference_type diff) {{\n" + " {3}(&state, diff);\n" + " return *this;\n" + " }}\n" + " __device__ output_iterator_proxy_t operator[](difference_type diff) const {{\n" + " output_iterator_t result = *this;\n" + " result += diff;\n" + " return {{ result.state }};\n" + " }}\n" + " __device__ output_iterator_t operator+(difference_type diff) const {{\n" + " output_iterator_t result = *this;\n" + " result += diff;\n" + " return result;\n" + " }}\n" + " output_iterator_state_t state;\n" + "}};", + offset_t, // 0 + accum_cpp, // 1 + output_it.dereference.name, // 2 + output_it.advance.name, // 3 + output_it.size, // 4 + output_it.alignment); // 5 + + const std::string op_src = + op.type == cccl_op_kind_t::stateless + ? 
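+      // For a stateless user op named "op" over ints, the first branch below
+      // formats to (illustrative):
+      //   extern "C" __device__ int op(int lhs, int rhs);
+      //   struct op_wrapper {
+      //     __device__ int operator()(int lhs, int rhs) const { return op(lhs, rhs); }
+      //   };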
std::format(
+          "extern \"C\" __device__ {0} {1}({0} lhs, {0} rhs);\n"
+          "struct op_wrapper {{\n"
+          "  __device__ {0} operator()({0} lhs, {0} rhs) const {{\n"
+          "    return {1}(lhs, rhs);\n"
+          "  }}\n"
+          "}};\n",
+          accum_cpp,
+          op.name)
+      : std::format(
+          "struct __align__({2}) op_state {{\n"
+          "  char data[{3}];\n"
+          "}};"
+          "extern \"C\" __device__ {0} {1}(op_state *state, {0} lhs, {0} rhs);\n"
+          "struct op_wrapper {{\n"
+          "  op_state state;\n"
+          "  __device__ {0} operator()({0} lhs, {0} rhs) {{\n"
+          "    return {1}(&state, lhs, rhs);\n"
+          "  }}\n"
+          "}};\n",
+          accum_cpp,
+          op.name,
+          op.alignment,
+          op.size);
+
+    const std::string src = std::format(
+      "#include \n"
+      "#include \n"
+      "struct __align__({1}) storage_t {{\n"
+      "  char data[{0}];\n"
+      "}};\n"
+      "{4}\n"
+      "{5}\n"
+      "struct agent_policy_t {{\n"
+      "  static constexpr int ITEMS_PER_THREAD = {2};\n"
+      "  static constexpr int BLOCK_THREADS = {3};\n"
+      "  static constexpr int VECTOR_LOAD_LENGTH = {7};\n"
+      "  static constexpr cub::BlockReduceAlgorithm BLOCK_ALGORITHM = cub::BLOCK_REDUCE_WARP_REDUCTIONS;\n"
+      "  static constexpr cub::CacheLoadModifier LOAD_MODIFIER = cub::LOAD_LDG;\n"
+      "}};\n"
+      "struct device_reduce_policy {{\n"
+      "  struct ActivePolicy {{\n"
+      "    using ReducePolicy = agent_policy_t;\n"
+      "    using SingleTilePolicy = agent_policy_t;\n"
+      "  }};\n"
+      "}};\n"
+      "{6};\n",
+      input_it.value_type.size, // 0
+      input_it.value_type.alignment, // 1
+      policy.items_per_thread, // 2
+      policy.block_size, // 3
+      input_iterator_src, // 4
+      output_iterator_src, // 5
+      op_src, // 6
+      policy.vector_load_length); // 7
+
+    check(nvrtcCreateProgram(&prog, src.c_str(), name, 0, nullptr, nullptr));
+
+    std::string single_tile_kernel_name = get_single_tile_kernel_name(input_it, output_it, op, init, false);
+    check(nvrtcAddNameExpression(prog, single_tile_kernel_name.c_str()));
+
+    std::string single_tile_second_kernel_name = get_single_tile_kernel_name(input_it, output_it, op, init, true);
+    check(nvrtcAddNameExpression(prog, single_tile_second_kernel_name.c_str()));
+
+    std::string reduction_kernel_name = get_device_reduce_kernel_name(op, input_it, init);
+    check(nvrtcAddNameExpression(prog, reduction_kernel_name.c_str()));
+
+    const std::string arch = std::format("-arch=sm_{0}{1}", cc_major, cc_minor);
+
+    constexpr int num_args = 7;
+    const char* args[num_args] = {arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto"};
+
+    std::size_t log_size{};
+    nvrtcResult compile_result = nvrtcCompileProgram(prog, num_args, args);
+
+    check(nvrtcGetProgramLogSize(prog, &log_size));
+
+    std::unique_ptr<char[]> log{new char[log_size]};
+    check(nvrtcGetProgramLog(prog, log.get()));
+
+    if (log_size > 1)
+    {
+      std::cerr << log.get() << std::endl;
+    }
+
+    const char* single_tile_kernel_lowered_name;
+    check(nvrtcGetLoweredName(prog, single_tile_kernel_name.c_str(), &single_tile_kernel_lowered_name));
+
+    const char* single_tile_second_kernel_lowered_name;
+    check(nvrtcGetLoweredName(prog, single_tile_second_kernel_name.c_str(), &single_tile_second_kernel_lowered_name));
+
+    const char* reduction_kernel_lowered_name;
+    check(nvrtcGetLoweredName(prog, reduction_kernel_name.c_str(), &reduction_kernel_lowered_name));
+
+    // Copy lowered names to a std::unique_ptr to ensure they can be used after
+    // the program is destroyed
+
+    std::unique_ptr<char[]> single_tile_kernel_lowered_name_ptr{new char[strlen(single_tile_kernel_lowered_name) + 1]};
+    strcpy(single_tile_kernel_lowered_name_ptr.get(), single_tile_kernel_lowered_name);
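+    // Note: compile_result is deliberately not checked until after the program
+    // log has been fetched and printed, so NVRTC diagnostics still reach
+    // stderr when compilation fails.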
+    std::unique_ptr<char[]> single_tile_second_kernel_lowered_name_ptr{
+      new char[strlen(single_tile_second_kernel_lowered_name) + 1]};
+    strcpy(single_tile_second_kernel_lowered_name_ptr.get(), single_tile_second_kernel_lowered_name);
+
+    std::unique_ptr<char[]> reduction_kernel_lowered_name_ptr{new char[strlen(reduction_kernel_lowered_name) + 1]};
+    strcpy(reduction_kernel_lowered_name_ptr.get(), reduction_kernel_lowered_name);
+
+    check(compile_result);
+
+    std::size_t ltoir_size{};
+    check(nvrtcGetLTOIRSize(prog, &ltoir_size));
+    std::unique_ptr<char[]> ltoir{new char[ltoir_size]};
+    check(nvrtcGetLTOIR(prog, ltoir.get()));
+    check(nvrtcDestroyProgram(&prog));
+
+    nvJitLinkHandle handle;
+    const char* lopts[] = {"-lto", arch.c_str()};
+    check(nvJitLinkCreate(&handle, 2, lopts));
+
+    check(nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, ltoir.get(), ltoir_size, name));
+    check(nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, op.ltoir, op.ltoir_size, name));
+
+    if (input_it.type == cccl_iterator_kind_t::iterator)
+    {
+      check(nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, input_it.advance.ltoir, input_it.advance.ltoir_size, name));
+      check(nvJitLinkAddData(
+        handle, NVJITLINK_INPUT_LTOIR, input_it.dereference.ltoir, input_it.dereference.ltoir_size, name));
+    }
+
+    if (output_it.type == cccl_iterator_kind_t::iterator)
+    {
+      check(
+        nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, output_it.advance.ltoir, output_it.advance.ltoir_size, name));
+      check(nvJitLinkAddData(
+        handle, NVJITLINK_INPUT_LTOIR, output_it.dereference.ltoir, output_it.dereference.ltoir_size, name));
+    }
+
+    check(nvJitLinkComplete(handle));
+
+    std::size_t cubin_size{};
+    check(nvJitLinkGetLinkedCubinSize(handle, &cubin_size));
+    std::unique_ptr<char[]> cubin{new char[cubin_size]};
+    check(nvJitLinkGetLinkedCubin(handle, cubin.get()));
+    check(nvJitLinkDestroy(&handle));
+
+    check(cuLibraryLoadData(&build->library, cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0));
+    check(cuLibraryGetKernel(&build->single_tile_kernel, build->library, single_tile_kernel_lowered_name_ptr.get()));
+    check(cuLibraryGetKernel(
+      &build->single_tile_second_kernel, build->library, single_tile_second_kernel_lowered_name_ptr.get()));
+    check(cuLibraryGetKernel(&build->reduction_kernel, build->library, reduction_kernel_lowered_name_ptr.get()));
+
+    build->cc = cc;
+    build->cubin = cubin.release();
+    build->cubin_size = cubin_size;
+  }
+  catch (...)
+  {
+    error = CUDA_ERROR_UNKNOWN;
+  }
+
+  return error;
+}
+
+extern "C" CCCL_C_API CUresult cccl_device_reduce(
+  cccl_device_reduce_build_result_t build,
+  void* d_temp_storage,
+  size_t* temp_storage_bytes,
+  cccl_iterator_t d_in,
+  cccl_iterator_t d_out,
+  unsigned long long num_items,
+  cccl_op_t op,
+  cccl_value_t init,
+  CUstream stream) noexcept
+{
+  bool pushed = false;
+  CUresult error = CUDA_SUCCESS;
+  try
+  {
+    pushed = try_push_context();
+
+    CUdevice cu_device;
+    check(cuCtxGetDevice(&cu_device));
+
+    Invoke(
+      d_temp_storage,
+      *temp_storage_bytes,
+      d_in,
+      d_out,
+      num_items,
+      op,
+      init,
+      build.cc,
+      build.single_tile_kernel,
+      build.single_tile_second_kernel,
+      build.reduction_kernel,
+      cu_device,
+      stream);
+  }
+  catch (...)
+  {
+    error = CUDA_ERROR_UNKNOWN;
+  }
+
+  if (pushed)
+  {
+    CUcontext dummy;
+    cuCtxPopCurrent(&dummy);
+  }
+
+  return error;
+}
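+// Typical call sequence (illustrative sketch; mirrors c/test/test_reduce.cpp):
+//   cccl_device_reduce_build_result_t build;
+//   cccl_device_reduce_build(&build, in, out, op, init, cc_major, cc_minor, ...);
+//   size_t tmp_bytes = 0;
+//   cccl_device_reduce(build, nullptr, &tmp_bytes, in, out, n, op, init, stream); // size query
+//   /* allocate tmp_bytes of device memory as tmp */
+//   cccl_device_reduce(build, tmp, &tmp_bytes, in, out, n, op, init, stream);     // reduction
+//   cccl_device_reduce_cleanup(&build);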
+
+extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr)
+{
+  try
+  {
+    if (bld_ptr == nullptr)
+    {
+      return CUDA_ERROR_INVALID_VALUE;
+    }
+
+    std::unique_ptr<char[]> cubin(reinterpret_cast<char*>(bld_ptr->cubin));
+    check(cuLibraryUnload(bld_ptr->library));
+  }
+  catch (...)
+  {
+    return CUDA_ERROR_UNKNOWN;
+  }
+
+  return CUDA_SUCCESS;
+}
diff --git a/c/test/CMakeLists.txt b/c/test/CMakeLists.txt
new file mode 100644
index 0000000000..6a8599500e
--- /dev/null
+++ b/c/test/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_executable(cccl.c.test.reduce test_reduce.cpp test_main.cpp)
+
+target_link_libraries(cccl.c.test.reduce PRIVATE cccl.c CUDA::cudart CUDA::nvrtc Catch2::Catch2)
+
+target_compile_definitions(cccl.c.test.reduce PRIVATE CCCL_C_EXPERIMENTAL
+                           TEST_CUB_PATH="-I${CCCL_SOURCE_DIR}/cub"
+                           TEST_THRUST_PATH="-I${CCCL_SOURCE_DIR}/thrust"
+                           TEST_LIBCUDACXX_PATH="-I${CCCL_SOURCE_DIR}/libcudacxx/include"
+                           TEST_CTK_PATH="-I${CUDAToolkit_INCLUDE_DIRS}")
diff --git a/c/test/c2h.h b/c/test/c2h.h
new file mode 100644
index 0000000000..e2b26895a8
--- /dev/null
+++ b/c/test/c2h.h
@@ -0,0 +1,310 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+static std::string inspect_sass(const void* cubin, size_t cubin_size)
+{
+  namespace fs = std::filesystem;
+
+  fs::path temp_dir = fs::temp_directory_path();
+
+  fs::path temp_in_filename = temp_dir / "temp_in_file.cubin";
+  fs::path temp_out_filename = temp_dir / "temp_out_file.sass";
+
+  std::ofstream temp_in_file(temp_in_filename, std::ios::binary);
+  if (!temp_in_file)
+  {
+    throw std::runtime_error("Failed to create temporary file.");
+  }
+
+  temp_in_file.write(static_cast<const char*>(cubin), cubin_size);
+  temp_in_file.close();
+
+  std::string command = "nvdisasm -gi ";
+  command += temp_in_filename;
+  command += " > ";
+  command += temp_out_filename;
+
+  if (std::system(command.c_str()) != 0)
+  {
+    throw std::runtime_error("Failed to execute command.");
+  }
+
+  if (!fs::remove(temp_in_filename))
+  {
+    throw std::runtime_error("Failed to remove temporary file.");
+  }
+
+  std::ifstream temp_out_file(temp_out_filename, std::ios::binary);
+  if (!temp_out_file)
+  {
+    throw std::runtime_error("Failed to open temporary file.");
+  }
+
+  const std::string sass{std::istreambuf_iterator<char>(temp_out_file), std::istreambuf_iterator<char>()};
+  if (!fs::remove(temp_out_filename))
+  {
+    throw std::runtime_error("Failed to remove temporary file.");
+  }
+
+  return sass;
+}
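+// Usage sketch: test_reduce.cpp feeds the returned SASS to assertions that the
+// JIT-compiled kernels do not spill to local memory, e.g.:
+//   const std::string sass = inspect_sass(build.cubin, build.cubin_size);
+//   REQUIRE(sass.find("LDL") == std::string::npos); // no local loads
+//   REQUIRE(sass.find("STL") == std::string::npos); // no local stores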
+
+static std::string compile(const std::string& source)
+{
+  // compile source to LTO-IR using nvrtc
+
+  nvrtcProgram prog;
+  REQUIRE(NVRTC_SUCCESS == nvrtcCreateProgram(&prog, source.c_str(), "op.cu", 0, nullptr, nullptr));
+
+  const char* options[] = {"--std=c++17", "-rdc=true", "-dlto"};
+  REQUIRE(NVRTC_SUCCESS == nvrtcCompileProgram(prog, 3, options));
+
+  std::size_t ltoir_size{};
+  REQUIRE(NVRTC_SUCCESS == nvrtcGetLTOIRSize(prog, &ltoir_size));
+
+  std::unique_ptr<char[]> ltoir(new char[ltoir_size]);
+
+  REQUIRE(NVRTC_SUCCESS == nvrtcGetLTOIR(prog, ltoir.get()));
+  REQUIRE(NVRTC_SUCCESS == nvrtcDestroyProgram(&prog));
+
+  return std::string(ltoir.get(), ltoir_size);
+}
+
+template <class T>
+std::vector<T> generate(std::size_t num_items)
+{
+  std::random_device rnd_device;
+  std::mt19937 mersenne_engine{rnd_device()}; // Generates random integers
+  std::uniform_int_distribution<T> dist{T{1}, T{42}};
+  std::vector<T> vec(num_items);
+  std::generate(vec.begin(), vec.end(), [&]() {
+    return dist(mersenne_engine);
+  });
+  return vec;
+}
+
+template <class T>
+cccl_type_info get_type_info()
+{
+  cccl_type_info info;
+  info.size = sizeof(T);
+  info.alignment = alignof(T);
+
+  if constexpr (std::is_same_v<T, int8_t>)
+  {
+    info.type = cccl_type_enum::INT8;
+  }
+  else if constexpr (std::is_same_v<T, int32_t>)
+  {
+    info.type = cccl_type_enum::INT32;
+  }
+  else if constexpr (std::is_same_v<T, uint32_t>)
+  {
+    info.type = cccl_type_enum::UINT32;
+  }
+  else if constexpr (std::is_same_v<T, int64_t>)
+  {
+    info.type = cccl_type_enum::INT64;
+  }
+  else if constexpr (std::is_same_v<T, uint64_t>)
+  {
+    info.type = cccl_type_enum::UINT64;
+  }
+  else if constexpr (!std::is_integral_v<T>)
+  {
+    info.type = cccl_type_enum::STORAGE;
+  }
+  else
+  {
+    static_assert(sizeof(T) == 0, "Unsupported type");
+  }
+
+  return info;
+}
+
+static std::string get_op(cccl_type_enum t)
+{
+  switch (t)
+  {
+    case cccl_type_enum::INT8:
+      return "extern \"C\" __device__ char op(char a, char b) { return a + b; }";
+    case cccl_type_enum::INT32:
+      return "extern \"C\" __device__ int op(int a, int b) { return a + b; }";
+    case cccl_type_enum::UINT32:
+      return "extern \"C\" __device__ unsigned int op(unsigned int a, unsigned int b) { return a + b; }";
+    case cccl_type_enum::INT64:
+      return "extern \"C\" __device__ long long op(long long a, long long b) { return a + b; }";
+    case cccl_type_enum::UINT64:
+      return "extern \"C\" __device__ unsigned long long op(unsigned long long a, unsigned long long b) { "
+             "  return a + b; "
+             "}";
+    default:
+      throw std::runtime_error("Unsupported type");
+  }
+  return "";
+}
+
+template <class T>
+struct pointer_t
+{
+  T* ptr{};
+
+  pointer_t(int num_items)
+  {
+    REQUIRE(cudaSuccess == cudaMalloc(&ptr, num_items * sizeof(T)));
+  }
+
+  pointer_t(const std::vector<T>& vec)
+  {
+    REQUIRE(cudaSuccess == cudaMalloc(&ptr, vec.size() * sizeof(T)));
+    REQUIRE(cudaSuccess == cudaMemcpy(ptr, vec.data(), vec.size() * sizeof(T), cudaMemcpyHostToDevice));
+  }
+
+  ~pointer_t()
+  {
+    if (ptr)
+    {
+      REQUIRE(cudaSuccess == cudaFree(ptr));
+      ptr = nullptr;
+    }
+  }
+
+  T operator[](int i) const
+  {
+    T value{};
+    REQUIRE(cudaSuccess == cudaMemcpy(&value, ptr + i, sizeof(T), cudaMemcpyDeviceToHost));
+    return value;
+  }
+
+  operator cccl_iterator_t()
+  {
+    cccl_iterator_t it;
+    it.size = sizeof(T);
+    it.alignment = alignof(T);
+    it.type = cccl_iterator_kind_t::pointer;
+    it.state = ptr;
+    it.value_type = get_type_info<T>();
+    return it;
+  }
+};
+
+struct operation_t
+{
+  std::string name;
+  std::string code;
+
+  operator cccl_op_t()
+  {
+    cccl_op_t op;
+    op.type = cccl_op_kind_t::stateless;
+    op.name = name.c_str();
+    op.ltoir = code.c_str();
+    op.ltoir_size = code.size();
+    return op;
+  }
+};
+
+template <class OpT>
+struct stateful_operation_t
+{
+  OpT op_state;
+  std::string name;
+  std::string code;
+
+  operator cccl_op_t()
+  {
+    cccl_op_t op;
+    op.type = cccl_op_kind_t::stateful;
+    op.size = sizeof(OpT);
+    op.alignment = alignof(OpT);
+    op.state = &op_state;
+    op.name = name.c_str();
+    op.ltoir = code.c_str();
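+    // note: cccl_op_t borrows the name/ltoir storage from this object, so the
+    // operation_t/stateful_operation_t instance must outlive every use of op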
+ op.ltoir_size = code.size(); + return op; + } +}; + +static operation_t make_operation(std::string name, std::string code) +{ + return operation_t{name, compile(code)}; +} + +template +static stateful_operation_t make_operation(std::string name, std::string code, OpT op) +{ + return {op, name, compile(code)}; +} + +template +struct iterator_t +{ + StateT state; + operation_t advance; + operation_t dereference; + + operator cccl_iterator_t() + { + cccl_iterator_t it; + it.size = sizeof(StateT); + it.alignment = alignof(StateT); + it.type = cccl_iterator_kind_t::iterator; + it.advance = advance; + it.dereference = dereference; + it.value_type = get_type_info(); + it.state = &state; + return it; + } +}; + +template +iterator_t make_iterator(std::string state, operation_t advance, operation_t dereference) +{ + iterator_t it; + it.advance = make_operation(advance.name, state + advance.code); + it.dereference = make_operation(dereference.name, state + dereference.code); + return it; +} + +template +struct value_t +{ + T value; + + value_t(T value) + : value(value) + {} + + operator cccl_value_t() + { + cccl_value_t v; + v.type = get_type_info(); + v.state = &value; + return v; + } +}; diff --git a/c/test/test_main.cpp b/c/test/test_main.cpp new file mode 100644 index 0000000000..3e3b4900a5 --- /dev/null +++ b/c/test/test_main.cpp @@ -0,0 +1,55 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +#include + +#define CATCH_CONFIG_RUNNER +#include + +int device_guard(int device_id) +{ + int device_count{}; + if (cudaGetDeviceCount(&device_count) != cudaSuccess) + { + std::cerr << "Can't query devices number." << std::endl; + std::exit(-1); + } + + if (device_id >= device_count || device_id < 0) + { + std::cerr << "Invalid device ID: " << device_id << std::endl; + std::exit(-1); + } + + return device_id; +} + +int main(int argc, char* argv[]) +{ + Catch::Session session; + + int device_id{}; + + // Build a new parser on top of Catch's + using namespace Catch::clara; + auto cli = session.cli() | Opt(device_id, "device")["-d"]["--device"]("device id to use"); + session.cli(cli); + + int returnCode = session.applyCommandLine(argc, argv); + if (returnCode != 0) + { + return returnCode; + } + + cudaSetDevice(device_guard(device_id)); + return session.run(argc, argv); +} diff --git a/c/test/test_reduce.cpp b/c/test/test_reduce.cpp new file mode 100644 index 0000000000..1a4607702a --- /dev/null +++ b/c/test/test_reduce.cpp @@ -0,0 +1,285 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include + +#include "c2h.h" + +void reduce(cccl_iterator_t input, cccl_iterator_t output, unsigned long long num_items, cccl_op_t op, cccl_value_t init) +{ + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + + const int cc_major = deviceProp.major; + const int cc_minor = deviceProp.minor; + + const char* cub_path = TEST_CUB_PATH; + const char* thrust_path = TEST_THRUST_PATH; + const char* libcudacxx_path = TEST_LIBCUDACXX_PATH; + const char* ctk_path = TEST_CTK_PATH; + + cccl_device_reduce_build_result_t build; + REQUIRE(CUDA_SUCCESS + == cccl_device_reduce_build( + &build, input, output, op, init, cc_major, cc_minor, cub_path, thrust_path, libcudacxx_path, ctk_path)); + + const std::string sass = inspect_sass(build.cubin, build.cubin_size); + REQUIRE(sass.find("LDL") == std::string::npos); + REQUIRE(sass.find("STL") == std::string::npos); + + size_t temp_storage_bytes = 0; + REQUIRE( + CUDA_SUCCESS == cccl_device_reduce(build, nullptr, &temp_storage_bytes, input, output, num_items, op, init, 0)); + + pointer_t temp_storage(temp_storage_bytes); + + REQUIRE(CUDA_SUCCESS + == cccl_device_reduce(build, temp_storage.ptr, &temp_storage_bytes, input, output, num_items, op, init, 0)); + REQUIRE(CUDA_SUCCESS == cccl_device_reduce_cleanup(&build)); +} + +using integral_types = std::tuple; +TEMPLATE_LIST_TEST_CASE("Reduce works with integral types", "[reduce]", integral_types) +{ + const int num_items = GENERATE(0, 42, take(4, random(1 << 12, 1 << 24))); + operation_t op = make_operation("op", get_op(get_type_info().type)); + const std::vector input = generate(num_items); + pointer_t input_ptr(input); + pointer_t output_ptr(1); + value_t init{TestType{42}}; + + reduce(input_ptr, output_ptr, num_items, op, init); + + const TestType output = output_ptr[0]; + const TestType expected = std::accumulate(input.begin(), input.end(), init.value); + REQUIRE(output == expected); +} + +struct pair +{ + short a; + size_t b; +}; + +TEST_CASE("Reduce works with custom types", "[reduce]") +{ + const int num_items = GENERATE(0, 42, take(4, random(1 << 12, 1 << 24))); + + operation_t op = make_operation( + "op", + "struct pair { short a; size_t b; };\n" + "extern \"C\" __device__ pair op(pair lhs, pair rhs) {\n" + " return pair{ lhs.a + rhs.a, lhs.b + rhs.b };\n" + "}"); + const std::vector a = generate(num_items); + const std::vector b = generate(num_items); + std::vector input(num_items); + for (std::size_t i = 0; i < num_items; ++i) + { + input[i] = pair{a[i], b[i]}; + } + pointer_t input_ptr(input); + pointer_t output_ptr(1); + value_t init{pair{4, 2}}; + + reduce(input_ptr, output_ptr, num_items, op, init); + + const pair output = output_ptr[0]; + const pair expected = std::accumulate(input.begin(), input.end(), init.value, [](const pair& lhs, const pair& rhs) { + return pair{short(lhs.a + rhs.a), lhs.b + rhs.b}; + }); + REQUIRE(output.a == expected.a); + REQUIRE(output.b == expected.b); +} + +struct counting_iterator_state_t +{ + int value; +}; + +TEST_CASE("Reduce works with input iterators", "[reduce]") +{ + const std::size_t num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16))); + operation_t op = make_operation("op", get_op(get_type_info().type)); + iterator_t input_it = make_iterator( + "struct counting_iterator_state_t { int value; };\n", + {"advance", + "extern \"C\" __device__ void advance(counting_iterator_state_t* state, unsigned long long offset) {\n" + " state->value += offset;\n" 
+ "}"}, + {"dereference", + "extern \"C\" __device__ int dereference(counting_iterator_state_t* state) { \n" + " return state->value;\n" + "}"}); + input_it.state.value = 0; + pointer_t output_it(1); + value_t init{42}; + + reduce(input_it, output_it, num_items, op, init); + + const int output = output_it[0]; + const int expected = init.value + num_items * (num_items - 1) / 2; + REQUIRE(output == expected); +} + +struct transform_output_iterator_state_t +{ + int* d_output; +}; + +TEST_CASE("Reduce works with output iterators", "[reduce]") +{ + const int num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16))); + operation_t op = make_operation("op", get_op(get_type_info().type)); + iterator_t output_it = make_iterator( + "struct transform_output_iterator_state_t { int* d_output; };\n", + {"advance", + "extern \"C\" __device__ void advance(transform_output_iterator_state_t* state, unsigned long long offset) {\n" + " state->d_output += offset;\n" + "}"}, + {"dereference", + "extern \"C\" __device__ void dereference(transform_output_iterator_state_t* state, int x) { \n" + " *state->d_output = 2 * x;\n" + "}"}); + const std::vector input = generate(num_items); + pointer_t input_it(input); + pointer_t inner_output_it(1); + output_it.state.d_output = inner_output_it.ptr; + value_t init{42}; + + reduce(input_it, output_it, num_items, op, init); + + const int output = inner_output_it[0]; + const int expected = std::accumulate(input.begin(), input.end(), init.value); + REQUIRE(output == expected * 2); +} + +template +struct constant_iterator_state_t +{ + T value; +}; + +TEST_CASE("Reduce works with input and output iterators", "[reduce]") +{ + const int num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16))); + operation_t op = make_operation("op", get_op(get_type_info().type)); + iterator_t> input_it = make_iterator>( + "struct constant_iterator_state_t { int value; };\n", + {"in_advance", + "extern \"C\" __device__ void in_advance(constant_iterator_state_t*, unsigned long long) {\n" + "}"}, + {"in_dereference", + "extern \"C\" __device__ int in_dereference(constant_iterator_state_t* state) { \n" + " return state->value;\n" + "}"}); + input_it.state.value = 1; + iterator_t output_it = make_iterator( + "struct transform_output_iterator_state_t { int* d_output; };\n", + {"out_advance", + "extern \"C\" __device__ void out_advance(transform_output_iterator_state_t* state, unsigned long long offset) {\n" + " state->d_output += offset;\n" + "}"}, + {"out_dereference", + "extern \"C\" __device__ void out_dereference(transform_output_iterator_state_t* state, int x) { \n" + " *state->d_output = 2 * x;\n" + "}"}); + pointer_t inner_output_it(1); + output_it.state.d_output = inner_output_it.ptr; + value_t init{42}; + + reduce(input_it, output_it, num_items, op, init); + + const int output = inner_output_it[0]; + const int expected = 2 * (init.value + num_items); + REQUIRE(output == expected); +} + +TEST_CASE("Reduce accumulator type is influenced by initial value", "[reduce]") +{ + const int num_items = 1 << 14; // 16384 > 128 + operation_t op = make_operation("op", get_op(get_type_info().type)); + iterator_t> input_it = make_iterator>( + "struct constant_iterator_state_t { char value; };\n", + {"in_advance", + "extern \"C\" __device__ void in_advance(constant_iterator_state_t*, unsigned long long) {\n" + "}"}, + {"in_dereference", + "extern \"C\" __device__ char in_dereference(constant_iterator_state_t* state) { \n" + " return state->value;\n" + "}"}); + input_it.state.value = 1; + pointer_t 
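+  // the init value is a wider type than the char inputs, so the accumulator
+  // (and this output, presumably size_t) does not wrap at CHAR_MAX even
+  // though 16384 ones are summed (cf. "16384 > 128" above)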
output_it(1); + value_t init{42}; + + reduce(input_it, output_it, num_items, op, init); + + const size_t output = output_it[0]; + const int expected = init.value + num_items; + REQUIRE(output == expected); +} + +TEST_CASE("Reduce works with large inputs", "[reduce]") +{ + const size_t num_items = 1ull << 33; + operation_t op = make_operation("op", get_op(get_type_info().type)); + iterator_t> input_it = make_iterator>( + "struct constant_iterator_state_t { char value; };\n", + {"in_advance", + "extern \"C\" __device__ void in_advance(constant_iterator_state_t*, unsigned long long) {\n" + "}"}, + {"in_dereference", + "extern \"C\" __device__ char in_dereference(constant_iterator_state_t* state) { \n" + " return state->value;\n" + "}"}); + input_it.state.value = 1; + pointer_t output_it(1); + value_t init{42}; + + reduce(input_it, output_it, num_items, op, init); + + const size_t output = output_it[0]; + const size_t expected = init.value + num_items; + REQUIRE(output == expected); +} + +struct invocation_counter_state_t +{ + int* d_counter; +}; + +TEST_CASE("Reduce works with stateful operators", "[reduce]") +{ + const int num_items = 1 << 12; + pointer_t counter(1); + stateful_operation_t op = make_operation( + "op", + "struct invocation_counter_state_t { int* d_counter; };\n" + "extern \"C\" __device__ int op(invocation_counter_state_t *state, int a, int b) {\n" + " atomicAdd(state->d_counter, 1);\n" + " return a + b;\n" + "}", + invocation_counter_state_t{counter.ptr}); + + const std::vector input = generate(num_items); + pointer_t input_ptr(input); + pointer_t output_ptr(1); + value_t init{42}; + + reduce(input_ptr, output_ptr, num_items, op, init); + + const int invocation_count = counter[0]; + const int expected_invocation_count = num_items - 1; + REQUIRE(invocation_count > expected_invocation_count); + + const int output = output_ptr[0]; + const int expected = std::accumulate(input.begin(), input.end(), init.value); + REQUIRE(output == expected); +} diff --git a/ci/build_common.sh b/ci/build_common.sh index e0bfc05c3b..1f5655d671 100755 --- a/ci/build_common.sh +++ b/ci/build_common.sh @@ -13,6 +13,7 @@ CUDA_COMPILER=${CUDACXX:-nvcc} # $CUDACXX if set, otherwise `nvcc` CUDA_ARCHS= # Empty, use presets by default. GLOBAL_CMAKE_OPTIONS=() DISABLE_CUB_BENCHMARKS= # Enable to force-disable building CUB benchmarks. +CONFIGURE_ONLY=false # Check if the correct number of arguments has been provided function usage { @@ -21,7 +22,8 @@ function usage { echo "The PARALLEL_LEVEL environment variable controls the amount of build parallelism. Default is the number of cores." echo echo "Options:" - echo " -v/--verbose: enable shell echo for debugging" + echo " -v/-verbose: enable shell echo for debugging" + echo " -configure: Only run cmake to configure, do not build or test." 
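+  # With -configure, configure_preset prints a summary and exits right after
+  # cmake, and build_preset/test_preset below return without doing anything.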
echo " -cuda: CUDA compiler (Defaults to \$CUDACXX if set, otherwise nvcc)" echo " -cxx: Host compiler (Defaults to \$CXX if set, otherwise g++)" echo " -std: CUDA/C++ standard (Defaults to 17)" @@ -32,6 +34,7 @@ function usage { echo " $ PARALLEL_LEVEL=8 $0" echo " $ PARALLEL_LEVEL=8 $0 -cxx g++-9" echo " $ $0 -cxx clang++-8" + echo " $ $0 -configure -arch=80" echo " $ $0 -cxx g++-8 -std 14 -arch 80-real -v -cuda /usr/local/bin/nvcc" echo " $ $0 -cmake-options \"-DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_FLAGS=-Wfatal-errors\"" exit 1 @@ -44,7 +47,8 @@ function usage { args=("$@") while [ "${#args[@]}" -ne 0 ]; do case "${args[0]}" in - -v | --verbose) VERBOSE=1; args=("${args[@]:1}");; + -v | --verbose | -verbose) VERBOSE=1; args=("${args[@]:1}");; + -configure) CONFIGURE_ONLY=true; args=("${args[@]:1}");; -cxx) HOST_COMPILER="${args[1]}"; args=("${args[@]:2}");; -std) CXX_STANDARD="${args[1]}"; args=("${args[@]:2}");; -cuda) CUDA_COMPILER="${args[1]}"; args=("${args[@]:2}");; @@ -186,6 +190,16 @@ function configure_preset() run_command "$GROUP_NAME" cmake --preset=$PRESET --log-level=VERBOSE $CMAKE_OPTIONS "${GLOBAL_CMAKE_OPTIONS[@]}" status=$? popd > /dev/null + + if $CONFIGURE_ONLY; then + echo "${BUILD_NAME} configuration complete:" + echo " Exit code: ${status}" + echo " CMake Preset: ${PRESET}" + echo " CMake Options: ${CMAKE_OPTIONS}" + echo " Build Directory: ${BUILD_DIR}/${PRESET}" + exit $status + fi + return $status } @@ -196,6 +210,10 @@ function build_preset() { local red="1;31" local GROUP_NAME="🏗️ Build ${BUILD_NAME}" + if $CONFIGURE_ONLY; then + return 0 + fi + local preset_dir="${BUILD_DIR}/${PRESET}" local sccache_json="${preset_dir}/sccache_stats.json" @@ -239,6 +257,10 @@ function test_preset() local PRESET=$2 local GPU_REQUIRED=${3:-true} + if $CONFIGURE_ONLY; then + return 0 + fi + if $GPU_REQUIRED; then fail_if_no_gpu fi @@ -265,5 +287,8 @@ function configure_and_build_preset() local CMAKE_OPTIONS=$3 configure_preset "$BUILD_NAME" "$PRESET" "$CMAKE_OPTIONS" - build_preset "$BUILD_NAME" "$PRESET" + + if ! 
$CONFIGURE_ONLY; then + build_preset "$BUILD_NAME" "$PRESET" + fi } diff --git a/ci/build_cub.sh b/ci/build_cub.sh index 73236170a6..ce658bf66c 100755 --- a/ci/build_cub.sh +++ b/ci/build_cub.sh @@ -1,5 +1,7 @@ #!/bin/bash +set -euo pipefail + source "$(dirname "$0")/build_common.sh" print_environment_details diff --git a/ci/build_cudax.sh b/ci/build_cudax.sh index 657372b191..2dff254972 100755 --- a/ci/build_cudax.sh +++ b/ci/build_cudax.sh @@ -1,5 +1,7 @@ #!/bin/bash +set -euo pipefail + source "$(dirname "$0")/build_common.sh" print_environment_details diff --git a/ci/build_libcudacxx.sh b/ci/build_libcudacxx.sh index 1dc26f3228..1f6925b0fa 100755 --- a/ci/build_libcudacxx.sh +++ b/ci/build_libcudacxx.sh @@ -1,5 +1,7 @@ #!/bin/bash +set -euo pipefail + source "$(dirname "$0")/build_common.sh" print_environment_details diff --git a/ci/build_thrust.sh b/ci/build_thrust.sh index 6e4a82da0f..6c9281c9c4 100755 --- a/ci/build_thrust.sh +++ b/ci/build_thrust.sh @@ -1,5 +1,7 @@ #!/bin/bash +set -euo pipefail + source "$(dirname "$0")/build_common.sh" print_environment_details diff --git a/ci/inspect_changes.sh b/ci/inspect_changes.sh index 22e9cf492d..72c37ba9c5 100755 --- a/ci/inspect_changes.sh +++ b/ci/inspect_changes.sh @@ -27,6 +27,7 @@ subprojects=( thrust cudax pycuda + c ) # ...and their dependencies: @@ -36,7 +37,8 @@ declare -A dependencies=( [cub]="cccl libcudacxx thrust" [thrust]="cccl libcudacxx cub" [cudax]="cccl libcudacxx" - [pycuda]="cccl libcudacxx cub thrust cudax" + [pycuda]="cccl libcudacxx cub thrust c" + [c]="cccl libcudacxx cub" ) declare -A project_names=( @@ -46,13 +48,14 @@ declare -A project_names=( [thrust]="Thrust" [cudax]="CUDA Experimental" [pycuda]="pycuda" + [c]="CUDA C Core Library " ) # By default, the project directory is assumed to be the same as the subproject name, # but can be overridden here. The `cccl` project is special, and checks for files outside # of any subproject directory. 
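# For example, a change under libcudacxx/ marks cub, thrust, cudax, pycuda,
# and c as dirty, since each of them lists libcudacxx as a dependency above.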
declare -A project_dirs=( - [pycuda]="python/cuda" + [pycuda]="python/cuda_cooperative" ) # Usage checks: diff --git a/ci/matrix.yaml b/ci/matrix.yaml index fa2f5d9218..e3102f8487 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -15,9 +15,9 @@ workflows: - {jobs: ['build'], std: 'all', ctk: '11.8', cxx: ['gcc11'], sm: '60;70;80;90'} # Current CTK - {jobs: ['build'], std: 'all', cxx: ['gcc7', 'gcc8', 'gcc9', 'gcc10', 'gcc11', 'gcc12']} - - {jobs: ['build'], std: 'all', cxx: ['clang9', 'clang10', 'clang11', 'clang12', 'clang13', 'clang14', 'clang15', 'clang16']} + - {jobs: ['build'], std: 'all', cxx: ['clang9', 'clang10', 'clang11', 'clang12', 'clang13', 'clang14', 'clang15', 'clang16', 'clang17']} - {jobs: ['build'], std: 'all', cxx: ['intel', 'msvc2019']} - - {jobs: ['test'], std: 'all', cxx: ['gcc13', 'clang17', 'msvc2022']} + - {jobs: ['test'], std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022']} # Modded builds: - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'} - {jobs: ['build'], std: 'all', cxx: ['gcc'], sm: '90a'} @@ -30,61 +30,39 @@ workflows: # cudax has different CTK reqs: - {jobs: ['build'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc9', 'gcc10', 'gcc11']} - {jobs: ['build'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['clang9', 'clang10', 'clang11', 'clang12', 'clang13']} - - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang14', 'clang15']} + - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang14', 'clang15', 'clang16', 'clang17']} - {jobs: ['build'], project: 'cudax', ctk: ['12.0', ], std: 20, cxx: ['msvc14.36']} - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 20, cxx: ['msvc2022']} - {jobs: ['build'], project: 'cudax', ctk: ['12.0' ], std: 17, cxx: ['gcc12'], sm: "90"} - - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 17, cxx: ['gcc12'], sm: "90a"} - - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc12', 'clang16'], cpu: 'arm64'} - - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 17, cxx: ['intel']} + - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 17, cxx: ['gcc13'], sm: "90a"} + - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc13', 'clang16'], cpu: 'arm64'} - {jobs: ['test'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc12']} - {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'all', cxx: ['clang14']} - - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang16']} + - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang18']} # Python jobs: - - {jobs: ['test'], project: 'pycuda'} + - {jobs: ['test'], project: 'pycuda', ctk: ['12.5']} # cccl-infra: - {jobs: ['infra'], project: 'cccl', ctk: '11.1', cxx: ['gcc6', 'clang9']} - - {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc', 'clang']} + - {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc', 'clang']} + # Edge-case jobs + - {jobs: ['limited'], project: 'cub', std: 17} nightly: - # libcudacxx build fails, CUB tests fail: - - {jobs: ['build'], ctk: '11.1', gpu: 'v100', sm: 'gpu', cxx: 'gcc6', std: [11], project: ['cub']} - - {jobs: ['test'], ctk: '11.1', gpu: 'v100', sm: 'gpu', cxx: 'gcc6', std: [11], project: ['thrust']} - # - {jobs: ['test'], ctk: '11.1', gpu: 'v100', sm: 'gpu', cxx: 'gcc6', std: [11] } + - {jobs: ['test'], ctk: '11.1', gpu: 'v100', sm: 'gpu', cxx: 'gcc6', std: [11]} + - {jobs: ['test'], ctk: '11.1', gpu: 't4', sm: 'gpu', 
cxx: 'clang9', std: [17]} + - {jobs: ['test'], ctk: '11.8', gpu: 'rtx2080', sm: 'gpu', cxx: 'gcc11', std: [17]} + - {jobs: ['test'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7', std: [14]} + - {jobs: ['test'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc13', std: 'all'} + - {jobs: ['test'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang9', std: [11]} + # H100 runners are currently flakey, only build since those use CPU-only runners: + - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc12', std: [11, 20]} + - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'clang18', std: [17]} - # libcudacxx build fails, CUB tests fail: - - {jobs: ['build'], ctk: '11.1', gpu: 't4', sm: 'gpu', cxx: 'clang9', std: [17], project: ['cub']} - - {jobs: ['test'], ctk: '11.1', gpu: 't4', sm: 'gpu', cxx: 'clang9', std: [17], project: ['thrust']} - # - {jobs: ['test'], ctk: '11.1', gpu: 't4', sm: 'gpu', cxx: 'clang9', std: [17] } - - # CUB + libcudacxx tests fails: - - {jobs: ['build'], ctk: '11.8', gpu: 'rtx2080', sm: 'gpu', cxx: 'gcc11', std: [17], project: ['libcudacxx', 'cub']} - - {jobs: ['test'], ctk: '11.8', gpu: 'rtx2080', sm: 'gpu', cxx: 'gcc11', std: [17], project: ['thrust']} - # - {jobs: ['test'], ctk: '11.8', gpu: 'rtx2080', sm: 'gpu', cxx: 'gcc11', std: [17] } - - # libcudacxx tests fail: - - {jobs: ['build'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7', std: [14], project: ['libcudacxx']} - - {jobs: ['build'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc12', std: 'all', project: ['libcudacxx']} - - {jobs: ['build'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang9', std: [11], project: ['libcudacxx']} - # H100 runners are currently flakey, only build since those use CPU-only runners: - - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc12', std: [11, 20]} - - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'clang16', std: [17]} - - - {jobs: ['test'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7', std: [14], project: ['cub', 'thrust']} - - {jobs: ['test'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc12', std: 'all', project: ['cub', 'thrust']} - - {jobs: ['test'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang9', std: [11], project: ['cub', 'thrust']} - # - {jobs: ['test'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7', std: [14] } - # - {jobs: ['test'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc12', std: 'all' } - # - {jobs: ['test'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang9', std: [11] } - # - {jobs: ['test'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc12', std: [11, 20] } - # - {jobs: ['test'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'clang16', std: [17] } - - # nvrtc: - - {jobs: ['nvrtc'], ctk: 'curr', gpu: 't4', sm: 'gpu', cxx: 'gcc12', std: [20], project: ['libcudacxx']} - - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc12', std: [20], project: ['libcudacxx']} - - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc12', std: 'all', project: ['libcudacxx']} - # Fails on h100: - # - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc12', std: [11, 20], project: ['libcudacxx']} + # nvrtc: + - {jobs: ['nvrtc'], ctk: 'curr', gpu: 't4', sm: 'gpu', cxx: 'gcc13', std: [20], project: ['libcudacxx']} + - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc13', std: [20], project: ['libcudacxx']} + - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc13', std: 'all', project: ['libcudacxx']} + - {jobs: ['nvrtc'], ctk: 
'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc13', std: [11, 20], project: ['libcudacxx']} # Any generated jobs that match the entries in `exclude` will be removed from the final matrix for all workflows. exclude: @@ -108,7 +86,8 @@ ctk_versions: 11.1: { stds: [11, 14, 17, ] } 11.8: { stds: [11, 14, 17, ] } 12.0: { stds: [11, 14, 17, 20] } - 12.5: { stds: [11, 14, 17, 20], aka: 'curr' } + 12.5: { stds: [11, 14, 17, 20]} + 12.6: { stds: [11, 14, 17, 20], aka: 'curr' } device_compilers: nvcc: # Version / stds are taken from CTK @@ -146,6 +125,7 @@ host_compilers: 15: { stds: [11, 14, 17, 20] } 16: { stds: [11, 14, 17, 20] } 17: { stds: [11, 14, 17, 20] } + 18: { stds: [11, 14, 17, 20] } msvc: name: 'MSVC' container_tag: 'cl' @@ -200,6 +180,9 @@ jobs: test_lid1: { name: 'DeviceLaunch', gpu: true, needs: 'build', invoke: { prefix: 'test', args: '-lid1'} } # - captured in a CUDA graph for deferred launch (lid2): test_lid2: { name: 'GraphCapture', gpu: true, needs: 'build', invoke: { prefix: 'test', args: '-lid2'} } + # Limited build reduces the number of runtime test cases, available device memory, etc, and may be used + # to reduce test runtime in limited environments. + limited: { name: "SmallGMem", gpu: true, needs: 'build', invoke: { prefix: 'test', args: '-limited'} } # Thrust: test_cpu: { name: 'TestCPU', gpu: false, needs: 'build', invoke: { prefix: 'test', args: '-cpu-only'} } diff --git a/ci/pretty_printing.sh b/ci/pretty_printing.sh index 7e02468ee4..1f76ec3981 100644 --- a/ci/pretty_printing.sh +++ b/ci/pretty_printing.sh @@ -97,6 +97,10 @@ function print_time_summary() { fi done + if [ "$max_length" -eq 0 ]; then + return + fi + echo "Time Summary:" for group in "${!command_durations[@]}"; do printf "%-${max_length}s : %s seconds\n" "$group" "${command_durations[$group]}" diff --git a/ci/test_cub.sh b/ci/test_cub.sh index 9e036bd06f..59c97adfe2 100755 --- a/ci/test_cub.sh +++ b/ci/test_cub.sh @@ -6,10 +6,11 @@ NO_LID=false LID0=false LID1=false LID2=false +LIMITED=false ci_dir=$(dirname "$0") -new_args=$("${ci_dir}/util/extract_switches.sh" -no-lid -lid0 -lid1 -lid2 -- "$@") +new_args=$("${ci_dir}/util/extract_switches.sh" -no-lid -lid0 -lid1 -lid2 -limited -- "$@") eval set -- ${new_args} while true; do case "$1" in @@ -29,6 +30,10 @@ while true; do LID2=true shift ;; + -limited) + LIMITED=true + shift + ;; --) shift break @@ -40,6 +45,21 @@ while true; do esac done +if $LIMITED; then + + export CCCL_SEED_COUNT_OVERRIDE=1 + readonly device_mem_GiB=8 + export CCCL_DEVICE_MEMORY_LIMIT=$((${device_mem_GiB} * 1024 * 1024 * 1024)) + export CCCL_DEBUG_CHECKED_ALLOC_FAILURES=1 + + + echo "Configuring limited environment:" + echo " CCCL_SEED_COUNT_OVERRIDE=${CCCL_SEED_COUNT_OVERRIDE}" + echo " CCCL_DEVICE_MEMORY_LIMIT=${CCCL_DEVICE_MEMORY_LIMIT} (${device_mem_GiB} GiB)" + echo " CCCL_DEBUG_CHECKED_ALLOC_FAILURES=${CCCL_DEBUG_CHECKED_ALLOC_FAILURES}" + echo +fi + source "${ci_dir}/build_common.sh" print_environment_details diff --git a/ci/test_pycuda.sh b/ci/test_pycuda.sh index 742b22ec2a..bd66cc5771 100755 --- a/ci/test_pycuda.sh +++ b/ci/test_pycuda.sh @@ -11,10 +11,21 @@ fail_if_no_gpu readonly prefix="${BUILD_DIR}/python/" export PYTHONPATH="${prefix}:${PYTHONPATH:-}" -pushd ../python/cuda >/dev/null +pushd ../python/cuda_cooperative >/dev/null -run_command "⚙️ Pip install cuda" pip install --force-reinstall --target "${prefix}" .[test] -run_command "🚀 Pytest cuda" python -m pytest -v ./tests +run_command "⚙️ Pip install cuda_cooperative" pip install --force-reinstall --upgrade --target 
"${prefix}" .[test] +run_command "🚀 Pytest cuda_cooperative" python -m pytest -v ./tests + +popd >/dev/null + +pushd ../python/cuda_parallel >/dev/null + +# Temporarily install the package twice to populate include directory as part of the first installation +# and to let manifest discover these includes during the second installation. Do not forget to remove the +# second installation after https://github.com/NVIDIA/cccl/issues/2281 is addressed. +run_command "⚙️ Pip install cuda_parallel once" pip install --force-reinstall --upgrade --target "${prefix}" .[test] +run_command "⚙️ Pip install cuda_parallel twice" pip install --force-reinstall --upgrade --target "${prefix}" .[test] +run_command "🚀 Pytest cuda_parallel" python -m pytest -v ./tests popd >/dev/null diff --git a/ci/update_version.sh b/ci/update_version.sh index febf963e17..9184b98e6a 100755 --- a/ci/update_version.sh +++ b/ci/update_version.sh @@ -36,7 +36,8 @@ CUB_CMAKE_VERSION_FILE="cub/cub/cmake/cub-config-version.cmake" LIBCUDACXX_CMAKE_VERSION_FILE="libcudacxx/lib/cmake/libcudacxx/libcudacxx-config-version.cmake" THRUST_CMAKE_VERSION_FILE="thrust/thrust/cmake/thrust-config-version.cmake" CUDAX_CMAKE_VERSION_FILE="cudax/lib/cmake/cudax/cudax-config-version.cmake" -PYCUDA_VERSION_FILE="python/cuda/cuda/cooperative/_version.py" +CUDA_COOPERATIVE_VERSION_FILE="python/cuda_cooperative/cuda/cooperative/_version.py" +CUDA_PARALLEL_VERSION_FILE="python/cuda_parallel/cuda/parallel/_version.py" # Calculated version codes new_cccl_version=$((major * 1000000 + minor * 1000 + patch)) # MMMmmmppp @@ -102,7 +103,8 @@ update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_MAJOR \([0-9]\+\))" " update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_MINOR \([0-9]\+\))" "set(cudax_VERSION_MINOR $minor)" update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_PATCH \([0-9]\+\))" "set(cudax_VERSION_PATCH $patch)" -update_file "$PYCUDA_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$pymajor.$pyminor.$major.$minor.$patch\"" +update_file "$CUDA_COOPERATIVE_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$pymajor.$pyminor.$major.$minor.$patch\"" +update_file "$CUDA_PARALLEL_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$pymajor.$pyminor.$major.$minor.$patch\"" if [ "$DRY_RUN" = true ]; then echo "Dry run completed. No changes made." 
diff --git a/cub/benchmarks/CMakeLists.txt b/cub/benchmarks/CMakeLists.txt index 1c3102d0d7..3932fffea6 100644 --- a/cub/benchmarks/CMakeLists.txt +++ b/cub/benchmarks/CMakeLists.txt @@ -106,7 +106,7 @@ function(add_bench_dir bench_dir) add_bench(bench_target ${tuning_name} "${bench_src}") # for convenience, make tuning variant buildable by default file(WRITE "${tuning_path}" "#pragma once\n#define TUNE_BASE 1\n") - target_compile_options(${bench_target} PRIVATE "--extended-lambda -include${tuning_path}") + target_compile_options(${bench_target} PRIVATE "--extended-lambda" "-include${tuning_path}") else() # benchmarking register_cccl_benchmark("${bench_name}" "") diff --git a/cub/benchmarks/bench/radix_sort/keys.cu b/cub/benchmarks/bench/radix_sort/keys.cu index f3b7ba3867..b6b9e4fd53 100644 --- a/cub/benchmarks/bench/radix_sort/keys.cu +++ b/cub/benchmarks/bench/radix_sort/keys.cu @@ -26,6 +26,7 @@ ******************************************************************************/ #include +#include #include @@ -123,7 +124,7 @@ constexpr std::size_t max_temp_storage_size() template constexpr bool fits_in_default_shared_memory() { - return max_temp_storage_size() < 48 * 1024; + return max_temp_storage_size() < cub::detail::max_smem_per_block; } #else // TUNE_BASE template diff --git a/cub/benchmarks/bench/radix_sort/pairs.cu b/cub/benchmarks/bench/radix_sort/pairs.cu index 2729ce1b62..4a9f229bca 100644 --- a/cub/benchmarks/bench/radix_sort/pairs.cu +++ b/cub/benchmarks/bench/radix_sort/pairs.cu @@ -26,6 +26,7 @@ ******************************************************************************/ #include +#include #include @@ -121,7 +122,7 @@ constexpr std::size_t max_temp_storage_size() template constexpr bool fits_in_default_shared_memory() { - return max_temp_storage_size() < 48 * 1024; + return max_temp_storage_size() < cub::detail::max_smem_per_block; } #else // TUNE_BASE template diff --git a/cub/benchmarks/bench/reduce/min.cu b/cub/benchmarks/bench/reduce/min.cu new file mode 100644 index 0000000000..a6c149ffdd --- /dev/null +++ b/cub/benchmarks/bench/reduce/min.cu @@ -0,0 +1,37 @@ +/****************************************************************************** + * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ +// NOTE: this benchmark is intented to cover DPX instructions on Hopper+ architectures. +// It specifically uses cub::Min instead of a user-defined operator. +#define TUNE_T int16_t +#include + +// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 +// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 +// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1 + +using op_t = cub::Min; +#include "base.cuh" diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh index 65b760fba2..e3cd7a7be8 100644 --- a/cub/benchmarks/bench/scan/exclusive/base.cuh +++ b/cub/benchmarks/bench/scan/exclusive/base.cuh @@ -27,6 +27,8 @@ #include +#include + #include #if !TUNE_BASE @@ -85,7 +87,7 @@ template static void basic(nvbench::state& state, nvbench::type_list) { using init_t = cub::detail::InputValue; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; using input_it_t = const T*; using output_it_t = T*; using offset_t = OffsetT; @@ -129,7 +131,7 @@ static void basic(nvbench::state& state, nvbench::type_list) }); } -using some_offset_types = nvbench::type_list; +using some_offset_types = nvbench::type_list; NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types, some_offset_types)) .set_name("base") diff --git a/cub/benchmarks/bench/scan/exclusive/by_key.cu b/cub/benchmarks/bench/scan/exclusive/by_key.cu index 3830ad7764..26676d66c2 100644 --- a/cub/benchmarks/bench/scan/exclusive/by_key.cu +++ b/cub/benchmarks/bench/scan/exclusive/by_key.cu @@ -77,7 +77,7 @@ static void scan(nvbench::state& state, nvbench::type_list; + using accum_t = ::cuda::std::__accumulator_t; using key_input_it_t = const KeyT*; using val_input_it_t = const ValueT*; using val_output_it_t = ValueT*; diff --git a/cub/benchmarks/bench/select/unique.cu b/cub/benchmarks/bench/select/unique.cu new file mode 100644 index 0000000000..02d2bc2ced --- /dev/null +++ b/cub/benchmarks/bench/select/unique.cu @@ -0,0 +1,150 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause + +#include + +#include + +#include +#include + +// %RANGE% TUNE_TRANSPOSE trp 0:1:1 +// %RANGE% TUNE_LOAD ld 0:1:1 +// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 +// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 +// %RANGE% TUNE_MAGIC_NS ns 0:2048:4 +// %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 +// %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 + +constexpr bool keep_rejects = false; + +#if !TUNE_BASE +# if TUNE_TRANSPOSE == 0 +# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT +# else // TUNE_TRANSPOSE == 1 +# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE +# endif // TUNE_TRANSPOSE + +# if TUNE_LOAD == 0 +# define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT +# else // TUNE_LOAD == 1 +# define TUNE_LOAD_MODIFIER cub::LOAD_CA +# endif // TUNE_LOAD + +template +struct policy_hub_t +{ + struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> + { + static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD; + + static constexpr int ITEMS_PER_THREAD = + CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); + + using SelectIfPolicyT = + cub::AgentSelectIfPolicy; + }; + + using MaxPolicy = policy_t; +}; +#endif // !TUNE_BASE + +template +static void unique(nvbench::state& state, nvbench::type_list) +{ + using input_it_t = const T*; + using flag_it_t = cub::NullType*; + using output_it_t = T*; + using num_selected_it_t = OffsetT*; + using select_op_t = cub::NullType; + using equality_op_t = cub::Equality; + using offset_t = OffsetT; + constexpr bool may_alias = InPlaceAlgT::value; + +#if !TUNE_BASE + using policy_t = policy_hub_t; + using dispatch_t = cub::DispatchSelectIf< + input_it_t, + flag_it_t, + output_it_t, + num_selected_it_t, + select_op_t, + equality_op_t, + offset_t, + keep_rejects, + may_alias, + policy_t>; +#else // TUNE_BASE + using dispatch_t = cub::DispatchSelectIf< + input_it_t, + flag_it_t, + output_it_t, + num_selected_it_t, + select_op_t, + equality_op_t, + offset_t, + keep_rejects, + may_alias>; +#endif // TUNE_BASE + + // Retrieve axis parameters + const auto elements = static_cast(state.get_int64("Elements{io}")); + constexpr std::size_t min_segment_size = 1; + const std::size_t max_segment_size = static_cast(state.get_int64("MaxSegSize")); + + thrust::device_vector in = generate.uniform.key_segments(elements, min_segment_size, max_segment_size); + thrust::device_vector out(elements); + thrust::device_vector num_unique_out(1); + + input_it_t d_in = thrust::raw_pointer_cast(in.data()); + output_it_t d_out = thrust::raw_pointer_cast(out.data()); + flag_it_t d_flags = nullptr; + num_selected_it_t d_num_unique = thrust::raw_pointer_cast(num_unique_out.data()); + + // Get temporary storage requirements + std::size_t temp_size{}; + dispatch_t::Dispatch( + nullptr, temp_size, d_in, d_flags, d_out, d_num_unique, select_op_t{}, equality_op_t{}, elements, 0); + + thrust::device_vector temp(temp_size); + auto* temp_storage = thrust::raw_pointer_cast(temp.data()); + + // Get number of unique elements + dispatch_t::Dispatch( + temp_storage, temp_size, d_in, d_flags, d_out, d_num_unique, select_op_t{}, equality_op_t{}, elements, 0); + + cudaDeviceSynchronize(); + const OffsetT num_unique = num_unique_out[0]; + + state.add_element_count(elements); + state.add_global_memory_reads(elements); + state.add_global_memory_writes(num_unique); + state.add_global_memory_writes(1); + + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { + dispatch_t::Dispatch( + 
temp_storage, + temp_size, + d_in, + d_flags, + d_out, + d_num_unique, + select_op_t{}, + equality_op_t{}, + elements, + launch.get_stream()); + }); +} + +using in_place_alg = nvbench::type_list<::cuda::std::false_type, ::cuda::std::true_type>; + +NVBENCH_BENCH_TYPES(unique, NVBENCH_TYPE_AXES(fundamental_types, offset_types, in_place_alg)) + .set_name("base") + .set_type_axes_names({"T{ct}", "OffsetT{ct}", "IsInPlace{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) + .add_int64_power_of_two_axis("MaxSegSize", {1, 4, 8}); diff --git a/cub/benchmarks/bench/transform/babelstream.h b/cub/benchmarks/bench/transform/babelstream.h new file mode 100644 index 0000000000..0f482d59e2 --- /dev/null +++ b/cub/benchmarks/bench/transform/babelstream.h @@ -0,0 +1,104 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause + +#include +#include + +#include + +#include + +#include + +template +#if TUNE_BASE +using policy_hub_t = cub::detail::transform::policy_hub>; +#else +struct policy_hub_t +{ + struct max_policy : cub::ChainedPolicy<350, max_policy, max_policy> + { + static constexpr int min_bif = cub::detail::transform::arch_to_min_bytes_in_flight(__CUDA_ARCH_LIST__); + static constexpr auto algorithm = static_cast(TUNE_ALGORITHM); + using algo_policy = + ::cuda::std::_If>; + }; +}; +#endif + +#ifdef TUNE_T +using element_types = nvbench::type_list; +#else +using element_types = + nvbench::type_list; +#endif + +// BabelStream uses 2^25, H200 can fit 2^31 int128s +// 2^20 chars / 2^16 int128 saturate V100 (min_bif =12 * SM count =80) +// 2^21 chars / 2^17 int128 saturate A100 (min_bif =16 * SM count =108) +// 2^23 chars / 2^19 int128 saturate H100/H200 HBM3 (min_bif =32or48 * SM count =132) +// inline auto array_size_powers = std::vector{28}; +inline auto array_size_powers = nvbench::range(16, 28, 4); + +template +void bench_transform( + nvbench::state& state, + ::cuda::std::tuple inputs, + RandomAccessIteratorOut output, + OffsetT num_items, + TransformOp transform_op, + ExecTag exec_tag = nvbench::exec_tag::no_batch) +{ + state.exec(exec_tag, [&](const nvbench::launch& launch) { + cub::detail::transform::dispatch_t< + false, + OffsetT, + ::cuda::std::tuple, + RandomAccessIteratorOut, + TransformOp, + policy_hub_t>::dispatch(inputs, output, num_items, transform_op, launch.get_stream()); + }); +} + +// Modified from BabelStream to also work for integers +inline constexpr auto startA = 1; // BabelStream: 0.1 +inline constexpr auto startB = 2; // BabelStream: 0.2 +inline constexpr auto startC = 3; // BabelStream: 0.1 +inline constexpr auto startScalar = 4; // BabelStream: 0.4 + +// TODO(bgruber): we should put those somewhere into libcu++: +// from C++ GSL +struct narrowing_error : std::runtime_error +{ + narrowing_error() + : std::runtime_error("Narrowing error") + {} +}; + +// from C++ GSL +// implementation insipired by: https://github.com/microsoft/GSL/blob/main/include/gsl/narrow +template ::value, int> = 0> +constexpr DstT narrow(SrcT value) +{ + constexpr bool is_different_signedness = ::cuda::std::is_signed::value != ::cuda::std::is_signed::value; + const auto converted = static_cast(value); + if (static_cast(converted) != value || (is_different_signedness && ((converted < DstT{}) != (value < SrcT{})))) + { + throw narrowing_error{}; + } + return converted; +} diff --git a/cub/benchmarks/bench/transform/babelstream1.cu b/cub/benchmarks/bench/transform/babelstream1.cu new file mode 100644 
index 0000000000..87abdfef6f
--- /dev/null
+++ b/cub/benchmarks/bench/transform/babelstream1.cu
@@ -0,0 +1,46 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+// %RANGE% TUNE_THREADS tpb 128:1024:128
+// %RANGE% TUNE_ALGORITHM alg 0:1:1
+
+// keep checks at the top so compilation of discarded variants fails really fast
+#if !TUNE_BASE
+#  if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
+#    error "Cannot compile algorithm 4 (ublkcp) below sm90"
+#  endif
+
+#  if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
+#    error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
+#  endif
+#endif
+
+#include "babelstream.h"
+
+#if !TUNE_BASE
+#  if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
+#    error "This benchmark does not support being compiled for multiple architectures"
+#  endif
+#endif
+
+template <typename T, typename OffsetT>
+static void mul(nvbench::state& state, nvbench::type_list<T, OffsetT>)
+{
+  const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
+  thrust::device_vector<T> b(n, startB);
+  thrust::device_vector<T> c(n, startC);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(n);
+  state.add_global_memory_writes<T>(n);
+
+  const T scalar = startScalar;
+  bench_transform(state, ::cuda::std::tuple{c.begin()}, b.begin(), n, [=] _CCCL_DEVICE(const T& ci) {
+    return ci * scalar;
+  });
+}
+
+NVBENCH_BENCH_TYPES(mul, NVBENCH_TYPE_AXES(element_types, offset_types))
+  .set_name("mul")
+  .set_type_axes_names({"T{ct}", "OffsetT{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", array_size_powers);
diff --git a/cub/benchmarks/bench/transform/babelstream2.cu b/cub/benchmarks/bench/transform/babelstream2.cu
new file mode 100644
index 0000000000..c8fa017b78
--- /dev/null
+++ b/cub/benchmarks/bench/transform/babelstream2.cu
@@ -0,0 +1,69 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+// %RANGE% TUNE_THREADS tpb 128:1024:128
+// %RANGE% TUNE_ALGORITHM alg 0:1:1
+
+// keep checks at the top so compilation of discarded variants fails really fast
+#if !TUNE_BASE
+#  if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
+#    error "Cannot compile algorithm 4 (ublkcp) below sm90"
+#  endif
+
+#  if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
+#    error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
+#  endif
+#endif
+
+#include "babelstream.h"
+
+#if !TUNE_BASE
+#  if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
+#    error "This benchmark does not support being compiled for multiple architectures"
+#  endif
+#endif
+
+template <typename T, typename OffsetT>
+static void add(nvbench::state& state, nvbench::type_list<T, OffsetT>)
+{
+  const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
+  thrust::device_vector<T> a(n, startA);
+  thrust::device_vector<T> b(n, startB);
+  thrust::device_vector<T> c(n, startC);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(2 * n);
+  state.add_global_memory_writes<T>(n);
+  bench_transform(
+    state, ::cuda::std::tuple{a.begin(), b.begin()}, c.begin(), n, [] _CCCL_DEVICE(const T& ai, const T& bi) -> T {
+      return ai + bi;
+    });
+}
+
+NVBENCH_BENCH_TYPES(add, NVBENCH_TYPE_AXES(element_types, offset_types))
+  .set_name("add")
+  .set_type_axes_names({"T{ct}", "OffsetT{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", array_size_powers);
+
+template <typename T, typename OffsetT>
+static void triad(nvbench::state& state, nvbench::type_list<T, OffsetT>)
+{
+  const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
+  thrust::device_vector<T> a(n, startA);
+  thrust::device_vector<T> b(n, startB);
+  thrust::device_vector<T> c(n, startC);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(2 * n);
+  state.add_global_memory_writes<T>(n);
+  const T scalar = startScalar;
+  bench_transform(
+    state, ::cuda::std::tuple{b.begin(), c.begin()}, a.begin(), n, [=] _CCCL_DEVICE(const T& bi, const T& ci) {
+      return bi + scalar * ci;
+    });
+}
+
+NVBENCH_BENCH_TYPES(triad, NVBENCH_TYPE_AXES(element_types, offset_types))
+  .set_name("triad")
+  .set_type_axes_names({"T{ct}", "OffsetT{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", array_size_powers);
diff --git a/cub/benchmarks/bench/transform/babelstream3.cu b/cub/benchmarks/bench/transform/babelstream3.cu
new file mode 100644
index 0000000000..db54155421
--- /dev/null
+++ b/cub/benchmarks/bench/transform/babelstream3.cu
@@ -0,0 +1,64 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+// %RANGE% TUNE_THREADS tpb 128:1024:128
+// %RANGE% TUNE_ALGORITHM alg 0:1:1
+
+// keep checks at the top so compilation of discarded variants fails really fast
+#if !TUNE_BASE
+#  if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
+#    error "Cannot compile algorithm 4 (ublkcp) below sm90"
+#  endif
+
+#  if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
+#    error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
+#  endif
+#endif
+
+#include "babelstream.h"
+
+#if !TUNE_BASE
+#  if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
+#    error "This benchmark does not support being compiled for multiple architectures"
+#  endif
+#endif
+
+template <typename T, typename OffsetT>
+static void nstream(nvbench::state& state, nvbench::type_list<T, OffsetT>)
+{
+  const auto n         = narrow<OffsetT>(state.get_int64("Elements{io}"));
+  const auto overwrite = static_cast<bool>(state.get_int64("OverwriteInput"));
+  thrust::device_vector<T> a(n, startA);
+  thrust::device_vector<T> b(n, startB);
+  thrust::device_vector<T> c(n, startC);
+
+  // The BabelStream nstream overwrites one input array to avoid write-allocation of cache lines. However, this changes
+  // the data that is computed for each iteration and results in an unstable workload. Therefore, we added an axis to
+  // choose a different output array. Pass `-a OverwriteInput=0` to the benchmark to disable overwriting the input.
+  thrust::device_vector<T> d;
+  if (!overwrite)
+  {
+    d.resize(n);
+  }
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(3 * n);
+  state.add_global_memory_writes<T>(n);
+  const T scalar = startScalar;
+  bench_transform(
+    state,
+    ::cuda::std::tuple{a.begin(), b.begin(), c.begin()},
+    overwrite ? a.begin() : d.begin(),
+    n,
+    [=] _CCCL_DEVICE(const T& ai, const T& bi, const T& ci) {
+      return ai + bi + scalar * ci;
+    },
+    nvbench::exec_tag::none); // Use batch mode for benchmarking since the workload changes. Not necessary when
+                              // OverwriteInput=0, but doesn't hurt.
+}
+
+NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(element_types, offset_types))
+  .set_name("nstream")
+  .set_type_axes_names({"T{ct}", "OffsetT{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", array_size_powers)
+  .add_int64_axis("OverwriteInput", {1});
diff --git a/cub/cmake/header_test.in b/cub/cmake/header_test.in
index 547c2030ab..300fa6abb9 100644
--- a/cub/cmake/header_test.in
+++ b/cub/cmake/header_test.in
@@ -5,7 +5,7 @@
 // Define CUB_MACRO_CHECK(macro, header), which emits a diagnostic indicating
 // a potential macro collision and halts.
 //
-// Use raw platform checks instead of the CUB_HOST_COMPILER macros since we
+// Use raw platform macros instead of the CCCL macros since we
 // don't want to #include any headers other than the one being tested.
 //
 // This is only implemented for MSVC/GCC/Clang.
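As context for reviewers: all of the babelstream benchmarks above funnel their element counts through the GSL-style `narrow` helper defined in `babelstream.h`, which casts and then round-trips the value (plus a signedness check) so that a lossy conversion throws instead of silently truncating. The following is a minimal standalone sketch of that pattern in plain C++; the names here are local to the example and not part of CUB's API:

```cpp
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <type_traits>

struct narrowing_error : std::runtime_error
{
  narrowing_error() : std::runtime_error("Narrowing error") {}
};

// Same round-trip check as the GSL-inspired helper in babelstream.h above.
template <typename DstT, typename SrcT>
constexpr DstT narrow(SrcT value)
{
  const auto converted = static_cast<DstT>(value);
  const bool different_signedness = std::is_signed<DstT>::value != std::is_signed<SrcT>::value;
  // Reject conversions that do not round-trip or that flip the sign.
  if (static_cast<SrcT>(converted) != value
      || (different_signedness && ((converted < DstT{}) != (value < SrcT{}))))
  {
    throw narrowing_error{};
  }
  return converted;
}

int main()
{
  const std::int64_t elements = std::int64_t{1} << 20;
  std::cout << narrow<std::int32_t>(elements) << '\n'; // fine: fits into 32 bits
  try
  {
    narrow<std::int32_t>(std::int64_t{1} << 40); // throws: value does not round-trip
  }
  catch (const narrowing_error&)
  {
    std::cout << "narrowing detected\n";
  }
}
```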
diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh index ce204273da..f324de52bc 100644 --- a/cub/cub/agent/agent_histogram.cuh +++ b/cub/cub/agent/agent_histogram.cuh @@ -287,22 +287,22 @@ struct AgentHistogram SampleT* d_native_samples; /// The number of output bins for each channel - int (&num_output_bins)[NUM_ACTIVE_CHANNELS]; + int* num_output_bins; /// The number of privatized bins for each channel - int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS]; + int* num_privatized_bins; - /// Reference to gmem privatized histograms for each channel + /// Copy of gmem privatized histograms for each channel CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS]; /// Reference to final output histograms (gmem) - CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS]; + CounterT** d_output_histograms; /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel - OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS]; + OutputDecodeOpT* output_decode_op; /// The transform operator for determining privatized counter indices from samples, one for each channel - PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]; + PrivatizedDecodeOpT* privatized_decode_op; /// Whether to prefer privatized smem counters vs privatized global counters bool prefer_smem; @@ -810,12 +810,12 @@ struct AgentHistogram _CCCL_DEVICE _CCCL_FORCEINLINE AgentHistogram( TempStorage& temp_storage, SampleIteratorT d_samples, - int (&num_output_bins)[NUM_ACTIVE_CHANNELS], - int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS], - CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS], - CounterT* (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS], - OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS], - PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]) + int* num_output_bins, + int* num_privatized_bins, + CounterT** d_output_histograms, + CounterT** d_privatized_histograms, + OutputDecodeOpT* output_decode_op, + PrivatizedDecodeOpT* privatized_decode_op) : temp_storage(temp_storage.Alias()) , d_wrapped_samples(d_samples) , d_native_samples(NativePointer(d_wrapped_samples)) diff --git a/cub/cub/block/block_adjacent_difference.cuh b/cub/cub/block/block_adjacent_difference.cuh index 709e9c1bd0..5bc3bae321 100644 --- a/cub/cub/block/block_adjacent_difference.cuh +++ b/cub/cub/block/block_adjacent_difference.cuh @@ -26,8 +26,9 @@ * ******************************************************************************/ -//! @file The cub::BlockAdjacentDifference class provides collective methods for computing -//! the differences of adjacent elements partitioned across a CUDA thread block. +//! @file +//! The cub::BlockAdjacentDifference class provides collective methods for computing the differences of adjacent +//! elements partitioned across a CUDA thread block. #pragma once diff --git a/cub/cub/block/block_load.cuh b/cub/cub/block/block_load.cuh index 76c073f1b5..284ac4401e 100644 --- a/cub/cub/block/block_load.cuh +++ b/cub/cub/block/block_load.cuh @@ -26,7 +26,8 @@ * ******************************************************************************/ -//! @file block_load.cuh Operations for reading linear tiles of data into the CUDA thread block. +//! @file +//! block_load.cuh Operations for reading linear tiles of data into the CUDA thread block. 
 #pragma once
diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh
index c91731ae03..21a4879192 100644
--- a/cub/cub/block/block_radix_rank.cuh
+++ b/cub/cub/block/block_radix_rank.cuh
@@ -26,7 +26,8 @@
  *
  ******************************************************************************/

-//! @file cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block
+//! @file
+//! cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block

 #pragma once
diff --git a/cub/cub/block/block_reduce.cuh b/cub/cub/block/block_reduce.cuh
index d35c90c06d..12c97ee5b8 100644
--- a/cub/cub/block/block_reduce.cuh
+++ b/cub/cub/block/block_reduce.cuh
@@ -26,8 +26,9 @@
  *
  ******************************************************************************/

-//! @file The cub::BlockReduce class provides :ref:`collective <collective-primitives>` methods for computing
-//! a parallel reduction of items partitioned across a CUDA thread block.
+//! @file
+//! The cub::BlockReduce class provides :ref:`collective <collective-primitives>` methods for computing a parallel
+//! reduction of items partitioned across a CUDA thread block.

 #pragma once
diff --git a/cub/cub/block/block_scan.cuh b/cub/cub/block/block_scan.cuh
index df7ab6e814..afc4df76d7 100644
--- a/cub/cub/block/block_scan.cuh
+++ b/cub/cub/block/block_scan.cuh
@@ -26,8 +26,9 @@
  *
  ******************************************************************************/

-//! @file The cub::BlockScan class provides :ref:`collective <collective-primitives>` methods for computing a
-//! parallel prefix sum/scan of items partitioned across a CUDA thread block.
+//! @file
+//! The cub::BlockScan class provides :ref:`collective <collective-primitives>` methods for computing a parallel prefix
+//! sum/scan of items partitioned across a CUDA thread block.

 #pragma once
diff --git a/cub/cub/block/block_shuffle.cuh b/cub/cub/block/block_shuffle.cuh
index 048c6e3a8e..a3dedcc3c7 100644
--- a/cub/cub/block/block_shuffle.cuh
+++ b/cub/cub/block/block_shuffle.cuh
@@ -26,8 +26,9 @@
  *
  ******************************************************************************/

-//! @file The cub::BlockShuffle class provides :ref:`collective <collective-primitives>` methods for shuffling
-//! data partitioned across a CUDA thread block.
+//! @file
+//! The cub::BlockShuffle class provides :ref:`collective <collective-primitives>` methods for shuffling data
+//! partitioned across a CUDA thread block.

 #pragma once
diff --git a/cub/cub/block/block_store.cuh b/cub/cub/block/block_store.cuh
index 6c9f4f57a8..9d057d7fe4 100644
--- a/cub/cub/block/block_store.cuh
+++ b/cub/cub/block/block_store.cuh
@@ -26,7 +26,8 @@
  *
  ******************************************************************************/

-//! @file Operations for writing linear segments of data from the CUDA thread block
+//! @file
+//! Operations for writing linear segments of data from the CUDA thread block

 #pragma once
diff --git a/cub/cub/cmake/cub-config-version.cmake b/cub/cub/cmake/cub-config-version.cmake
index 2a12c4fa2b..86cdca2275 100644
--- a/cub/cub/cmake/cub-config-version.cmake
+++ b/cub/cub/cmake/cub-config-version.cmake
@@ -2,7 +2,7 @@
 include("${CMAKE_CURRENT_LIST_DIR}/cub-header-search.cmake")

 set(CUB_VERSION_MAJOR 2)
-set(CUB_VERSION_MINOR 6)
+set(CUB_VERSION_MINOR 7)
 set(CUB_VERSION_PATCH 0)
 set(CUB_VERSION_TWEAK 0)
 set(CUB_VERSION "${CUB_VERSION_MAJOR}.${CUB_VERSION_MINOR}.${CUB_VERSION_PATCH}.${CUB_VERSION_TWEAK}")
diff --git a/cub/cub/config.cuh b/cub/cub/config.cuh
index f7f25ddef0..123f2df46b 100644
--- a/cub/cub/config.cuh
+++ b/cub/cub/config.cuh
@@ -33,7 +33,7 @@
 #pragma once

 // For _CCCL_IMPLICIT_SYSTEM_HEADER
-#include <cuda/__cccl_config>
+#include <cuda/__cccl_config> // IWYU pragma: export

 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
@@ -43,9 +43,9 @@
 #  pragma system_header
 #endif // no system header

-#include <cub/util_arch.cuh>
-#include <cub/util_compiler.cuh>
-#include <cub/util_cpp_dialect.cuh>
-#include <cub/util_deprecated.cuh>
-#include <cub/util_macro.cuh>
-#include <cub/util_namespace.cuh>
+#include <cub/util_arch.cuh> // IWYU pragma: export
+#include <cub/util_compiler.cuh> // IWYU pragma: export
+#include <cub/util_cpp_dialect.cuh> // IWYU pragma: export
+#include <cub/util_deprecated.cuh> // IWYU pragma: export
+#include <cub/util_macro.cuh> // IWYU pragma: export
+#include <cub/util_namespace.cuh> // IWYU pragma: export
diff --git a/cub/cub/cub.cuh b/cub/cub/cub.cuh
index f02ae6c002..2c4d6dd5f4 100644
--- a/cub/cub/cub.cuh
+++ b/cub/cub/cub.cuh
@@ -76,6 +76,7 @@
 #include <cub/device/device_segmented_sort.cuh>
 #include <cub/device/device_select.cuh>
 #include <cub/device/device_spmv.cuh>
+#include <cub/device/device_transform.cuh>

 // Grid
 // #include <cub/grid/grid_barrier.cuh>
diff --git a/cub/cub/detail/type_traits.cuh b/cub/cub/detail/type_traits.cuh
index 10d40cacd1..12dce69c13 100644
--- a/cub/cub/detail/type_traits.cuh
+++ b/cub/cub/detail/type_traits.cuh
@@ -50,6 +50,8 @@ _CCCL_SUPPRESS_DEPRECATED_PUSH
 _CCCL_SUPPRESS_DEPRECATED_POP
 #include <cuda/std/type_traits>

+#define _CUB_TEMPLATE_REQUIRES(...) ::cuda::std::__enable_if_t<(__VA_ARGS__)>* = nullptr
+
 CUB_NAMESPACE_BEGIN
 namespace detail
 {
@@ -62,9 +64,101 @@ using invoke_result_t = ::cuda::std::invoke_result_t<F, Args...>;
 #endif

-/// The type of intermediate accumulator (according to P2322R6)
-template <typename Invokable, typename InitT, typename InputT>
-using accumulator_t = typename ::cuda::std::decay<invoke_result_t<Invokable, InitT, InputT>>::type;
+template <typename T, typename... Ts>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool are_same()
+{
+  return ::cuda::std::conjunction<::cuda::std::is_same<T, Ts>...>::value;
+}
+
+template <typename T, typename... Ts>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool is_one_of()
+{
+  return ::cuda::std::disjunction<::cuda::std::is_same<T, Ts>...>::value;
+}
+
+template <typename... Ts>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool always_false()
+{
+  return false;
+}
+
+template <typename T, typename V, typename = void>
+struct has_binary_call_operator : ::cuda::std::false_type
+{};
+
+template <typename T, typename V>
+struct has_binary_call_operator<
+  T,
+  V,
+  ::cuda::std::void_t<decltype(::cuda::std::declval<T>()(::cuda::std::declval<V>(), ::cuda::std::declval<V>()))>>
+    : ::cuda::std::true_type
+{};
+
+/***********************************************************************************************************************
+ * Array-like type traits
+ **********************************************************************************************************************/
+
+template <typename T, typename = void>
+struct has_subscript : ::cuda::std::false_type
+{};
+
+template <typename T>
+struct has_subscript<T, ::cuda::std::void_t<decltype(::cuda::std::declval<T>()[0])>> : ::cuda::std::true_type
+{};
+
+template <typename T>
+using has_subscript_t = typename has_subscript<T>::type;
+
+template <typename T, typename = void>
+struct has_size : ::cuda::std::false_type
+{};
+
+// TODO: use ::cuda::std::size(::cuda::std::declval<T>()) once std::size becomes available in libcu++
+template <typename T>
+struct has_size<T, ::cuda::std::void_t<decltype(::cuda::std::declval<T>().size())>> : ::cuda::std::true_type
+{};
+
+template <typename T, ::cuda::std::size_t N>
+struct has_size<T[N]> : ::cuda::std::true_type
+{};
+
+template <typename T>
+using has_size_t = typename has_size<T>::type;
+
+/***********************************************************************************************************************
+ * StaticSize: a type trait that returns the number of elements in an Array-like type
+ **********************************************************************************************************************/
+// StaticSize is useful where size(obj) cannot be checked at compile time
+// e.g.
+//   using Array = NonTriviallyConstructible[8];
+//   std::size(Array{})   // compile error
+//   static_size<Array>() // ok
+
+template <typename T, typename = void>
+struct StaticSize
+{
+  static_assert(detail::always_false<T>(), "StaticSize not supported for this type");
+};
+
+template <typename T>
+struct StaticSize<T, ::cuda::std::void_t<decltype(::cuda::std::integral_constant<::cuda::std::size_t, T{}.size()>{})>>
+{
+  static_assert(::cuda::std::is_trivially_constructible<T>::value, "T must be trivially constructible");
+  static constexpr auto value = T{}.size();
+};
+
+template <typename T, ::cuda::std::size_t N>
+struct StaticSize<T[N]>
+{
+  static constexpr auto value = N;
+};
+
+template <typename T>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::size_t static_size()
+{
+  return StaticSize<T>::value;
+}
 } // namespace detail
 CUB_NAMESPACE_END
diff --git a/cub/cub/device/device_copy.cuh b/cub/cub/device/device_copy.cuh
index 0d222475b2..a6d24a5221 100644
--- a/cub/cub/device/device_copy.cuh
+++ b/cub/cub/device/device_copy.cuh
@@ -25,7 +25,8 @@
  *
  ******************************************************************************/

-//! @file cub::DeviceCopy provides device-wide, parallel operations for copying data.
+//! @file
+//! cub::DeviceCopy provides device-wide, parallel operations for copying data.

 #pragma once
diff --git a/cub/cub/device/device_histogram.cuh b/cub/cub/device/device_histogram.cuh
index 46f4bee557..e6abc4bd07 100644
--- a/cub/cub/device/device_histogram.cuh
+++ b/cub/cub/device/device_histogram.cuh
@@ -26,9 +26,9 @@
  *
  ******************************************************************************/

-//! @file cub::DeviceHistogram provides device-wide parallel operations for
-//! constructing histogram(s) from a sequence of samples data residing
-//! within device-accessible memory.
+//! @file
+//! cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of
+//! sample data residing within device-accessible memory.

 #pragma once
diff --git a/cub/cub/device/device_memcpy.cuh b/cub/cub/device/device_memcpy.cuh
index 1359863a76..e71431cb74 100644
--- a/cub/cub/device/device_memcpy.cuh
+++ b/cub/cub/device/device_memcpy.cuh
@@ -25,7 +25,8 @@
  *
  ******************************************************************************/

-//! @file cub::DeviceMemcpy provides device-wide, parallel operations for copying data.
+//! @file
+//! cub::DeviceMemcpy provides device-wide, parallel operations for copying data.

 #pragma once
diff --git a/cub/cub/device/device_partition.cuh b/cub/cub/device/device_partition.cuh
index 08a2ae531f..28bfc377bd 100644
--- a/cub/cub/device/device_partition.cuh
+++ b/cub/cub/device/device_partition.cuh
@@ -26,8 +26,9 @@
  *
  ******************************************************************************/

-//! @file cub::DevicePartition provides device-wide, parallel operations for
-//! partitioning sequences of data items residing within device-accessible memory.
+//! @file
+//! cub::DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing
+//! within device-accessible memory.
 #pragma once
diff --git a/cub/cub/device/device_radix_sort.cuh b/cub/cub/device/device_radix_sort.cuh
index c653badc47..a14c5e4364 100644
--- a/cub/cub/device/device_radix_sort.cuh
+++ b/cub/cub/device/device_radix_sort.cuh
@@ -26,9 +26,9 @@
  *
  ******************************************************************************/

-//! @file cub::DeviceRadixSort provides device-wide, parallel operations for
-//! computing a radix sort across a sequence of data items residing within
-//! device-accessible memory.
+//! @file
+//! cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data
+//! items residing within device-accessible memory.

 #pragma once
diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh
index a31e641920..4b02129123 100644
--- a/cub/cub/device/device_reduce.cuh
+++ b/cub/cub/device/device_reduce.cuh
@@ -26,9 +26,9 @@
  *
  ******************************************************************************/

-//! @file cub::DeviceReduce provides device-wide, parallel operations for
-//! computing a reduction across a sequence of data items residing within
-//! device-accessible memory.
+//! @file
+//! cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data
+//! items residing within device-accessible memory.

 #pragma once
diff --git a/cub/cub/device/device_run_length_encode.cuh b/cub/cub/device/device_run_length_encode.cuh
index f3b1a3e669..120562a461 100644
--- a/cub/cub/device/device_run_length_encode.cuh
+++ b/cub/cub/device/device_run_length_encode.cuh
@@ -26,14 +26,16 @@
  *
  ******************************************************************************/

-//! @file cub::DeviceRunLengthEncode provides device-wide, parallel operations
-//! for computing a run-length encoding across a sequence of data items
-//! residing within device-accessible memory.
+//! @file
+//! cub::DeviceRunLengthEncode provides device-wide, parallel operations for computing a run-length encoding across a
+//! sequence of data items residing within device-accessible memory.

 #pragma once

 #include <cub/config.cuh>

+#include <cuda/std/functional>
+
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
@@ -200,7 +202,7 @@ struct DeviceRunLengthEncode
     // Generator type for providing 1s values for run-length reduction
     using lengths_input_iterator_t = ConstantInputIterator<length_t, offset_t>;

-    using accum_t = detail::accumulator_t<reduction_op_t, length_t, length_t>;
+    using accum_t = ::cuda::std::__accumulator_t<reduction_op_t, length_t, length_t>;

     using key_t = cub::detail::non_void_value_t<UniqueOutputIteratorT, cub::detail::value_t<InputIteratorT>>;
diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh
index 29f3cf6c1e..27882e9cee 100644
--- a/cub/cub/device/device_scan.cuh
+++ b/cub/cub/device/device_scan.cuh
@@ -26,8 +26,9 @@
  *
  ******************************************************************************/

-//! @file cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across
-//! a sequence of data items residing within device-accessible memory.
+//! @file
+//! cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data
+//! items residing within device-accessible memory.

 #pragma once

@@ -41,12 +42,15 @@
 #  pragma system_header
 #endif // no system header

+#include <cub/detail/choose_offset.cuh>
 #include <cub/detail/nvtx.cuh>
 #include <cub/device/dispatch/dispatch_scan.cuh>
 #include <cub/device/dispatch/dispatch_scan_by_key.cuh>
 #include <cub/thread/thread_operators.cuh>
 #include <cub/util_deprecated.cuh>

+#include <cuda/std/functional>
+
 CUB_NAMESPACE_BEGIN

 //! @rst
@@ -152,6 +156,9 @@ struct DeviceScan
   //! @tparam OutputIteratorT
   //!   **[inferred]** Random-access output iterator type for writing scan outputs @iterator
   //!
+  //! @tparam NumItemsT
+  //!   **[inferred]** An integral type representing the number of input elements
+  //!
   //! @param[in] d_temp_storage
   //!   Device-accessible allocation of temporary storage. When `nullptr`, the
   //!   required allocation size is written to `temp_storage_bytes` and no work is done.
@@ -172,19 +179,19 @@ struct DeviceScan
   //! @rst
   //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   //! @endrst
-  template <typename InputIteratorT, typename OutputIteratorT>
+  template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSum");

-    // Signed integer type for global offsets
-    using OffsetT = int;
+    // Unsigned integer type for global offsets
+    using OffsetT = detail::choose_offset_t<NumItemsT>;

     using InitT = cub::detail::value_t<InputIteratorT>;

     // Initial value
@@ -195,13 +202,13 @@ struct DeviceScan
   }

 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
-  template <typename InputIteratorT, typename OutputIteratorT>
+  template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {
@@ -261,6 +268,9 @@ struct DeviceScan
   //! @tparam IteratorT
   //!   **[inferred]** Random-access iterator type for reading scan inputs and writing scan outputs
   //!
+  //! @tparam NumItemsT
+  //!   **[inferred]** An integral type representing the number of input elements
+  //!
   //! @param[in] d_temp_storage
   //!   Device-accessible allocation of temporary storage. When `nullptr`, the
   //!   required allocation size is written to `temp_storage_bytes` and no work is done.
@@ -278,20 +288,20 @@ struct DeviceScan
   //! @rst
   //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   //! @endrst
-  template <typename IteratorT>
+  template <typename IteratorT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(
-    void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, int num_items, cudaStream_t stream = 0)
+    void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0)
   {
     return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream);
   }

 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
-  template <typename IteratorT>
+  template <typename IteratorT, typename NumItemsT>
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     IteratorT d_data,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {
@@ -377,6 +387,9 @@ struct DeviceScan
   //!   **[inferred]** Type of the `init_value` used Binary scan functor type
   //!   having member `T operator()(const T &a, const T &b)`
   //!
+  //! @tparam NumItemsT
+  //!   **[inferred]** An integral type representing the number of input elements
+  //!
   //! @param[in] d_temp_storage
   //!   Device-accessible allocation of temporary storage. When `nullptr`, the
   //!   required allocation size is written to `temp_storage_bytes` and no work is done.
@@ -403,7 +416,7 @@ struct DeviceScan
   //! @rst
   //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   //!
   //! @endrst
-  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT>
+  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
     ScanOpT scan_op,
     InitValueT init_value,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan");

-    // Signed integer type for global offsets
-    using OffsetT = int;
+    // Unsigned integer type for global offsets
+    using OffsetT = detail::choose_offset_t<NumItemsT>;

     return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, detail::InputValue<InitValueT>, OffsetT>::Dispatch(
       d_temp_storage,
@@ -431,7 +444,7 @@ struct DeviceScan
   }

 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
-  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT>
+  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
     ScanOpT scan_op,
     InitValueT init_value,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {
@@ -520,6 +533,9 @@ struct DeviceScan
   //!   **[inferred]** Type of the `init_value` used Binary scan functor type
   //!   having member `T operator()(const T &a, const T &b)`
   //!
+  //! @tparam NumItemsT
+  //!   **[inferred]** An integral type representing the number of input elements
+  //!
   //! @param[in] d_temp_storage
   //!   Device-accessible allocation of temporary storage. When `nullptr`, the
   //!   required allocation size is written to `temp_storage_bytes` and no work is done.
@@ -543,28 +559,28 @@ struct DeviceScan
   //! @rst
   //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   //! @endrst
-  template <typename IteratorT, typename ScanOpT, typename InitValueT>
+  template <typename IteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     IteratorT d_data,
     ScanOpT scan_op,
     InitValueT init_value,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream = 0)
   {
     return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream);
   }

 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
-  template <typename IteratorT, typename ScanOpT, typename InitValueT>
+  template <typename IteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     IteratorT d_data,
     ScanOpT scan_op,
     InitValueT init_value,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {
@@ -655,6 +671,9 @@ struct DeviceScan
   //!   **[inferred]** Type of the `init_value` used Binary scan functor type
   //!   having member `T operator()(const T &a, const T &b)`
   //!
+  //! @tparam NumItemsT
+  //!   **[inferred]** An integral type representing the number of input elements
+  //!
   //! @param[in] d_temp_storage
   //!   Device-accessible allocation of temporary storage. When `nullptr`, the
   //!   required allocation size is written to `temp_storage_bytes` and no work is done.
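To make the effect of the new `NumItemsT` parameter concrete: the old entry points hard-coded `int num_items`, whereas now `detail::choose_offset_t<NumItemsT>` picks an offset type wide enough for the inferred count, so a 64-bit size can be passed straight through. A hedged usage sketch (buffer names are illustrative; the two-phase temp-storage idiom is the standard CUB pattern, and the element count shown would of course need a GPU with enough memory):

```cpp
#include <cub/device/device_scan.cuh>

#include <thrust/device_vector.h>

#include <cstdint>

int main()
{
  // With NumItemsT inferred, a 64-bit element count selects a 64-bit offset type internally.
  const std::int64_t num_items = std::int64_t{1} << 31; // more than INT_MAX elements
  thrust::device_vector<int> in(num_items, 1);
  thrust::device_vector<int> out(num_items);

  // Phase 1: query the required temporary storage size.
  void* d_temp_storage           = nullptr;
  std::size_t temp_storage_bytes = 0;
  cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in.begin(), out.begin(), num_items);

  // Phase 2: allocate and run the scan.
  thrust::device_vector<std::uint8_t> temp(temp_storage_bytes);
  d_temp_storage = thrust::raw_pointer_cast(temp.data());
  cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in.begin(), out.begin(), num_items);
  return 0;
}
```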
@@ -685,7 +704,8 @@ struct DeviceScan
             typename OutputIteratorT,
             typename ScanOpT,
             typename InitValueT,
-            typename InitValueIterT = InitValueT*>
+            typename InitValueIterT = InitValueT*,
+            typename NumItemsT      = int>
   CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
     ScanOpT scan_op,
     FutureValue<InitValueT, InitValueIterT> init_value,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan");

-    // Signed integer type for global offsets
-    using OffsetT = int;
+    // Unsigned integer type for global offsets
+    using OffsetT = detail::choose_offset_t<NumItemsT>;

     return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, detail::InputValue<InitValueT, InitValueIterT>, OffsetT>::Dispatch(
       d_temp_storage,
@@ -717,7 +737,8 @@ struct DeviceScan
             typename OutputIteratorT,
             typename ScanOpT,
             typename InitValueT,
-            typename InitValueIterT = InitValueT*>
+            typename InitValueIterT = InitValueT*,
+            typename NumItemsT      = int>
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
     ScanOpT scan_op,
     FutureValue<InitValueT, InitValueIterT> init_value,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {
@@ -809,6 +830,9 @@ struct DeviceScan
   //!   **[inferred]** Type of the `init_value` used Binary scan functor type
   //!   having member `T operator()(const T &a, const T &b)`
   //!
+  //! @tparam NumItemsT
+  //!   **[inferred]** An integral type representing the number of input elements
+  //!
   //! @param[in] d_temp_storage
   //!   Device-accessible allocation of temporary storage. When `nullptr`, the
   //!   required allocation size is written to `temp_storage_bytes` and no work is done.
@@ -832,28 +856,36 @@ struct DeviceScan
   //! @rst
   //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   //! @endrst
-  template <typename IteratorT, typename ScanOpT, typename InitValueT, typename InitValueIterT = InitValueT*>
+  template <typename IteratorT,
+            typename ScanOpT,
+            typename InitValueT,
+            typename InitValueIterT = InitValueT*,
+            typename NumItemsT      = int>
   CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     IteratorT d_data,
     ScanOpT scan_op,
     FutureValue<InitValueT, InitValueIterT> init_value,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream = 0)
   {
     return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream);
   }

 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
-  template <typename IteratorT, typename ScanOpT, typename InitValueT, typename InitValueIterT = InitValueT*>
+  template <typename IteratorT,
+            typename ScanOpT,
+            typename InitValueT,
+            typename InitValueIterT = InitValueT*,
+            typename NumItemsT      = int>
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     IteratorT d_data,
     ScanOpT scan_op,
     FutureValue<InitValueT, InitValueIterT> init_value,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {
@@ -923,6 +955,9 @@ struct DeviceScan
   //! @tparam OutputIteratorT
   //!   **[inferred]** Random-access output iterator type for writing scan outputs @iterator
   //!
+  //! @tparam NumItemsT
+  //!   **[inferred]** An integral type representing the number of input elements
+  //!
   //! @param[in] d_temp_storage
   //!   Device-accessible allocation of temporary storage. When `nullptr`, the
   //!   required allocation size is written to `temp_storage_bytes` and no work is done.
@@ -943,32 +978,32 @@ struct DeviceScan
   //! @rst
   //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   //!
   //! @endrst
-  template <typename InputIteratorT, typename OutputIteratorT>
+  template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSum");

-    // Signed integer type for global offsets
-    using OffsetT = int;
+    // Unsigned integer type for global offsets
+    using OffsetT = detail::choose_offset_t<NumItemsT>;

     return DispatchScan<InputIteratorT, OutputIteratorT, Sum, NullType, OffsetT>::Dispatch(
       d_temp_storage, temp_storage_bytes, d_in, d_out, Sum(), NullType(), num_items, stream);
   }

 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
-  template <typename InputIteratorT, typename OutputIteratorT>
+  template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {
@@ -1027,6 +1062,9 @@ struct DeviceScan
   //! @tparam IteratorT
   //!   **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
   //!
+  //! @tparam NumItemsT
+  //!   **[inferred]** An integral type representing the number of input elements
+  //!
   //! @param[in] d_temp_storage
   //!   Device-accessible allocation of temporary storage. When `nullptr`, the
   //!   required allocation size is written to `temp_storage_bytes` and no work is done.
@@ -1044,20 +1082,20 @@ struct DeviceScan
   //! @rst
   //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   //! @endrst
-  template <typename IteratorT>
+  template <typename IteratorT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(
-    void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, int num_items, cudaStream_t stream = 0)
+    void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0)
   {
     return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream);
   }

 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
-  template <typename IteratorT>
+  template <typename IteratorT, typename NumItemsT>
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     IteratorT d_data,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {
@@ -1137,6 +1175,9 @@ struct DeviceScan
   //! @tparam ScanOp
   //!   **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
   //!
+  //! @tparam NumItemsT
+  //!   **[inferred]** An integral type representing the number of input elements
+  //!
   //! @param[in]
   //!   d_temp_storage Device-accessible allocation of temporary storage.
   //!   When `nullptr`, the required allocation size is written to
@@ -1161,20 +1202,20 @@ struct DeviceScan
   //! @rst
   //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   //! @endrst
-  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT>
+  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
     ScanOpT scan_op,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScan");

-    // Signed integer type for global offsets
-    using OffsetT = int;
+    // Unsigned integer type for global offsets
+    using OffsetT = detail::choose_offset_t<NumItemsT>;

     return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, NullType, OffsetT>::Dispatch(
       d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), num_items, stream);
@@ -1220,6 +1261,9 @@ struct DeviceScan
   //! @tparam InitValueT
   //!   **[inferred]** Type of the `init_value`
   //!
+  //! @tparam NumItemsT
+  //!   **[inferred]** An integral type representing the number of input elements
+  //!
   //! @param[in] d_temp_storage
   //!   Device-accessible allocation of temporary storage.
   //!   When `nullptr`, the required allocation size is written to
@@ -1246,7 +1290,7 @@ struct DeviceScan
   //!
   //! @param[in] stream
   //!   CUDA stream to launch kernels within.
-  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT>
+  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanInit(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
     ScanOpT scan_op,
     InitValueT init_value,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScanInit");

-    // Signed integer type for global offsets
-    using OffsetT = int;
-    using AccumT  = cub::detail::accumulator_t<ScanOpT, InitValueT, cub::detail::value_t<InputIteratorT>>;
+    // Unsigned integer type for global offsets
+    using OffsetT = detail::choose_offset_t<NumItemsT>;
+    using AccumT  = ::cuda::std::__accumulator_t<ScanOpT, cub::detail::value_t<InputIteratorT>, InitValueT>;
     constexpr bool ForceInclusive = true;

     return DispatchScan<
@@ -1283,14 +1327,14 @@ struct DeviceScan
   }

 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
-  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT>
+  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename NumItemsT>
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
     ScanOpT scan_op,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {
@@ -1364,6 +1408,9 @@ struct DeviceScan
   //! @tparam ScanOp
   //!   **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
   //!
+  //! @tparam NumItemsT
+  //!   **[inferred]** An integral type representing the number of input elements
+  //!
   //! @param[in]
   //!   d_temp_storage Device-accessible allocation of temporary storage.
   //!   When `nullptr`, the required allocation size is written to
@@ -1385,26 +1432,26 @@ struct DeviceScan
   //! @rst
   //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   //! @endrst
-  template <typename IteratorT, typename ScanOpT>
+  template <typename IteratorT, typename ScanOpT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     IteratorT d_data,
     ScanOpT scan_op,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream = 0)
   {
     return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, num_items, stream);
   }

 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
-  template <typename IteratorT, typename ScanOpT>
+  template <typename IteratorT, typename ScanOpT, typename NumItemsT>
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     IteratorT d_data,
     ScanOpT scan_op,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {
diff --git a/cub/cub/device/device_segmented_radix_sort.cuh b/cub/cub/device/device_segmented_radix_sort.cuh
index eb6eecdcf3..cc627b971c 100644
--- a/cub/cub/device/device_segmented_radix_sort.cuh
+++ b/cub/cub/device/device_segmented_radix_sort.cuh
@@ -26,8 +26,9 @@
  *
  ******************************************************************************/

-//! @file cub::DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort
-//! across multiple, non-overlapping sequences of data items residing within device-accessible memory.
+//! @file
+//! cub::DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across
+//!
multiple, non-overlapping sequences of data items residing within device-accessible memory. #pragma once diff --git a/cub/cub/device/device_segmented_reduce.cuh b/cub/cub/device/device_segmented_reduce.cuh index 6a0875734e..ec5d017fc2 100644 --- a/cub/cub/device/device_segmented_reduce.cuh +++ b/cub/cub/device/device_segmented_reduce.cuh @@ -26,9 +26,9 @@ * ******************************************************************************/ -//! @file cub::DeviceSegmentedReduce provides device-wide, parallel operations -//! for computing a batched reduction across multiple sequences of data -//! items residing within device-accessible memory. +//! @file +//! cub::DeviceSegmentedReduce provides device-wide, parallel operations for computing a batched reduction across +//! multiple sequences of data items residing within device-accessible memory. #pragma once diff --git a/cub/cub/device/device_segmented_sort.cuh b/cub/cub/device/device_segmented_sort.cuh index 67a22c5e54..7d01b6d56a 100644 --- a/cub/cub/device/device_segmented_sort.cuh +++ b/cub/cub/device/device_segmented_sort.cuh @@ -25,9 +25,9 @@ * ******************************************************************************/ -//! @file cub::DeviceSegmentedSort provides device-wide, parallel operations for -//! computing a batched sort across multiple, non-overlapping sequences of -//! data items residing within device-accessible memory. +//! @file +//! cub::DeviceSegmentedSort provides device-wide, parallel operations for computing a batched sort across multiple, +//! non-overlapping sequences of data items residing within device-accessible memory. #pragma once diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh index 703a912829..332bbe6c7d 100644 --- a/cub/cub/device/device_select.cuh +++ b/cub/cub/device/device_select.cuh @@ -26,9 +26,9 @@ * ******************************************************************************/ -//! @file cub::DeviceSelect provides device-wide, parallel operations for -//! compacting selected items from sequences of data items residing within -//! device-accessible memory. +//! @file +//! cub::DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data +//! items residing within device-accessible memory. #pragma once diff --git a/cub/cub/device/device_spmv.cuh b/cub/cub/device/device_spmv.cuh index 32ac433f3e..8b7e60d435 100644 --- a/cub/cub/device/device_spmv.cuh +++ b/cub/cub/device/device_spmv.cuh @@ -27,8 +27,9 @@ * ******************************************************************************/ -//! @file cub::DeviceSpmv provides device-wide parallel operations for performing -//! sparse-matrix * vector multiplication (SpMV). +//! @file +//! cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication +//! (SpMV). #pragma once diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh new file mode 100644 index 0000000000..984109692f --- /dev/null +++ b/cub/cub/device/device_transform.cuh @@ -0,0 +1,271 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cub/detail/nvtx.cuh>
+#include <cub/device/dispatch/dispatch_transform.cuh>
+#include <cub/util_namespace.cuh>
+
+#include <cuda/std/tuple>
+
+CUB_NAMESPACE_BEGIN
+
+//! DeviceTransform provides device-wide, parallel operations for transforming elements tuple-wise from multiple input
+//! sequences into an output sequence.
+struct DeviceTransform
+{
+  //! @rst
+  //! Overview
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //! Transforms many input sequences into one output sequence, by applying a transformation operation on corresponding
+  //! input elements and writing the result to the corresponding output element. No guarantee is given on the identity
+  //! (i.e. address) of the objects passed to the call operator of the transformation operation.
+  //!
+  //! A Simple Example
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_transform_api.cu
+  //!    :language: c++
+  //!    :dedent:
+  //!    :start-after: example-begin transform-many
+  //!    :end-before: example-end transform-many
+  //!
+  //! @endrst
+  //!
+  //! @param inputs A tuple of iterators to the input sequences where num_items elements are read from each. The
+  //! iterators' value types must be trivially relocatable.
+  //! @param output An iterator to the output sequence where num_items results are written to.
+  //! @param num_items The number of elements in each input sequence.
+  //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
+  //! types must be convertible to the parameters of the function object's call operator. The return type of the call
+  //! operator must be assignable to the dereferenced output iterator.
+  //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
+  template <typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename TransformOp>
+  CUB_RUNTIME_FUNCTION static cudaError_t Transform(
+    ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
+    RandomAccessIteratorOut output,
+    int num_items,
+    TransformOp transform_op,
+    cudaStream_t stream = nullptr)
+  {
+    CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Transform");
+    return detail::transform::
+      dispatch_t<false, int, ::cuda::std::tuple<RandomAccessIteratorsIn...>, RandomAccessIteratorOut, TransformOp>::
+        dispatch(
+          ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream);
+  }
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+  // This overload has additional parameters to specify temporary storage. Provided for compatibility with other CUB
+  // APIs.
+  template <typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename TransformOp>
+  CUB_RUNTIME_FUNCTION static cudaError_t Transform(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
+    RandomAccessIteratorOut output,
+    int num_items,
+    TransformOp transform_op,
+    cudaStream_t stream = nullptr)
+  {
+    if (d_temp_storage == nullptr)
+    {
+      temp_storage_bytes = 1;
+      return cudaSuccess;
+    }
+
+    return Transform(
+      ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream);
+  }
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+  //! @rst
+  //! Transforms one input sequence into one output sequence, by applying a transformation operation on corresponding
+  //! input elements and writing the result to the corresponding output element. No guarantee is given on the identity
+  //! (i.e. address) of the objects passed to the call operator of the transformation operation.
+  //! @endrst
+  //!
+  //! @param input An iterator to the input sequence where num_items elements are read from. The iterator's value type
+  //! must be trivially relocatable.
+  //! @param output An iterator to the output sequence where num_items results are written to.
+  //! @param num_items The number of elements in each input sequence.
+  //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
+  //! types must be convertible to the parameters of the function object's call operator. The return type of the call
+  //! operator must be assignable to the dereferenced output iterator.
+  //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
+  template <typename RandomAccessIteratorIn, typename RandomAccessIteratorOut, typename TransformOp>
+  CUB_RUNTIME_FUNCTION static cudaError_t Transform(
+    RandomAccessIteratorIn input,
+    RandomAccessIteratorOut output,
+    int num_items,
+    TransformOp transform_op,
+    cudaStream_t stream = nullptr)
+  {
+    return Transform(
+      ::cuda::std::make_tuple(::cuda::std::move(input)),
+      ::cuda::std::move(output),
+      num_items,
+      ::cuda::std::move(transform_op),
+      stream);
+  }
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+  // This overload has additional parameters to specify temporary storage. Provided for compatibility with other CUB
+  // APIs.
+  template <typename RandomAccessIteratorIn, typename RandomAccessIteratorOut, typename TransformOp>
+  CUB_RUNTIME_FUNCTION static cudaError_t Transform(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    RandomAccessIteratorIn input,
+    RandomAccessIteratorOut output,
+    int num_items,
+    TransformOp transform_op,
+    cudaStream_t stream = nullptr)
+  {
+    if (d_temp_storage == nullptr)
+    {
+      temp_storage_bytes = 1;
+      return cudaSuccess;
+    }
+
+    return Transform(
+      ::cuda::std::make_tuple(::cuda::std::move(input)),
+      ::cuda::std::move(output),
+      num_items,
+      ::cuda::std::move(transform_op),
+      stream);
+  }
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+  //! @rst
+  //! Overview
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //! Transforms many input sequences into one output sequence, by applying a transformation operation on corresponding
+  //! input elements and writing the result to the corresponding output element. The objects passed to the call operator
+  //! of the transformation operation are guaranteed to reside in the input sequences and are never copied.
+  //!
+  //! A Simple Example
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_transform_api.cu
+  //!    :language: c++
+  //!    :dedent:
+  //!    :start-after: example-begin transform-many-stable
+  //!    :end-before: example-end transform-many-stable
+  //!
+  //! @endrst
+  //!
+  //! @param inputs A tuple of iterators to the input sequences where num_items elements are read from each. The
+  //! iterators' value types must be trivially relocatable.
+  //! @param output An iterator to the output sequence where num_items results are written to.
+  //! @param num_items The number of elements in each input sequence.
+  //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
+  //! types must be convertible to the parameters of the function object's call operator. The return type of the call
+  //! operator must be assignable to the dereferenced output iterator.
+  //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
+  template <typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename TransformOp>
+  CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses(
+    ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
+    RandomAccessIteratorOut output,
+    int num_items,
+    TransformOp transform_op,
+    cudaStream_t stream = nullptr)
+  {
+    CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceTransform::TransformStableArgumentAddresses");
+    return detail::transform::
+      dispatch_t<true, int, ::cuda::std::tuple<RandomAccessIteratorsIn...>, RandomAccessIteratorOut, TransformOp>::
+        dispatch(
+          ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream);
+  }
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+  template <typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename TransformOp>
+  CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
+    RandomAccessIteratorOut output,
+    int num_items,
+    TransformOp transform_op,
+    cudaStream_t stream = nullptr)
+  {
+    if (d_temp_storage == nullptr)
+    {
+      temp_storage_bytes = 1;
+      return cudaSuccess;
+    }
+
+    return TransformStableArgumentAddresses(
+      ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream);
+  }
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+  //! @rst
+  //! Transforms one input sequence into one output sequence, by applying a transformation operation on corresponding
+  //! input elements and writing the result to the corresponding output element. The objects passed to the call operator
+  //! of the transformation operation are guaranteed to reside in the input sequences and are never copied.
+  //! @endrst
+  //!
+  //! @param input An iterator to the input sequence where num_items elements are read from. The iterator's value type
+  //! must be trivially relocatable.
+  //! @param output An iterator to the output sequence where num_items results are written to.
+  //! @param num_items The number of elements in each input sequence.
+  //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
+  //! types must be convertible to the parameters of the function object's call operator. The return type of the call
+  //! operator must be assignable to the dereferenced output iterator.
+  //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
+  template <typename RandomAccessIteratorIn, typename RandomAccessIteratorOut, typename TransformOp>
+  CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses(
+    RandomAccessIteratorIn input,
+    RandomAccessIteratorOut output,
+    int num_items,
+    TransformOp transform_op,
+    cudaStream_t stream = nullptr)
+  {
+    return TransformStableArgumentAddresses(
+      ::cuda::std::make_tuple(::cuda::std::move(input)),
+      ::cuda::std::move(output),
+      num_items,
+      ::cuda::std::move(transform_op),
+      stream);
+  }
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+  template <typename RandomAccessIteratorIn, typename RandomAccessIteratorOut, typename TransformOp>
+  CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    RandomAccessIteratorIn input,
+    RandomAccessIteratorOut output,
+    int num_items,
+    TransformOp transform_op,
+    cudaStream_t stream = nullptr)
+  {
+    if (d_temp_storage == nullptr)
+    {
+      temp_storage_bytes = 1;
+      return cudaSuccess;
+    }
+
+    return TransformStableArgumentAddresses(
+      ::cuda::std::make_tuple(::cuda::std::move(input)),
+      ::cuda::std::move(output),
+      num_items,
+      ::cuda::std::move(transform_op),
+      stream);
+  }
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+};
+
+CUB_NAMESPACE_END
diff --git a/cub/cub/device/dispatch/dispatch_for.cuh b/cub/cub/device/dispatch/dispatch_for.cuh
index 7a657d6703..4af6864b03 100644
--- a/cub/cub/device/dispatch/dispatch_for.cuh
+++ b/cub/cub/device/dispatch/dispatch_for.cuh
@@ -38,6 +38,7 @@
 #endif // no system header

 #include <cub/agent/agent_for.cuh>
+#include <cub/device/dispatch/kernels/for_each.cuh>
 #include <cub/device/dispatch/tuning/tuning_for.cuh>
 #include <cub/util_device.cuh>
 #include <cub/util_namespace.cuh>
@@ -56,106 +57,6 @@ namespace detail
 namespace for_each
 {

-template <typename T>
-struct first_parameter
-{
-  using type = void;
-};
-
-template <typename C, typename R, typename A>
-struct first_parameter<R (C::*)(A)>
-{
-  using type = A;
-};
-
-template <typename C, typename R, typename A>
-struct first_parameter<R (C::*)(A) const>
-{
-  using type = A;
-};
-
-template <typename T>
-using first_parameter_t = typename first_parameter<decltype(&T::operator())>::type;
-
-template <typename Value, typename Fn, typename = void>
-struct has_unique_value_overload : ::cuda::std::false_type
-{};
-
-// clang-format off
-template <class Value, class Fn>
-struct has_unique_value_overload<
-  Value,
-  Fn,
-  typename ::cuda::std::enable_if<
-    !::cuda::std::is_reference<first_parameter_t<Fn>>::value &&
-    ::cuda::std::is_convertible<Value, first_parameter_t<Fn>>::value>::type>
-  : ::cuda::std::true_type
-{};
-
-// For trivial types, foreach is not allowed to copy values, even if those are trivially copyable.
-// This can be observable if the unary operator takes parameter by reference and modifies it or uses address.
-// The trait below checks if the freedom to copy trivial types can be regained.
-template <typename Value, typename Fn>
-using can_regain_copy_freedom =
-  ::cuda::std::integral_constant<
-    bool,
-    ::cuda::std::is_trivially_constructible<Value>::value &&
-    ::cuda::std::is_trivially_copy_assignable<Value>::value &&
-    ::cuda::std::is_trivially_move_assignable<Value>::value &&
-    ::cuda::std::is_trivially_destructible<Value>::value &&
-    has_unique_value_overload<Value, Fn>::value>;
-// clang-format on
-
-// This kernel is used when the block size is not known at compile time
-template <typename ChainedPolicyT, typename OffsetT, typename OpT>
-CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(OffsetT num_items, OpT op)
-{
-  using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t;
-  using agent_t         = agent_block_striped_t<active_policy_t, OffsetT, OpT>;
-
-  const auto block_threads  = static_cast<OffsetT>(blockDim.x);
-  const auto items_per_tile = active_policy_t::items_per_thread * block_threads;
-  const auto tile_base      = static_cast<OffsetT>(blockIdx.x) * items_per_tile;
-  const auto num_remaining  = num_items - tile_base;
-  const auto items_in_tile  = static_cast<OffsetT>(num_remaining < items_per_tile ? num_remaining : items_per_tile);
-
-  if (items_in_tile == items_per_tile)
-  {
-    agent_t{tile_base, op}.template consume_tile<true>(items_per_tile, block_threads);
-  }
-  else
-  {
-    agent_t{tile_base, op}.template consume_tile<false>(items_in_tile, block_threads);
-  }
-}
-
-// This kernel is used when the block size is known at compile time
-template <typename ChainedPolicyT, typename OffsetT, typename OpT>
-CUB_DETAIL_KERNEL_ATTRIBUTES //
-__launch_bounds__(ChainedPolicyT::ActivePolicy::for_policy_t::block_threads) //
-  void static_kernel(OffsetT num_items, OpT op)
-{
-  using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t;
-  using agent_t         = agent_block_striped_t<active_policy_t, OffsetT, OpT>;
-
-  constexpr auto block_threads  = active_policy_t::block_threads;
-  constexpr auto items_per_tile = active_policy_t::items_per_thread * block_threads;
-
-  const auto tile_base     = static_cast<OffsetT>(blockIdx.x) * items_per_tile;
-  const auto num_remaining = num_items - tile_base;
-  const auto items_in_tile = static_cast<OffsetT>(num_remaining < items_per_tile ? num_remaining : items_per_tile);
-
-  if (items_in_tile == items_per_tile)
-  {
-    agent_t{tile_base, op}.template consume_tile<true>(items_per_tile, block_threads);
-  }
-  else
-  {
-    agent_t{tile_base, op}.template consume_tile<false>(items_in_tile, block_threads);
-  }
-}
-
 // The dispatch layer is in the detail namespace until we figure out tuning API
 template <typename OffsetT, typename OpT, typename PolicyHubT>
 struct dispatch_t : PolicyHubT
diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh
index aa8cc2f5c0..1839385b19 100644
--- a/cub/cub/device/dispatch/dispatch_histogram.cuh
+++ b/cub/cub/device/dispatch/dispatch_histogram.cuh
@@ -238,12 +238,12 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentHistogramPolicyT::BLOCK
   AgentHistogramT agent(
     temp_storage,
     d_samples,
-    num_output_bins_wrapper.__elems_,
-    num_privatized_bins_wrapper.__elems_,
-    d_output_histograms_wrapper.__elems_,
-    d_privatized_histograms_wrapper.__elems_,
-    output_decode_op_wrapper.__elems_,
-    privatized_decode_op_wrapper.__elems_);
+    num_output_bins_wrapper.data(),
+    num_privatized_bins_wrapper.data(),
+    d_output_histograms_wrapper.data(),
+    d_privatized_histograms_wrapper.data(),
+    output_decode_op_wrapper.data(),
+    privatized_decode_op_wrapper.data());

   // Initialize counters
   agent.InitBinCounters();
@@ -847,7 +847,7 @@ public:
   {
     // GCC 14 rightfully warns that when a value-initialized array of this struct is copied using memcpy, uninitialized
    // bytes may be accessed. To avoid this, we add a dummy member, so value initialization actually initializes the memory.
-#if defined(_CCCL_COMPILER_GCC) && __GNUC__ == 14 +#if defined(_CCCL_COMPILER_GCC) && __GNUC__ >= 13 char dummy; #endif diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index e3e3844a3f..ba5365c618 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -316,7 +316,7 @@ template >, - typename AccumT = detail::accumulator_t>, + typename AccumT = ::cuda::std::__accumulator_t, InitT>, typename SelectedPolicy = DeviceReducePolicy, typename TransformOpT = ::cuda::std::__identity> struct DispatchReduce : SelectedPolicy @@ -797,17 +797,16 @@ struct DispatchReduce : SelectedPolicy * @tparam InitT * Initial value type */ -template >>, - typename SelectedPolicyT = DeviceReducePolicy> +template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetT, + typename ReductionOpT, + typename TransformOpT, + typename InitT, + typename AccumT = ::cuda::std:: + __accumulator_t>, InitT>, + typename SelectedPolicyT = DeviceReducePolicy> using DispatchTransformReduce = DispatchReduce; @@ -850,7 +849,7 @@ template >, - typename AccumT = detail::accumulator_t>, + typename AccumT = ::cuda::std::__accumulator_t, InitT>, typename SelectedPolicy = DeviceReducePolicy> struct DispatchSegmentedReduce : SelectedPolicy { diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index 07dd492a53..8ae232e8d1 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -213,25 +213,25 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_TH * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. 
*/ -template < - typename KeysInputIteratorT, - typename UniqueOutputIteratorT, - typename ValuesInputIteratorT, - typename AggregatesOutputIteratorT, - typename NumRunsOutputIteratorT, - typename EqualityOpT, - typename ReductionOpT, - typename OffsetT, - typename AccumT = // - detail:: - accumulator_t, cub::detail::value_t>, - typename SelectedPolicy = // - detail::device_reduce_by_key_policy_hub< // - ReductionOpT, // - AccumT, // - cub::detail::non_void_value_t< // - UniqueOutputIteratorT, // - cub::detail::value_t>>> +template , + cub::detail::value_t>, + typename SelectedPolicy = // + detail::device_reduce_by_key_policy_hub< // + ReductionOpT, // + AccumT, // + cub::detail::non_void_value_t< // + UniqueOutputIteratorT, // + cub::detail::value_t>>> struct DispatchReduceByKey { //------------------------------------------------------------------------- diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 56c2be9611..7d2fc4ac17 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -234,11 +234,11 @@ template ::value, - cub::detail::value_t, - typename InitValueT::value_type>, - cub::detail::value_t>, + typename AccumT = ::cuda::std::__accumulator_t, + ::cuda::std::_If::value, + cub::detail::value_t, + typename InitValueT::value_type>>, typename SelectedPolicy = DeviceScanPolicy, bool ForceInclusive = false> struct DispatchScan : SelectedPolicy diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh index 032554773a..5dfffa5e77 100644 --- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -228,10 +228,10 @@ template < typename ScanOpT, typename InitValueT, typename OffsetT, - typename AccumT = detail::accumulator_t< + typename AccumT = ::cuda::std::__accumulator_t< ScanOpT, - ::cuda::std::_If::value, cub::detail::value_t, InitValueT>, - cub::detail::value_t>, + cub::detail::value_t, + ::cuda::std::_If::value, cub::detail::value_t, InitValueT>>, typename SelectedPolicy = DeviceScanByKeyPolicy, ScanOpT>> struct DispatchScanByKey : SelectedPolicy diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh new file mode 100644 index 0000000000..8fb596da07 --- /dev/null +++ b/cub/cub/device/dispatch/dispatch_transform.cuh @@ -0,0 +1,866 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if defined(_CCCL_CUDA_COMPILER) && _CCCL_CUDACC_VER < 1105000 +_CCCL_NV_DIAG_SUPPRESS(186) +# include +// we cannot re-enable the warning here, because it is triggered outside the translation unit +// see also: https://godbolt.org/z/1x8b4hn3G +#endif // defined(_CCCL_CUDA_COMPILER) && _CCCL_CUDACC_VER < 1105000 + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +CUB_NAMESPACE_BEGIN + +// the ublkcp kernel needs PTX features that are only available and understood by CTK 12 and later +#if _CCCL_CUDACC_VER_MAJOR >= 12 +# define _CUB_HAS_TRANSFORM_UBLKCP +#endif // _CCCL_CUDACC_VER_MAJOR >= 12 + +namespace detail +{ +namespace transform +{ +_CCCL_HOST_DEVICE constexpr int sum() +{ + return 0; +} + +// TODO(bgruber): remove with C++17 +template +_CCCL_HOST_DEVICE constexpr int sum(int head, Ts... tail) +{ + return head + sum(tail...); +} + +#if _CCCL_STD_VER >= 2017 +template +_CCCL_HOST_DEVICE constexpr auto loaded_bytes_per_iteration() -> int +{ + return (int{sizeof(value_t)} + ... + 0); +} +#else // ^^^ C++17 ^^^ / vvv C++11 vvv +template +_CCCL_HOST_DEVICE constexpr auto loaded_bytes_per_iteration() -> int +{ + return sum(int{sizeof(value_t)}...); +} +#endif // _CCCL_STD_VER >= 2017 + +enum class Algorithm +{ + fallback_for, +#ifdef _CUB_HAS_TRANSFORM_UBLKCP + ublkcp, +#endif // _CUB_HAS_TRANSFORM_UBLKCP +}; + +// this kernel replicates the behavior of cub::DeviceFor::Bulk +template +_CCCL_DEVICE void transform_kernel_impl( + ::cuda::std::integral_constant, + Offset num_items, + int /* items_per_thread */, + F transform_op, + RandomAccessIteratorOut out, + RandomAccessIteratorsIn... ins) +{ + auto op = [&](Offset i) { + out[i] = transform_op(ins[i]...); + }; + using OpT = decltype(op); + + // TODO(bgruber): verbatim copy from for_each's static_kernel below: + using agent_t = for_each::agent_block_striped_t; + + constexpr auto block_threads = ForPolicy::block_threads; + constexpr auto items_per_tile = ForPolicy::items_per_thread * block_threads; + + const auto tile_base = static_cast(blockIdx.x) * items_per_tile; + const auto num_remaining = num_items - tile_base; + const auto items_in_tile = static_cast(num_remaining < items_per_tile ? num_remaining : items_per_tile); + + if (items_in_tile == items_per_tile) + { + agent_t{tile_base, op}.template consume_tile(items_per_tile, block_threads); + } + else + { + agent_t{tile_base, op}.template consume_tile(items_in_tile, block_threads); + } +} + +template +struct async_copy_policy_t +{ + static constexpr int block_threads = BlockThreads; + // items per tile are determined at runtime. these (inclusive) bounds allow overriding that value via a tuning policy + static constexpr int min_items_per_thread = 1; + static constexpr int max_items_per_thread = 32; +}; + +// TODO(bgruber) cheap copy of ::cuda::std::apply, which requires C++17. 
+template +_CCCL_DEVICE _CCCL_FORCEINLINE auto poor_apply_impl(F&& f, Tuple&& t, ::cuda::std::index_sequence) + -> decltype(::cuda::std::forward(f)(::cuda::std::get(::cuda::std::forward(t))...)) +{ + return ::cuda::std::forward(f)(::cuda::std::get(::cuda::std::forward(t))...); +} + +template +_CCCL_DEVICE _CCCL_FORCEINLINE auto poor_apply(F&& f, Tuple&& t) + -> decltype(poor_apply_impl( + ::cuda::std::forward(f), + ::cuda::std::forward(t), + ::cuda::std::make_index_sequence<::cuda::std::tuple_size<::cuda::std::__libcpp_remove_reference_t>::value>{})) +{ + return poor_apply_impl( + ::cuda::std::forward(f), + ::cuda::std::forward(t), + ::cuda::std::make_index_sequence<::cuda::std::tuple_size<::cuda::std::__libcpp_remove_reference_t>::value>{}); +} + +// mult must be a power of 2 +template +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr auto round_up_to_po2_multiple(Integral x, Integral mult) -> Integral +{ +#if _CCCL_STD_VER > 2011 + _LIBCUDACXX_ASSERT(::cuda::std::has_single_bit(static_cast<::cuda::std::__make_unsigned_t>(mult)), ""); +#endif // _CCCL_STD_VER > 2011 + return (x + mult - 1) & ~(mult - 1); +} + +template +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE const char* round_down_ptr(const T* ptr, unsigned alignment) +{ +#if _CCCL_STD_VER > 2011 + _LIBCUDACXX_ASSERT(::cuda::std::has_single_bit(alignment), ""); +#endif // _CCCL_STD_VER > 2011 + return reinterpret_cast( + reinterpret_cast<::cuda::std::uintptr_t>(ptr) & ~::cuda::std::uintptr_t{alignment - 1}); +} + +// Implementation notes on memcpy_async and UBLKCP kernels regarding copy alignment and padding +// +// For performance considerations of memcpy_async: +// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#performance-guidance-for-memcpy-async +// +// We basically have to align the base pointer to 16 bytes, and copy a multiple of 16 bytes. To achieve this, when we +// copy a tile of data from an input buffer, we round down the pointer to the start of the tile to the next lower +// address that is a multiple of 16 bytes. This introduces head padding. We also round up the total number of bytes to +// copy (including head padding) to a multiple of 16 bytes, which introduces tail padding. For the bulk copy kernel, we +// have to align to 128 bytes instead of 16. +// +// However, padding memory copies like that may access the input buffer out-of-bounds. Here are some thoughts: +// * According to the CUDA programming guide +// (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses), "any address of a variable +// residing in global memory or returned by one of the memory allocation routines from the driver or runtime API is +// always aligned to at least 256 bytes." +// * Memory protection is usually done on memory page level, which is even larger than 256 bytes for CUDA and 4KiB on +// Intel x86 and 4KiB+ ARM. Front and tail padding thus never leaves the memory page of the input buffer. +// * This should count for device memory, but also for device accessible memory living on the host. +// * The base pointer alignment and size rounding also never leaves the size of a cache line. +// +// Copying larger data blocks with head and tail padding should thus be legal. Nevertheless, an out-of-bounds read is +// still technically undefined behavior in C++. Also, compute-sanitizer flags at least such reads after the end of a +// buffer. Therefore, we lean on the safer side and protect against out of bounds reads at the beginning and end. 
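+// A worked example of the padding math above (illustrative values, not part of
+// the implementation): for a tile of 10 int3 elements (12 bytes each) whose first
+// element sits 4 bytes past a 16-byte boundary, round_down_ptr yields
+// head_padding == 4 and the copy covers
+// round_up_to_po2_multiple(4 + 12 * 10, 16) == 128 bytes: 4 bytes of head padding
+// and 4 bytes of tail padding around the 120 payload bytes.
+static_assert(round_up_to_po2_multiple(4 + 12 * 10, 16) == 128, "worked example from the notes above");
+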
+ +// A note on size and alignment: The size of a type is at least as large as its alignment. We rely on this fact in some +// conditions. +// This is guaranteed by the C++ standard, and follows from the definition of arrays: the difference between neighboring +// array element addresses is sizeof element type and each array element needs to fulfill the alignment requirement of +// the element type. + +// Pointer with metadata to describe readonly input memory for memcpy_async and UBLKCP kernels. +// cg::memcpy_async is most efficient when the data is 16-byte aligned and the size a multiple of 16 bytes +// UBLKCP is most efficient when the data is 128-byte aligned and the size a multiple of 16 bytes +template // Cannot add alignment to signature, because we need a uniform kernel template instantiation +struct aligned_base_ptr +{ + using value_type = T; + + const char* ptr; // aligned pointer before the original pointer (16-byte or 128-byte). May not be aligned to + // alignof(T). E.g.: array of int3 starting at address 4, ptr == 0 + int head_padding; // byte offset between ptr and the original pointer. Value inside [0;15] or [0;127]. + + _CCCL_HOST_DEVICE const T* ptr_to_elements() const + { + return reinterpret_cast(ptr + head_padding); + } + + _CCCL_HOST_DEVICE friend bool operator==(const aligned_base_ptr& a, const aligned_base_ptr& b) + { + return a.ptr == b.ptr && a.head_padding == b.head_padding; + } +}; + +template +_CCCL_HOST_DEVICE auto make_aligned_base_ptr(const T* ptr, int alignment) -> aligned_base_ptr +{ + const char* base_ptr = round_down_ptr(ptr, alignment); + return aligned_base_ptr{base_ptr, static_cast(reinterpret_cast(ptr) - base_ptr)}; +} + +constexpr int bulk_copy_alignment = 128; +constexpr int bulk_copy_size_multiple = 16; + +#ifdef _CUB_HAS_TRANSFORM_UBLKCP +_CCCL_DEVICE _CCCL_FORCEINLINE static bool elect_one() +{ + const ::cuda::std::uint32_t membermask = ~0; + ::cuda::std::uint32_t is_elected; + asm volatile( + "{\n\t .reg .pred P_OUT; \n\t" + "elect.sync _|P_OUT, %1;\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(is_elected) + : "r"(membermask) + :); + return threadIdx.x < 32 && static_cast(is_elected); +} + +// TODO(bgruber): inline this as lambda in C++14 +template +_CCCL_DEVICE void bulk_copy_tile( + ::cuda::std::uint64_t& bar, + int tile_stride, + char* smem, + int& smem_offset, + ::cuda::std::uint32_t& total_bytes_bulk_copied, + Offset global_offset, + const aligned_base_ptr& aligned_ptr) +{ + static_assert(alignof(T) <= bulk_copy_alignment, ""); + + const char* src = aligned_ptr.ptr + global_offset * sizeof(T); + char* dst = smem + smem_offset; + _LIBCUDACXX_ASSERT(reinterpret_cast(src) % bulk_copy_alignment == 0, ""); + _LIBCUDACXX_ASSERT(reinterpret_cast(dst) % bulk_copy_alignment == 0, ""); + + // TODO(bgruber): we could precompute bytes_to_copy on the host + const int bytes_to_copy = round_up_to_po2_multiple( + aligned_ptr.head_padding + static_cast(sizeof(T)) * tile_stride, bulk_copy_size_multiple); + + ::cuda::ptx::cp_async_bulk(::cuda::ptx::space_cluster, ::cuda::ptx::space_global, dst, src, bytes_to_copy, &bar); + total_bytes_bulk_copied += bytes_to_copy; + + // add bulk_copy_alignment to make space for the next tile's head padding + smem_offset += static_cast(sizeof(T)) * tile_stride + bulk_copy_alignment; +} + +template +_CCCL_DEVICE void bulk_copy_tile_fallback( + int tile_size, + int tile_stride, + char* smem, + int& smem_offset, + Offset global_offset, + const aligned_base_ptr& aligned_ptr) +{ + const T* src = 
aligned_ptr.ptr_to_elements() + global_offset; + T* dst = reinterpret_cast(smem + smem_offset + aligned_ptr.head_padding); + _LIBCUDACXX_ASSERT(reinterpret_cast(src) % alignof(T) == 0, ""); + _LIBCUDACXX_ASSERT(reinterpret_cast(dst) % alignof(T) == 0, ""); + + const int bytes_to_copy = static_cast(sizeof(T)) * tile_size; + cooperative_groups::memcpy_async(cooperative_groups::this_thread_block(), dst, src, bytes_to_copy); + + // add bulk_copy_alignment to make space for the next tile's head padding + smem_offset += static_cast(sizeof(T)) * tile_stride + bulk_copy_alignment; +} + +// TODO(bgruber): inline this as lambda in C++14 +template +_CCCL_DEVICE _CCCL_FORCEINLINE const T& +fetch_operand(int tile_stride, const char* smem, int& smem_offset, int smem_idx, const aligned_base_ptr& aligned_ptr) +{ + const T* smem_operand_tile_base = reinterpret_cast(smem + smem_offset + aligned_ptr.head_padding); + smem_offset += int{sizeof(T)} * tile_stride + bulk_copy_alignment; + return smem_operand_tile_base[smem_idx]; +} + +template +_CCCL_DEVICE void transform_kernel_ublkcp( + Offset num_items, int num_elem_per_thread, F f, RandomAccessIteratorOut out, aligned_base_ptr... aligned_ptrs) +{ + __shared__ uint64_t bar; + extern __shared__ char __align__(bulk_copy_alignment) smem[]; + + namespace ptx = ::cuda::ptx; + + constexpr int block_dim = BulkCopyPolicy::block_threads; + const int tile_stride = block_dim * num_elem_per_thread; + const Offset offset = static_cast(blockIdx.x) * tile_stride; + const int tile_size = ::cuda::std::min(num_items - offset, Offset{tile_stride}); + + const bool inner_blocks = 0 < blockIdx.x && blockIdx.x + 2 < gridDim.x; + if (inner_blocks) + { + // use one thread to setup the entire bulk copy + if (elect_one()) + { + ptx::mbarrier_init(&bar, 1); + ptx::fence_proxy_async(ptx::space_shared); + + int smem_offset = 0; + ::cuda::std::uint32_t total_copied = 0; + + // TODO(bgruber): use a fold over comma in C++17 + // Order of evaluation is left-to-right + int dummy[] = {(bulk_copy_tile(bar, tile_stride, smem, smem_offset, total_copied, offset, aligned_ptrs), 0)..., + 0}; + (void) dummy; + + // TODO(ahendriksen): this could only have ptx::sem_relaxed, but this is not available yet + ptx::mbarrier_arrive_expect_tx(ptx::sem_release, ptx::scope_cta, ptx::space_shared, &bar, total_copied); + } + + // all threads wait for bulk copy + __syncthreads(); + while (!ptx::mbarrier_try_wait_parity(&bar, 0)) + ; + } + else + { + // use all threads to schedule an async_memcpy + int smem_offset = 0; + + // TODO(bgruber): use a fold over comma in C++17 + // Order of evaluation is left-to-right + int dummy[] = {(bulk_copy_tile_fallback(tile_size, tile_stride, smem, smem_offset, offset, aligned_ptrs), 0)..., 0}; + (void) dummy; + + cooperative_groups::wait(cooperative_groups::this_thread_block()); + } + + // move the whole index and iterator to the block/thread index, to reduce arithmetic in the loops below + out += offset; + + // note: I tried expressing the UBLKCP_AGENT as a function object but it adds a lot of code to handle the variadics + // TODO(bgruber): use a polymorphic lambda in C++14 +# define UBLKCP_AGENT(full_tile) \ + _Pragma("unroll 1") /* Unroll 1 tends to improve performance, especially for smaller data types (confirmed by \ + benchmark) */ \ + for (int j = 0; j < num_elem_per_thread; ++j) \ + { \ + const int idx = j * block_dim + threadIdx.x; \ + if (full_tile || idx < tile_size) \ + { \ + int smem_offset = 0; \ + /* need to expand into a tuple for guaranteed order of evaluation*/ \ 
+ out[idx] = poor_apply( \ + [&](const InTs&... values) { \ + return f(values...); \ + }, \ + ::cuda::std::tuple{fetch_operand(tile_stride, smem, smem_offset, idx, aligned_ptrs)...}); \ + } \ + } + if (tile_stride == tile_size) + { + UBLKCP_AGENT(true); + } + else + { + UBLKCP_AGENT(false); + } +# undef UBLKCP_AGENT +} + +template +_CCCL_DEVICE void transform_kernel_impl( + ::cuda::std::integral_constant, + Offset num_items, + int num_elem_per_thread, + F f, + RandomAccessIteratorOut out, + aligned_base_ptr... aligned_ptrs) +{ + // only call the real kernel for sm90 and later + NV_IF_TARGET(NV_PROVIDES_SM_90, + (transform_kernel_ublkcp(num_items, num_elem_per_thread, f, out, aligned_ptrs...);)); +} +#endif // _CUB_HAS_TRANSFORM_UBLKCP + +template +union kernel_arg +{ + aligned_base_ptr> aligned_ptr; + It iterator; + + _CCCL_HOST_DEVICE kernel_arg() {} // in case It is not default-constructible +}; + +template +_CCCL_HOST_DEVICE auto make_iterator_kernel_arg(It it) -> kernel_arg +{ + kernel_arg arg; + arg.iterator = it; + return arg; +} + +template +_CCCL_HOST_DEVICE auto make_aligned_base_ptr_kernel_arg(It ptr, int alignment) -> kernel_arg +{ + kernel_arg arg; + arg.aligned_ptr = make_aligned_base_ptr(ptr, alignment); + return arg; +} + +// TODO(bgruber): make a variable template in C++14 +template +using needs_aligned_ptr_t = + ::cuda::std::bool_constant; + +#ifdef _CUB_HAS_TRANSFORM_UBLKCP +template ::value, int> = 0> +_CCCL_DEVICE _CCCL_FORCEINLINE auto select_kernel_arg( + ::cuda::std::integral_constant, kernel_arg&& arg) -> aligned_base_ptr>&& +{ + return ::cuda::std::move(arg.aligned_ptr); +} +#endif // _CUB_HAS_TRANSFORM_UBLKCP + +template ::value, int> = 0> +_CCCL_DEVICE _CCCL_FORCEINLINE auto +select_kernel_arg(::cuda::std::integral_constant, kernel_arg&& arg) -> It&& +{ + return ::cuda::std::move(arg.iterator); +} + +// There is only one kernel for all algorithms, that dispatches based on the selected policy. It must be instantiated +// with the same arguments for each algorithm. Only the device compiler will then select the implementation. This +// saves some compile-time and binary size. +template +__launch_bounds__(MaxPolicy::ActivePolicy::algo_policy::block_threads) + CUB_DETAIL_KERNEL_ATTRIBUTES void transform_kernel( + Offset num_items, + int num_elem_per_thread, + F f, + RandomAccessIteratorOut out, + kernel_arg... ins) +{ + constexpr auto alg = ::cuda::std::integral_constant{}; + transform_kernel_impl( + alg, + num_items, + num_elem_per_thread, + ::cuda::std::move(f), + ::cuda::std::move(out), + select_kernel_arg(alg, ::cuda::std::move(ins))...); +} + +constexpr int arch_to_min_bytes_in_flight(int sm_arch) +{ + // TODO(bgruber): use if-else in C++14 for better readability + return sm_arch >= 900 ? 48 * 1024 // 32 for H100, 48 for H200 + : sm_arch >= 800 ? 
16 * 1024 // A100
+ : 12 * 1024; // V100 and below
+}
+
+template
+_CCCL_HOST_DEVICE constexpr auto bulk_copy_smem_for_tile_size(int tile_size) -> int
+{
+ return round_up_to_po2_multiple(int{sizeof(int64_t)}, bulk_copy_alignment) /* bar */
+ // 128 bytes of padding for each input tile (handles before + after)
+ + tile_size * loaded_bytes_per_iteration()
+ + sizeof...(RandomAccessIteratorsIn) * bulk_copy_alignment;
+}
+
+using fallback_for_policy = for_each::policy_hub_t::policy_350_t::for_policy_t;
+
+template
+struct policy_hub
+{
+ static_assert(sizeof(RandomAccessIteratorTupleIn) == 0, "Second parameter must be a tuple");
+};
+
+template
+struct policy_hub>
+{
+ static constexpr bool no_input_streams = sizeof...(RandomAccessIteratorsIn) == 0;
+ static constexpr bool all_contiguous =
+ ::cuda::std::conjunction...>::value;
+ static constexpr bool all_values_trivially_reloc =
+ ::cuda::std::conjunction>...>::value;
+
+ static constexpr bool can_memcpy = all_contiguous && all_values_trivially_reloc;
+
+ // TODO(bgruber): consider a separate kernel for just filling
+
+ struct policy300 : ChainedPolicy<300, policy300, policy300>
+ {
+ static constexpr int min_bif = arch_to_min_bytes_in_flight(300);
+ // TODO(bgruber): we don't need algo, because we can just detect the type of algo_policy
+ static constexpr auto algorithm = Algorithm::fallback_for;
+ using algo_policy = fallback_for_policy;
+ };
+
+#ifdef _CUB_HAS_TRANSFORM_UBLKCP
+ // H100 and H200
+ struct policy900 : ChainedPolicy<900, policy900, policy300>
+ {
+ static constexpr int min_bif = arch_to_min_bytes_in_flight(900);
+ using async_policy = async_copy_policy_t<256>;
+ static constexpr bool exhaust_smem =
+ bulk_copy_smem_for_tile_size(
+ async_policy::block_threads * async_policy::min_items_per_thread)
+ > 48 * 1024;
+ static constexpr bool any_type_is_overaligned =
+# if _CCCL_STD_VER >= 2017
+ ((alignof(value_t) > bulk_copy_alignment) || ...);
+# else
+ sum((alignof(value_t) > bulk_copy_alignment)...) > 0;
+# endif
+
+ static constexpr bool use_fallback =
+ RequiresStableAddress || !can_memcpy || no_input_streams || exhaust_smem || any_type_is_overaligned;
+ static constexpr auto algorithm = use_fallback ? Algorithm::fallback_for : Algorithm::ublkcp;
+ using algo_policy = ::cuda::std::_If;
+ };
+
+ using max_policy = policy900;
+#else // _CUB_HAS_TRANSFORM_UBLKCP
+ using max_policy = policy300;
+#endif // _CUB_HAS_TRANSFORM_UBLKCP
+};
+
+// TODO(bgruber): replace by ::cuda::std::expected in C++14
+template
+struct PoorExpected
+{
+ alignas(T) char storage[sizeof(T)];
+ cudaError_t error;
+
+ _CCCL_HOST_DEVICE PoorExpected(T value)
+ : error(cudaSuccess)
+ {
+ new (storage) T(::cuda::std::move(value));
+ }
+
+ _CCCL_HOST_DEVICE PoorExpected(cudaError_t error)
+ : error(error)
+ {}
+
+ _CCCL_HOST_DEVICE explicit operator bool() const
+ {
+ return error == cudaSuccess;
+ }
+
+ _CCCL_HOST_DEVICE T& operator*()
+ {
+ _CCCL_DIAG_PUSH
+ _CCCL_DIAG_SUPPRESS_GCC("-Wstrict-aliasing")
+ return reinterpret_cast(storage);
+ _CCCL_DIAG_POP
+ }
+
+ _CCCL_HOST_DEVICE const T& operator*() const
+ {
+ _CCCL_DIAG_PUSH
+ _CCCL_DIAG_SUPPRESS_GCC("-Wstrict-aliasing")
+ return reinterpret_cast(storage);
+ _CCCL_DIAG_POP
+ }
+
+ _CCCL_HOST_DEVICE T* operator->()
+ {
+ return &**this;
+ }
+
+ _CCCL_HOST_DEVICE const T* operator->() const
+ {
+ return &**this;
+ }
+};
+
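+// Illustrative sketch of how PoorExpected is consumed (it mirrors the call sites
+// further below; some_query is a hypothetical producer, not part of this change):
+//
+//   PoorExpected<int> result = some_query();
+//   if (!result)
+//   {
+//     return result.error; // propagate the cudaError_t
+//   }
+//   const int value = *result; // dereference only after checking
+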
+// TODO(bgruber): this is very similar to thrust::cuda_cub::core::get_max_shared_memory_per_block. We should unify this.
+_CCCL_HOST_DEVICE inline PoorExpected get_max_shared_memory()
+{
+ // gevtushenko promised me that I can assume that the stream passed to the CUB API entry point (on which the kernels
+ // will later be launched) belongs to the currently active device. So we can just query the active device here.
+ int device = 0;
+ auto error = CubDebug(cudaGetDevice(&device));
+ if (error != cudaSuccess)
+ {
+ return error;
+ }
+
+ int max_smem = 0;
+ error = CubDebug(cudaDeviceGetAttribute(&max_smem, cudaDevAttrMaxSharedMemoryPerBlock, device));
+ if (error != cudaSuccess)
+ {
+ return error;
+ }
+
+ return max_smem;
+}
+
+struct elem_counts
+{
+ int elem_per_thread;
+ int tile_size;
+ int smem_size;
+};
+
+template >
+struct dispatch_t;
+
+template
+struct dispatch_t,
+ RandomAccessIteratorOut,
+ TransformOp,
+ PolicyHub>
+{
+ static_assert(::cuda::std::is_same::value
+ || ::cuda::std::is_same::value,
+ "cub::DeviceTransform is only tested and tuned for 32-bit or 64-bit signed offset types");
+
+ ::cuda::std::tuple in;
+ RandomAccessIteratorOut out;
+ Offset num_items;
+ TransformOp op;
+ cudaStream_t stream;
+
+#define CUB_DETAIL_TRANSFORM_KERNEL_PTR \
+ &transform_kernel...>
+
+ static constexpr int loaded_bytes_per_iter = loaded_bytes_per_iteration();
+
+#ifdef _CUB_HAS_TRANSFORM_UBLKCP
+ // TODO(bgruber): I want to write tests for this, but those highly depend on the architecture we are running on.
+ template
+ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto configure_ublkcp_kernel()
+ -> PoorExpected<
+ ::cuda::std::
+ tuple>
+ {
+ using policy_t = typename ActivePolicy::algo_policy;
+ constexpr int block_dim = policy_t::block_threads;
+ static_assert(block_dim % bulk_copy_alignment == 0,
+ "block_threads needs to be a multiple of bulk_copy_alignment (128)"); // then tile_size is a multiple
+ // of 128 bytes
+
+ auto determine_element_counts = [&]() -> PoorExpected {
+ const auto max_smem = get_max_shared_memory();
+ if (!max_smem)
+ {
+ return max_smem.error;
+ }
+
+ elem_counts last_counts{};
+ // Increase the number of output elements per thread until we reach the required bytes in flight.
+ static_assert(policy_t::min_items_per_thread <= policy_t::max_items_per_thread, ""); // ensures the loop below + // runs at least once + for (int elem_per_thread = +policy_t::min_items_per_thread; elem_per_thread < +policy_t::max_items_per_thread; + ++elem_per_thread) + { + const int tile_size = block_dim * elem_per_thread; + const int smem_size = bulk_copy_smem_for_tile_size(tile_size); + if (smem_size > *max_smem) + { +# ifdef CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS + // assert should be prevented by smem check in policy + assert(last_counts.elem_per_thread > 0 && "min_items_per_thread exceeds available shared memory"); +# endif // CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS + return last_counts; + } + + if (tile_size >= num_items) + { + return elem_counts{elem_per_thread, tile_size, smem_size}; + } + + int max_occupancy = 0; + const auto error = + CubDebug(MaxSmOccupancy(max_occupancy, CUB_DETAIL_TRANSFORM_KERNEL_PTR, block_dim, smem_size)); + if (error != cudaSuccess) + { + return error; + } + + const int bytes_in_flight_SM = max_occupancy * tile_size * loaded_bytes_per_iter; + if (ActivePolicy::min_bif <= bytes_in_flight_SM) + { + return elem_counts{elem_per_thread, tile_size, smem_size}; + } + + last_counts = elem_counts{elem_per_thread, tile_size, smem_size}; + } + return last_counts; + }; + PoorExpected config = [&]() { + NV_IF_TARGET( + NV_IS_HOST, + ( + // this static variable exists for each template instantiation of the surrounding function and class, on which + // the chosen element count solely depends (assuming max SMEM is constant during a program execution) + static auto cached_config = determine_element_counts(); return cached_config;), + ( + // we cannot cache the determined element count in device code + return determine_element_counts();)); + }(); + if (!config) + { + return config.error; + } +# ifdef CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS + assert(config->elem_per_thread > 0); + assert(config->tile_size > 0); + assert(config->tile_size % bulk_copy_alignment == 0); + assert((sizeof...(RandomAccessIteratorsIn) == 0) != (config->smem_size != 0)); // logical xor +# endif // CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS + + const auto grid_dim = static_cast(::cuda::ceil_div(num_items, Offset{config->tile_size})); + return ::cuda::std::make_tuple( + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid_dim, block_dim, config->smem_size, stream), + CUB_DETAIL_TRANSFORM_KERNEL_PTR, + config->elem_per_thread); + } + + template + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t + invoke_algorithm(cuda::std::index_sequence, ::cuda::std::integral_constant) + { + auto ret = configure_ublkcp_kernel(); + if (!ret) + { + return ret.error; + } + // TODO(bgruber): use a structured binding in C++17 + // auto [launcher, kernel, elem_per_thread] = *ret; + + return ::cuda::std::get<0>(*ret).doit( + ::cuda::std::get<1>(*ret), + num_items, + ::cuda::std::get<2>(*ret), + op, + out, + make_aligned_base_ptr_kernel_arg( + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(::cuda::std::get(in)), bulk_copy_alignment)...); + } +#endif // _CUB_HAS_TRANSFORM_UBLKCP + + template + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t + invoke_algorithm(cuda::std::index_sequence, ::cuda::std::integral_constant) + { + constexpr int block_threads = ActivePolicy::algo_policy::block_threads; + constexpr int items_per_thread = ActivePolicy::algo_policy::items_per_thread; + constexpr int tile_size = block_threads * items_per_thread; + const auto grid_dim = static_cast(::cuda::ceil_div(num_items, 
Offset{tile_size})); + return CubDebug( + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid_dim, block_threads, 0, stream) + .doit( + CUB_DETAIL_TRANSFORM_KERNEL_PTR, + num_items, + items_per_thread, + op, + out, + make_iterator_kernel_arg(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(::cuda::std::get(in)))...)); + } + + template + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() + { + // // TODO(bgruber): replace the overload set by if constexpr in C++17 + return invoke_algorithm(::cuda::std::index_sequence_for{}, + ::cuda::std::integral_constant{}); + } + + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( + ::cuda::std::tuple in, + RandomAccessIteratorOut out, + Offset num_items, + TransformOp op, + cudaStream_t stream) + { + if (num_items == 0) + { + return cudaSuccess; + } + + int ptx_version = 0; + auto error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) + { + return error; + } + + dispatch_t dispatch{::cuda::std::move(in), ::cuda::std::move(out), num_items, ::cuda::std::move(op), stream}; + return CubDebug(PolicyHub::max_policy::Invoke(ptx_version, dispatch)); + } + +#undef CUB_DETAIL_TRANSFORM_KERNEL_PTR +}; +} // namespace transform +} // namespace detail +CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/kernels/for_each.cuh b/cub/cub/device/dispatch/kernels/for_each.cuh new file mode 100644 index 0000000000..2213252d2f --- /dev/null +++ b/cub/cub/device/dispatch/kernels/for_each.cuh @@ -0,0 +1,154 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include + +CUB_NAMESPACE_BEGIN + +namespace detail +{ +namespace for_each +{ + +template +struct first_parameter +{ + using type = void; +}; + +template +struct first_parameter +{ + using type = A; +}; + +template +struct first_parameter +{ + using type = A; +}; + +template +using first_parameter_t = typename first_parameter::type; + +template +struct has_unique_value_overload : ::cuda::std::false_type +{}; + +// clang-format off +template +struct has_unique_value_overload< + Value, + Fn, + typename ::cuda::std::enable_if< + !::cuda::std::is_reference>::value && + ::cuda::std::is_convertible + >::value>::type> + : ::cuda::std::true_type +{}; + +// For trivial types, foreach is not allowed to copy values, even if those are trivially copyable. +// This can be observable if the unary operator takes parameter by reference and modifies it or uses address. +// The trait below checks if the freedom to copy trivial types can be regained. +template +using can_regain_copy_freedom = + ::cuda::std::integral_constant< + bool, + ::cuda::std::is_trivially_constructible::value && + ::cuda::std::is_trivially_copy_assignable::value && + :: cuda::std::is_trivially_move_assignable::value && + ::cuda::std::is_trivially_destructible::value && + has_unique_value_overload::value>; +// clang-format on + +// This kernel is used when the block size is not known at compile time +template +CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(OffsetT num_items, OpT op) +{ + using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t; + using agent_t = agent_block_striped_t; + + const auto block_threads = static_cast(blockDim.x); + const auto items_per_tile = active_policy_t::items_per_thread * block_threads; + const auto tile_base = static_cast(blockIdx.x) * items_per_tile; + const auto num_remaining = num_items - tile_base; + const auto items_in_tile = static_cast(num_remaining < items_per_tile ? num_remaining : items_per_tile); + + if (items_in_tile == items_per_tile) + { + agent_t{tile_base, op}.template consume_tile(items_per_tile, block_threads); + } + else + { + agent_t{tile_base, op}.template consume_tile(items_in_tile, block_threads); + } +} + +// This kernel is used when the block size is known at compile time +template +CUB_DETAIL_KERNEL_ATTRIBUTES // +__launch_bounds__(ChainedPolicyT::ActivePolicy::for_policy_t::block_threads) // + void static_kernel(OffsetT num_items, OpT op) +{ + using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t; + using agent_t = agent_block_striped_t; + + constexpr auto block_threads = active_policy_t::block_threads; + constexpr auto items_per_tile = active_policy_t::items_per_thread * block_threads; + + const auto tile_base = static_cast(blockIdx.x) * items_per_tile; + const auto num_remaining = num_items - tile_base; + const auto items_in_tile = static_cast(num_remaining < items_per_tile ? 
num_remaining : items_per_tile);
+
+ if (items_in_tile == items_per_tile)
+ {
+ agent_t{tile_base, op}.template consume_tile(items_per_tile, block_threads);
+ }
+ else
+ {
+ agent_t{tile_base, op}.template consume_tile(items_in_tile, block_threads);
+ }
+}
+
+} // namespace for_each
+} // namespace detail
+
+CUB_NAMESPACE_END
diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh
index 21ed8592d6..4df4b49ac0 100644
--- a/cub/cub/thread/thread_operators.cuh
+++ b/cub/cub/thread/thread_operators.cuh
@@ -47,14 +47,15 @@
 # pragma system_header
 #endif // no system header
+#include // always_false
 #include
 #include
-_CCCL_SUPPRESS_DEPRECATED_PUSH
-#include
-_CCCL_SUPPRESS_DEPRECATED_POP
-#include
-#include
+#include // cuda::std::plus
+#include // cuda::std::common_type
+#include // cuda::std::forward
+
+// #include // std::plus
 CUB_NAMESPACE_BEGIN
@@ -413,4 +414,121 @@ _CCCL_HOST_DEVICE BinaryFlip MakeBinaryFlip(BinaryOpT binary_op)
 return BinaryFlip(binary_op);
 }
+namespace internal
+{
+// TODO: Remove DPX specialization when nvbug 4823237 is fixed
+
+template
+struct DpxMin
+{
+ static_assert(detail::always_false(), "DpxMin is not supported for this type");
+};
+
+template <>
+struct DpxMin<::cuda::std::int16_t>
+{
+ _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+ {
+ return __vmins2(a, b);
+ }
+};
+
+template <>
+struct DpxMin<::cuda::std::uint16_t>
+{
+ _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+ {
+ return __vminu2(a, b);
+ }
+};
+
+//----------------------------------------------------------------------------------------------------------------------
+
+template
+struct DpxMax
+{
+ static_assert(detail::always_false(), "DpxMax is not supported for this type");
+};
+
+template <>
+struct DpxMax<::cuda::std::int16_t>
+{
+ _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+ {
+ return __vmaxs2(a, b);
+ }
+};
+
+template <>
+struct DpxMax<::cuda::std::uint16_t>
+{
+ _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+ {
+ return __vmaxu2(a, b);
+ }
+};
+
+//----------------------------------------------------------------------------------------------------------------------
+
+template
+struct DpxSum
+{
+ static_assert(detail::always_false(), "DpxSum is not supported for this type");
+};
+
+template <>
+struct DpxSum<::cuda::std::int16_t>
+{
+ _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+ {
+ return __vadd2(a, b);
+ }
+};
+
+template <>
+struct DpxSum<::cuda::std::uint16_t>
+{
+ _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+ {
+ return __vadd2(a, b);
+ }
+};
+
+//----------------------------------------------------------------------------------------------------------------------
+
+template
+struct CubOperatorToDpx
+{
+ static_assert(detail::always_false(), "Dpx is not supported for this operator");
+};
+
+template
+struct CubOperatorToDpx
+{
+ using type = DpxMin;
+};
+
+template
+struct CubOperatorToDpx
+{
+ using type = DpxMax;
+};
+
+template
+struct CubOperatorToDpx
+{
+ using type = DpxSum;
+};
+
+// template
+// struct CubOperatorToDpx, T>
+//{
+// using type = DpxSum;
+// };
+
+template
+using cub_operator_to_dpx_t = CubOperatorToDpx;
+
+} // namespace internal
+
 CUB_NAMESPACE_END
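A quick illustration of what these DPX functors do (a sketch by way of explanation, not part of the change): __vmins2 and its siblings treat a 32-bit register as two packed 16-bit lanes and reduce both lanes at once (hardware DPX on Hopper, emulated on earlier architectures, as noted in thread_reduce.cuh below). That is why DpxMin<::cuda::std::int16_t> takes and returns unsigned rather than the element type. A minimal device-side helper, assuming CUDA's short2 vector type and the documented __vmins2 intrinsic:

#include <cstring> // std::memcpy; the compiler lowers these small copies to register moves

__device__ short2 min_two_lanes(short2 a, short2 b) // hypothetical helper, not in the diff
{
  unsigned ua, ub;
  std::memcpy(&ua, &a, sizeof(ua)); // view the two 16-bit lanes as one 32-bit word
  std::memcpy(&ub, &b, sizeof(ub));
  const unsigned r = __vmins2(ua, ub); // per-halfword signed minimum
  short2 out;
  std::memcpy(&out, &r, sizeof(out));
  return out;
}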
diff --git a/cub/cub/thread/thread_reduce.cuh b/cub/cub/thread/thread_reduce.cuh
index db7e242779..a956321f78 100644
--- a/cub/cub/thread/thread_reduce.cuh
+++ b/cub/cub/thread/thread_reduce.cuh
@@ -28,7 +28,7 @@
 /**
  * @file
- * Thread utilities for sequential reduction over statically-sized array types
+ * Thread reduction over statically-sized array-like types
  */
 #pragma once
@@ -43,8 +43,17 @@
 # pragma system_header
 #endif // no system header
-#include
-#include
+#include // are_same()
+#include // cub_operator_to_dpx_t
+#include
+#include
+
+#include // bit_cast
+#include // uint16_t
+#include // cuda::std::plus
+#include // pair
+
+// #include // std::plus
 CUB_NAMESPACE_BEGIN
@@ -52,51 +61,143 @@ CUB_NAMESPACE_BEGIN
 namespace internal
 {
-/**
- * @brief Sequential reduction over statically-sized array types
- *
- * @param[in] input
- * Input array
- *
- * @param[in] reduction_op
- * Binary reduction operator
- *
- * @param[in] prefix
- * Prefix to seed reduction with
- */
-template >
-_CCCL_DEVICE _CCCL_FORCEINLINE AccumT
-ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix, Int2Type /*length*/)
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
+/// DPX instructions compute min, max, and sum for up to three 16- and 32-bit signed or unsigned integer parameters
+/// see DPX documentation https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#dpx
+/// NOTE: The compiler is able to automatically vectorize all cases with 3 operands
+/// However, all other cases with per-halfword comparison need to be explicitly vectorized
+/// TODO: Remove DPX specialization when nvbug 4823237 is fixed
+///
+/// DPX reduction is enabled if the following conditions are met:
+/// - Hopper+ architectures. DPX instructions are emulated before Hopper
+/// - The number of elements must be large enough for performance reasons (see below)
+/// - All types must be the same
+/// - Only works with integral types of 2 bytes
+/// - DPX instructions provide Min, Max, and Sum SIMD operations
+/// If the number of instructions is the same, we favor the compiler
+
+template
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // clang-format off
+constexpr bool enable_dpx_reduction()
 {
- AccumT retval = prefix;
+ using T = decltype(::cuda::std::declval()[0]);
+ // TODO: use constexpr variable in C++14+
+ using Length = ::cuda::std::integral_constant()>;
+ return ((Length{} >= 9 && detail::are_same*/>()) || Length{} >= 10)
+ && detail::are_same()
+ && detail::is_one_of()
+ && detail::is_one_of*/>();
+}
+// clang-format on
-#pragma unroll
- for (int i = 0; i < LENGTH; ++i)
+// Considering compiler vectorization with 3-way comparison, the number of SASS instructions is
+// Standard: ceil((L - 3) / 2) + 1
+// replacing L with L/2 for SIMD
+// DPX: ceil((L/2 - 3) / 2) + 1 + 2 [for halfword comparison: PRMT, VIMNMX] + L % 2 [for last element]
+// finally, the last two comparison operations are vectorized in a 3-way reduction
+// ceil((L/2 - 3) / 2) + 3
+//
+// length | Standard | DPX
+//  2     |    1     | NA
+//  3     |    1     | NA
+//  4     |    2     | 3
+//  5     |    2     | 3
+//  6     |    3     | 3
+//  7     |    3     | 3
+//  8     |    4     | 4
+//  9     |    4     | 4
+// 10     |    5     | 4 // ***
+// 11     |    5     | 4 // ***
+// 12     |    6     | 5 // ***
+// 13     |    6     | 5 // ***
+// 14     |    7     | 5 // ***
+// 15     |    7     | 5 // ***
+// 16     |    8     | 6 // ***
+
+template
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT
+ThreadReduceSequential(const Input& input, ReductionOp reduction_op)
+{
+ AccumT retval = input[0];
+# pragma unroll
+ for (int i = 1; i < detail::static_size(); ++i)
 {
 retval = reduction_op(retval, input[i]);
 }
-
 return retval;
 }
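+// Illustrative walk-through of the DPX path below, assuming Length == 11,
+// T == int16_t and a minimum reduction (example values only, not part of the
+// implementation):
+//   [a0 .. a10] -> copy to a local array, view a0..a9 as five packed 32-bit words
+//   ThreadReduceSequential over the words with DpxMin -> {min(a0,a2,..,a8) | min(a1,a3,..,a9)}
+//   combine the two 16-bit halves with the scalar reduction_op -> min(a0..a9)
+//   Length is odd, so fold in the trailing element a10 -> the final result
+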
+/// Specialization for DPX reduction
+template
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE auto
+ThreadReduceDpx(const Input& input, ReductionOp reduction_op) -> ::cuda::std::__remove_cvref_t
+{
+ using T = ::cuda::std::__remove_cvref_t;
+ constexpr int length = detail::static_size();
+ T array[length];
+# pragma unroll
+ for (int i = 0; i < length; ++i)
+ {
+ array[i] = input[i];
+ }
+ using DpxReduceOp = cub_operator_to_dpx_t;
+ using SimdType = ::cuda::std::pair;
+ auto unsigned_input = reinterpret_cast(array);
+ auto simd_reduction = ThreadReduceSequential(unsigned_input, DpxReduceOp{});
+ auto simd_values = ::cuda::std::bit_cast(simd_reduction);
+ auto ret_value = reduction_op(simd_values.first, simd_values.second);
+ return (length % 2 == 0) ? ret_value : reduction_op(ret_value, input[length - 1]);
+}
+
+// DPX/Sequential dispatch
+template ()[0])>,
+ typename AccumT = ::cuda::std::__accumulator_t,
+ _CUB_TEMPLATE_REQUIRES(enable_dpx_reduction())>
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const Input& input, ReductionOp reduction_op)
+{
+ static_assert(detail::has_subscript::value, "Input must support the subscript operator[]");
+ static_assert(detail::has_size::value, "Input must have the size() method");
+ static_assert(detail::has_binary_call_operator::value,
+ "ReductionOp must have the binary call operator: operator(ValueT, ValueT)");
+ NV_IF_TARGET(NV_PROVIDES_SM_90,
+ (return ThreadReduceDpx(input, reduction_op);),
+ (return ThreadReduceSequential(input, reduction_op);))
+}
+
+template ()[0])>,
+ typename AccumT = ::cuda::std::__accumulator_t,
+ _CUB_TEMPLATE_REQUIRES(!enable_dpx_reduction())>
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const Input& input, ReductionOp reduction_op)
+{
+ static_assert(detail::has_subscript::value, "Input must support the subscript operator[]");
+ static_assert(detail::has_size::value, "Input must have the size() method");
+ static_assert(detail::has_binary_call_operator::value,
+ "ReductionOp must have the binary call operator: operator(ValueT, ValueT)");
+ return ThreadReduceSequential(input, reduction_op);
+}
+
+#endif // !DOXYGEN_SHOULD_SKIP_THIS
+
 /**
- * @brief Perform a sequential reduction over @p LENGTH elements of the @p input array,
- * seeded with the specified @p prefix. The aggregate is returned.
+ * @brief Reduction over statically-sized array-like types, seeded with the specified @p prefix.
 *
- * @tparam LENGTH
- * LengthT of input array
- *
- * @tparam T
- * [inferred] The data type to be reduced.
+ * @tparam Input + * [inferred] The data type to be reduced having member + * operator[](int i) and must be statically-sized (size() method or static array) * * @tparam ReductionOp * [inferred] Binary reduction operator type having member * T operator()(const T &a, const T &b) * + * @tparam PrefixT + * [inferred] The prefix type + * * @param[in] input * Input array * @@ -105,101 +206,122 @@ ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix, Int2Typecuda::std::__accumulator_t */ -template > -_CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix) +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + typename ValueT = ::cuda::std::__remove_cvref_t()[0])>, +#endif // !DOXYGEN_SHOULD_SKIP_THIS + typename AccumT = ::cuda::std::__accumulator_t> +_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT +ThreadReduce(const Input& input, ReductionOp reduction_op, PrefixT prefix) { - return ThreadReduce(input, reduction_op, prefix, Int2Type()); + static_assert(detail::has_subscript::value, "Input must support the subscript operator[]"); + static_assert(detail::has_size::value, "Input must have the size() method"); + static_assert(detail::has_binary_call_operator::value, + "ReductionOp must have the binary call operator: operator(ValueT, ValueT)"); + constexpr int length = detail::static_size(); + // copy to a temporary array of type AccumT + AccumT array[length + 1]; + array[0] = prefix; +#pragma unroll + for (int i = 0; i < length; ++i) + { + array[i + 1] = input[i]; + } + return ThreadReduce(array, reduction_op); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + /** - * @brief Perform a sequential reduction over @p LENGTH elements of the @p input array. - * The aggregate is returned. + * @remark The pointer interface adds little value and requires Length to be explicit. + * Prefer using the array-like interface * - * @tparam LENGTH - * LengthT of input array + * @brief Perform a sequential reduction over @p length elements of the @p input pointer. The aggregate is returned. * * @tparam T - * [inferred] The data type to be reduced. + * [inferred] The data type to be reduced * * @tparam ReductionOp * [inferred] Binary reduction operator type having member * T operator()(const T &a, const T &b) * * @param[in] input - * Input array + * Input pointer * * @param[in] reduction_op * Binary reduction operator + * + * @return Aggregate of type cuda::std::__accumulator_t */ -template -_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadReduce(T* input, ReductionOp reduction_op) +template > +_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const T* input, ReductionOp reduction_op) { - T prefix = input[0]; - return ThreadReduce(input + 1, reduction_op, prefix); + static_assert(Length > 0, "Length must be greater than 0"); + static_assert(detail::has_binary_call_operator::value, + "ReductionOp must have the binary call operator: operator(V1, V2)"); + using ArrayT = T[Length]; + auto array = reinterpret_cast(input); + return ThreadReduce(*array, reduction_op); } /** - * @brief Perform a sequential reduction over the statically-sized @p input array, - * seeded with the specified @p prefix. The aggregate is returned. + * @remark The pointer interface adds little value and requires Length to be explicit. + * Prefer using the array-like interface + * + * @brief Perform a sequential reduction over @p length elements of the @p input pointer, seeded with the specified @p + * prefix. The aggregate is returned. 
* - * @tparam LENGTH - * [inferred] LengthT of @p input array + * @tparam length + * Length of input pointer * * @tparam T - * [inferred] The data type to be reduced. + * [inferred] The data type to be reduced * * @tparam ReductionOp * [inferred] Binary reduction operator type having member * T operator()(const T &a, const T &b) * + * @tparam PrefixT + * [inferred] The prefix type + * * @param[in] input - * Input array + * Input pointer * * @param[in] reduction_op * Binary reduction operator * * @param[in] prefix * Prefix to seed reduction with + * + * @return Aggregate of type cuda::std::__accumulator_t */ -template > -_CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T (&input)[LENGTH], ReductionOp reduction_op, PrefixT prefix) + typename AccumT = ::cuda::std::__accumulator_t, + _CUB_TEMPLATE_REQUIRES(Length > 0)> +_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT +ThreadReduce(const T* input, ReductionOp reduction_op, PrefixT prefix) { - return ThreadReduce(input, reduction_op, prefix, Int2Type()); + static_assert(detail::has_binary_call_operator::value, + "ReductionOp must have the binary call operator: operator(V1, V2)"); + auto array = reinterpret_cast(input); + return ThreadReduce(*array, reduction_op, prefix); } -/** - * @brief Serial reduction with the specified operator - * - * @tparam LENGTH - * [inferred] LengthT of @p input array - * - * @tparam T - * [inferred] The data type to be reduced. - * - * @tparam ReductionOp - * [inferred] Binary reduction operator type having member - * T operator()(const T &a, const T &b) - * - * @param[in] input - * Input array - * - * @param[in] reduction_op - * Binary reduction operator - */ -template -_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadReduce(T (&input)[LENGTH], ReductionOp reduction_op) +template +_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadReduce(const T*, ReductionOp, PrefixT prefix) { - return ThreadReduce((T*) input, reduction_op); + return prefix; } +#endif // !DOXYGEN_SHOULD_SKIP_THIS + } // namespace internal CUB_NAMESPACE_END diff --git a/cub/cub/util_arch.cuh b/cub/cub/util_arch.cuh index b9ef4f1f6c..5f8780620f 100644 --- a/cub/cub/util_arch.cuh +++ b/cub/cub/util_arch.cuh @@ -43,7 +43,7 @@ # pragma system_header #endif // no system header -#include +#include // IWYU pragma: export #include #include @@ -136,13 +136,21 @@ static_assert(CUB_MAX_DEVICES > 0, "CUB_MAX_DEVICES must be greater than 0."); # define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(0) # endif +namespace detail +{ +// The maximum amount of static shared memory available per thread block +// Note that in contrast to dynamic shared memory, static shared memory is still limited to 48 KB +static constexpr ::cuda::std::size_t max_smem_per_block = 48 * 1024; +} // namespace detail + template struct RegBoundScaling { enum { ITEMS_PER_THREAD = CUB_MAX(1, NOMINAL_4B_ITEMS_PER_THREAD * 4 / CUB_MAX(4, sizeof(T))), - BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), + BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, + ((cub::detail::max_smem_per_block / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), }; }; @@ -153,7 +161,8 @@ struct MemBoundScaling { ITEMS_PER_THREAD = CUB_MAX(1, CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T), NOMINAL_4B_ITEMS_PER_THREAD * 2)), - BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), + BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, + 
((cub::detail::max_smem_per_block / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), }; }; diff --git a/cub/cub/util_compiler.cuh b/cub/cub/util_compiler.cuh index ef0c178c56..cae7565ec7 100644 --- a/cub/cub/util_compiler.cuh +++ b/cub/cub/util_compiler.cuh @@ -44,45 +44,67 @@ #endif // no system header // enumerate host compilers we know about +//! deprecated [Since 2.7] #define CUB_HOST_COMPILER_UNKNOWN 0 -#define CUB_HOST_COMPILER_MSVC 1 -#define CUB_HOST_COMPILER_GCC 2 -#define CUB_HOST_COMPILER_CLANG 3 +//! deprecated [Since 2.7] +#define CUB_HOST_COMPILER_MSVC 1 +//! deprecated [Since 2.7] +#define CUB_HOST_COMPILER_GCC 2 +//! deprecated [Since 2.7] +#define CUB_HOST_COMPILER_CLANG 3 // enumerate device compilers we know about +//! deprecated [Since 2.7] #define CUB_DEVICE_COMPILER_UNKNOWN 0 -#define CUB_DEVICE_COMPILER_MSVC 1 -#define CUB_DEVICE_COMPILER_GCC 2 -#define CUB_DEVICE_COMPILER_NVCC 3 -#define CUB_DEVICE_COMPILER_CLANG 4 +//! deprecated [Since 2.7] +#define CUB_DEVICE_COMPILER_MSVC 1 +//! deprecated [Since 2.7] +#define CUB_DEVICE_COMPILER_GCC 2 +//! deprecated [Since 2.7] +#define CUB_DEVICE_COMPILER_NVCC 3 +//! deprecated [Since 2.7] +#define CUB_DEVICE_COMPILER_CLANG 4 // figure out which host compiler we're using #if defined(_CCCL_COMPILER_MSVC) -# define CUB_HOST_COMPILER CUB_HOST_COMPILER_MSVC -# define CUB_MSVC_VERSION _MSC_VER -# define CUB_MSVC_VERSION_FULL _MSC_FULL_VER +//! deprecated [Since 2.7] +# define CUB_HOST_COMPILER CUB_HOST_COMPILER_MSVC +//! deprecated [Since 2.7] +# define CUB_MSVC_VERSION _CCCL_MSVC_VERSION +//! deprecated [Since 2.7] +# define CUB_MSVC_VERSION_FULL _CCCL_MSVC_VERSION_FULL #elif defined(_CCCL_COMPILER_CLANG) +//! deprecated [Since 2.7] # define CUB_HOST_COMPILER CUB_HOST_COMPILER_CLANG -# define CUB_CLANG_VERSION (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) +//! deprecated [Since 2.7] +# define CUB_CLANG_VERSION _CCCL_CLANG_VERSION #elif defined(_CCCL_COMPILER_GCC) +//! deprecated [Since 2.7] # define CUB_HOST_COMPILER CUB_HOST_COMPILER_GCC -# define CUB_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) +//! deprecated [Since 2.7] +# define CUB_GCC_VERSION _CCCL_GCC_VERSION #endif // figure out which device compiler we're using #if defined(_CCCL_CUDA_COMPILER_NVCC) || defined(_CCCL_CUDA_COMPILER_NVHPC) +//! deprecated [Since 2.7] # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC #elif defined(_CCCL_COMPILER_MSVC) +//! deprecated [Since 2.7] # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_MSVC #elif defined(_CCCL_COMPILER_GCC) +//! deprecated [Since 2.7] # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_GCC #elif defined(_CCCL_COMPILER_CLANG) // CUDA-capable clang should behave similar to NVCC. # if defined(_CCCL_CUDA_COMPILER_NVCC) +//! deprecated [Since 2.7] # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC # else +//! deprecated [Since 2.7] # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_CLANG # endif #else +//! deprecated [Since 2.7] # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_UNKNOWN #endif diff --git a/cub/cub/util_cpp_dialect.cuh b/cub/cub/util_cpp_dialect.cuh index d5beca2f6b..e34e253d3e 100644 --- a/cub/cub/util_cpp_dialect.cuh +++ b/cub/cub/util_cpp_dialect.cuh @@ -25,7 +25,8 @@ * ******************************************************************************/ -//! @file Detect the version of the C++ standard used by the compiler. +//! @file +//! Detect the version of the C++ standard used by the compiler. 
#pragma once @@ -39,7 +40,7 @@ # pragma system_header #endif // no system header -#include +#include // IWYU pragma: export #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document @@ -100,14 +101,14 @@ # ifndef CUB_IGNORE_DEPRECATED_COMPILER // Compiler checks: -# if defined(_CCCL_COMPILER_GCC) && CUB_GCC_VERSION < 50000 +# if defined(_CCCL_COMPILER_GCC) && _CCCL_GCC_VERSION < 50000 CUB_COMPILER_DEPRECATION(GCC 5.0); -# elif defined(_CCCL_COMPILER_CLANG) && CUB_CLANG_VERSION < 70000 +# elif defined(_CCCL_COMPILER_CLANG) && _CCCL_CLANG_VERSION < 70000 CUB_COMPILER_DEPRECATION(Clang 7.0); -# elif defined(_CCCL_COMPILER_MSVC) && CUB_MSVC_VERSION < 1910 +# elif defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION < 1910 // <2017. Hard upgrade message: CUB_COMPILER_DEPRECATION(MSVC 2019(19.20 / 16.0 / 14.20)); -# elif defined(_CCCL_COMPILER_MSVC) && CUB_MSVC_VERSION < 1920 +# elif defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION < 1920 // >=2017, <2019. Soft deprecation message: CUB_COMPILER_DEPRECATION_SOFT(MSVC 2019(19.20 / 16.0 / 14.20), MSVC 2017); # endif diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index 714aa014ce..7ea6dc3847 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -47,13 +47,13 @@ # pragma system_header #endif // no system header -#include +#include // IWYU pragma: export #include #include // for backward compatibility #include -#include +#include // IWYU pragma: export #include #include diff --git a/cub/cub/util_macro.cuh b/cub/cub/util_macro.cuh index 162641d348..37d4ed05f3 100644 --- a/cub/cub/util_macro.cuh +++ b/cub/cub/util_macro.cuh @@ -42,8 +42,8 @@ # pragma system_header #endif // no system header -#include -#include +#include // IWYU pragma: export +#include // IWYU pragma: export #include @@ -112,12 +112,12 @@ _CCCL_DIAG_SUPPRESS_CLANG("-Wattributes") # if !defined(_CCCL_CUDA_COMPILER_NVHPC) _CCCL_DIAG_SUPPRESS_NVHPC(attribute_requires_external_linkage) # endif // !_CCCL_CUDA_COMPILER_NVHPC -# if defined(_CCCL_COMPILER_ICC) || defined(_CCCL_COMPILER_ICC_LLVM) +# if defined(_CCCL_COMPILER_ICC) # pragma nv_diag_suppress 1407 // the "__visibility__" attribute can only appear on functions and // variables with external linkage' # pragma warning(disable : 1890) // the "__visibility__" attribute can only appear on functions and // variables with external linkage' -# endif // _CCCL_COMPILER_ICC || _CCCL_COMPILER_ICC_LLVM +# endif // _CCCL_COMPILER_ICC #endif // !CUB_DISABLE_KERNEL_VISIBILITY_WARNING_SUPPRESSION CUB_NAMESPACE_END diff --git a/cub/cub/util_type.cuh b/cub/cub/util_type.cuh index e23f6e6578..8ae4e2d05b 100644 --- a/cub/cub/util_type.cuh +++ b/cub/cub/util_type.cuh @@ -44,6 +44,7 @@ #endif // no system header #include +#include #include #include diff --git a/cub/cub/util_vsmem.cuh b/cub/cub/util_vsmem.cuh index 6a0d6b9a94..d2e5541c09 100644 --- a/cub/cub/util_vsmem.cuh +++ b/cub/cub/util_vsmem.cuh @@ -42,6 +42,7 @@ # pragma system_header #endif // no system header +#include #include #include #include @@ -67,10 +68,6 @@ struct vsmem_t void* gmem_ptr; }; -// The maximum amount of static shared memory available per thread block -// Note that in contrast to dynamic shared memory, static shared memory is still limited to 48 KB -static constexpr std::size_t max_smem_per_block = 48 * 1024; - /** * @brief Class template that helps to prevent exceeding the available shared memory per thread block. 
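// A simplified sketch of the decision vsmem_t supports (assumptions: only the
// 48 KB static shared-memory budget matters, and the fallback is a per-block
// slice of global memory sized like the kernel's TempStorage):
#include <cstddef>

template <std::size_t StorageBytes>
struct needs_vsmem_fallback
{
  static constexpr std::size_t max_static_smem = 48 * 1024;
  // true: allocate StorageBytes * gridDim.x of global memory and give each
  // block its own slice; false: a plain __shared__ TempStorage is enough.
  static constexpr bool value = StorageBytes > max_static_smem;
};

static_assert(!needs_vsmem_fallback<32 * 1024>::value, "32 KB still fits in static smem");
static_assert(needs_vsmem_fallback<64 * 1024>::value, "64 KB must spill to the gmem fallback");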
* diff --git a/cub/cub/version.cuh b/cub/cub/version.cuh index e485eb3cd0..95755133e3 100644 --- a/cub/cub/version.cuh +++ b/cub/cub/version.cuh @@ -58,7 +58,7 @@ * CUB_VERSION / 100 % 1000 is the minor version. * CUB_VERSION / 100000 is the major version. */ -#define CUB_VERSION 200600 // macro expansion with ## requires this to be a single value +#define CUB_VERSION 200700 // macro expansion with ## requires this to be a single value /*! \def CUB_MAJOR_VERSION * \brief The preprocessor macro \p CUB_MAJOR_VERSION encodes the diff --git a/cub/cub/warp/specializations/warp_reduce_shfl.cuh b/cub/cub/warp/specializations/warp_reduce_shfl.cuh index 41b23e6dff..fdd4083c37 100644 --- a/cub/cub/warp/specializations/warp_reduce_shfl.cuh +++ b/cub/cub/warp/specializations/warp_reduce_shfl.cuh @@ -127,7 +127,7 @@ struct WarpReduceShfl { enum { - /// Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per + /// Whether the data type is a small (32b or less) integer for which we can use a single SHFL instruction per /// exchange IS_SMALL_UNSIGNED = (Traits::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int)) }; diff --git a/cub/cub/warp/warp_load.cuh b/cub/cub/warp/warp_load.cuh index bfcef99656..ac5c700b95 100644 --- a/cub/cub/warp/warp_load.cuh +++ b/cub/cub/warp/warp_load.cuh @@ -25,7 +25,8 @@ * ******************************************************************************/ -//! @file Operations for reading linear tiles of data into the CUDA warp. +//! @file +//! Operations for reading linear tiles of data into the CUDA warp. #pragma once diff --git a/cub/cub/warp/warp_store.cuh b/cub/cub/warp/warp_store.cuh index e123330ba1..bb99bc5965 100644 --- a/cub/cub/warp/warp_store.cuh +++ b/cub/cub/warp/warp_store.cuh @@ -25,7 +25,8 @@ * ******************************************************************************/ -//! @file Operations for writing linear segments of data from the CUDA warp +//! @file +//! Operations for writing linear segments of data from the CUDA warp #pragma once diff --git a/cub/test/CMakeLists.txt b/cub/test/CMakeLists.txt index 48a0142801..3ec8c94eef 100644 --- a/cub/test/CMakeLists.txt +++ b/cub/test/CMakeLists.txt @@ -261,6 +261,11 @@ function(cub_add_test target_name_var test_name test_src cub_target launcher_id) target_compile_options(${test_target} PRIVATE -ftemplate-depth=1000) # for handling large type lists endif() + # enable lambdas for all API examples + if ("${test_target}" MATCHES "test.[A-Za-z0-9_]+_api") + target_compile_options(${test_target} PRIVATE $<$:--extended-lambda>) + endif() + target_link_libraries(${test_target} PRIVATE ${cub_target} ${config_c2h_target} diff --git a/cub/test/c2h/checked_allocator.cuh b/cub/test/c2h/checked_allocator.cuh index 46f2601b1c..11d0d8ccbc 100644 --- a/cub/test/c2h/checked_allocator.cuh +++ b/cub/test/c2h/checked_allocator.cuh @@ -35,24 +35,62 @@ #include #include +#include +#include #include -// #define DEBUG_CHECKED_ALLOC_FAILURE - -#ifdef DEBUG_CHECKED_ALLOC_FAILURE -# include -#endif - namespace c2h { namespace detail { +struct memory_info +{ + std::size_t free{}; + std::size_t total{}; + bool override{false}; +}; + +// If the environment variable CCCL_DEVICE_MEMORY_LIMIT is set, the total device memory +// will be limited to this number of bytes. +inline std::size_t get_device_memory_limit() +{ + static const char* override_str = std::getenv("CCCL_DEVICE_MEMORY_LIMIT"); + static std::size_t result = override_str ? 
static_cast(std::atoll(override_str)) : 0; + return result; +} + +inline bool get_debug_checked_allocs() +{ + static const char* debug_checked_allocs = std::getenv("CCCL_DEBUG_CHECKED_ALLOC_FAILURES"); + static bool result = debug_checked_allocs && (std::atoi(debug_checked_allocs) != 0); + return result; +} + +inline cudaError_t get_device_memory(memory_info& info) +{ + static std::size_t device_memory_limit = get_device_memory_limit(); + + cudaError_t status = cudaMemGetInfo(&info.free, &info.total); + if (status != cudaSuccess) + { + return status; + } + + if (device_memory_limit > 0) + { + info.free = (std::max)(std::size_t{0}, static_cast(info.free - (info.total - device_memory_limit))); + info.total = device_memory_limit; + info.override = true; + } + + return cudaSuccess; +} + inline cudaError_t check_free_device_memory(std::size_t bytes) { - std::size_t free_bytes{}; - std::size_t total_bytes{}; - cudaError_t status = cudaMemGetInfo(&free_bytes, &total_bytes); + memory_info info; + cudaError_t status = get_device_memory(info); if (status != cudaSuccess) { return status; @@ -60,20 +98,31 @@ inline cudaError_t check_free_device_memory(std::size_t bytes) // Avoid allocating all available memory: constexpr std::size_t padding = 16 * 1024 * 1024; // 16 MiB - if (free_bytes < (bytes + padding)) + if (info.free < (bytes + padding)) { -#ifdef DEBUG_CHECKED_ALLOC_FAILURE - const double total_GiB = static_cast(total_bytes) / (1024 * 1024 * 1024); - const double free_GiB = static_cast(free_bytes) / (1024 * 1024 * 1024); - const double requested_GiB = static_cast(bytes) / (1024 * 1024 * 1024); - const double padded_GiB = static_cast(bytes + padding) / (1024 * 1024 * 1024); - - std::cerr - << "Total device mem: " << total_GiB << " GiB\n" // - << "Free device mem: " << free_GiB << " GiB\n" // - << "Requested device mem: " << requested_GiB << " GiB\n" // - << "Padded device mem: " << padded_GiB << " GiB\n"; -#endif + if (get_debug_checked_allocs()) + { + const double total_GiB = static_cast(info.total) / (1024 * 1024 * 1024); + const double free_GiB = static_cast(info.free) / (1024 * 1024 * 1024); + const double requested_GiB = static_cast(bytes) / (1024 * 1024 * 1024); + const double padded_GiB = static_cast(bytes + padding) / (1024 * 1024 * 1024); + + std::cerr << "Device memory allocation failed due to insufficient free device memory.\n"; + + if (info.override) + { + std::cerr + << "Available device memory has been limited (env var CCCL_DEVICE_MEMORY_LIMIT=" << get_device_memory_limit() + << ").\n"; + } + + std::cerr + << "Total device mem: " << total_GiB << " GiB\n" // + << "Free device mem: " << free_GiB << " GiB\n" // + << "Requested device mem: " << requested_GiB << " GiB\n" // + << "Padded device mem: " << padded_GiB << " GiB\n"; + } + return cudaErrorMemoryAllocation; } diff --git a/cub/test/c2h/utility.cuh b/cub/test/c2h/utility.cuh index 2f4ac412ca..434f3cf51f 100644 --- a/cub/test/c2h/utility.cuh +++ b/cub/test/c2h/utility.cuh @@ -38,19 +38,6 @@ namespace c2h { -/** - * Return a value of type `T0` with the same bitwise representation of `in`. - * Types `To` and `From` must be the same size. 
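// A condensed, standalone sketch of the guard above. It mirrors how
// get_device_memory applies the CCCL_DEVICE_MEMORY_LIMIT clamp and how
// check_free_device_memory keeps a 16 MiB cushion, but takes all numbers as
// parameters instead of querying CUDA.
#include <cstddef>

constexpr std::size_t alloc_padding_sketch = 16 * 1024 * 1024; // keep 16 MiB free

constexpr bool allocation_would_fit(std::size_t free_bytes, std::size_t total_bytes,
                                    std::size_t limit, std::size_t request)
{
  // an env-var limit shrinks the reported free memory by the hidden portion
  const std::size_t effective_free =
    (limit > 0 && limit < total_bytes) ? free_bytes - (total_bytes - limit) : free_bytes;
  return effective_free >= request + alloc_padding_sketch;
}

static_assert(allocation_would_fit(64 << 20, 128 << 20, 0, 32 << 20),
              "32 MiB + 16 MiB cushion fits into 64 MiB of free memory");
static_assert(!allocation_would_fit(64 << 20, 128 << 20, 96 << 20, 32 << 20),
              "a 96 MiB limit hides 32 MiB of the free memory, so the request no longer fits");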
- */ -template -__host__ __device__ To bit_cast(const From& in) -{ - static_assert(sizeof(To) == sizeof(From), "Types must be same size."); - To out; - memcpy(&out, &in, sizeof(To)); - return out; -} - // TODO(bgruber): duplicated version of thrust/testing/unittest/system.h inline std::string demangle(const char* name) { diff --git a/cub/test/catch2_main.cuh b/cub/test/catch2_main.cuh index fc08aa13eb..1d42355ce7 100644 --- a/cub/test/catch2_main.cuh +++ b/cub/test/catch2_main.cuh @@ -29,10 +29,10 @@ #include -//! @file This file includes a custom Catch2 main function. When CMake is configured to build -//! each test as a separate executable, this header is included into each test. On the other -//! hand, when all the tests are compiled into a single executable, this header is excluded -//! from the tests and included into catch2_runner.cpp +//! @file +//! This file includes a custom Catch2 main function. When CMake is configured to build each test as a separate +//! executable, this header is included into each test. On the other hand, when all the tests are compiled into a single +//! executable, this header is excluded from the tests and included into catch2_runner.cpp #ifdef CUB_CONFIG_MAIN # define CATCH_CONFIG_RUNNER diff --git a/cub/test/catch2_radix_sort_helper.cuh b/cub/test/catch2_radix_sort_helper.cuh index 758253203b..61b02fc6f1 100644 --- a/cub/test/catch2_radix_sort_helper.cuh +++ b/cub/test/catch2_radix_sort_helper.cuh @@ -39,6 +39,8 @@ #include #include +#include + #include #include #include @@ -199,7 +201,7 @@ c2h::host_vector get_striped_keys(const c2h::host_vector& h_keys, in for (std::size_t i = 0; i < h_keys.size(); i++) { - bit_ordered_t key = c2h::bit_cast(h_keys[i]); + bit_ordered_t key = ::cuda::std::bit_cast(h_keys[i]); _CCCL_IF_CONSTEXPR (traits_t::CATEGORY == cub::FLOATING_POINT) { diff --git a/cub/test/catch2_runner.cpp b/cub/test/catch2_runner.cpp index 53a19f7b6a..73f3f70d8a 100644 --- a/cub/test/catch2_runner.cpp +++ b/cub/test/catch2_runner.cpp @@ -25,8 +25,9 @@ * ******************************************************************************/ -//! @file This file includes a custom Catch2 main function when CMake is configured to build -//! all tests into a single executable. +//! @file +//! This file includes a custom Catch2 main function when CMake is configured to build all tests into a single +//! executable. #define CUB_CONFIG_MAIN #define CUB_EXCLUDE_CATCH2_HELPER_IMPL diff --git a/cub/test/catch2_runner_helper.cu b/cub/test/catch2_runner_helper.cu index d16e09f742..628f9525f7 100644 --- a/cub/test/catch2_runner_helper.cu +++ b/cub/test/catch2_runner_helper.cu @@ -25,9 +25,9 @@ * ******************************************************************************/ -//! @file This file includes CUDA-specific utilities for custom Catch2 main function when CMake is -//! configured to build all tests into a single executable. In this case, we have to have -//! a CUDA target in the final Catch2 executable, otherwise CMake confuses linker options and -//! MSVC/RDC build fails. +//! @file +//! This file includes CUDA-specific utilities for custom Catch2 main function when CMake is configured to build all +//! tests into a single executable. In this case, we have to have a CUDA target in the final Catch2 executable, +//! otherwise CMake confuses linker options and MSVC/RDC build fails. 
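// The c2h::bit_cast helper removed above was a memcpy-based stand-in for what
// the tests now take from libcu++ directly. A minimal usage sketch:
#include <cuda/std/bit>
#include <cuda/std/cstdint>

__host__ __device__ inline ::cuda::std::uint32_t float_to_bits(float f)
{
  // source and destination types must have exactly the same size
  return ::cuda::std::bit_cast<::cuda::std::uint32_t>(f);
}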
#include "catch2_runner_helper.inl" diff --git a/cub/test/catch2_runner_helper.inl b/cub/test/catch2_runner_helper.inl index 3971760800..f8a2bfa2ab 100644 --- a/cub/test/catch2_runner_helper.inl +++ b/cub/test/catch2_runner_helper.inl @@ -27,10 +27,10 @@ #pragma once -//! @file This file includes implementation of CUDA-specific utilities for custom Catch2 main -//! When CMake is configured to include all the tests into a single executable, this file -//! is only included into catch2_runner_helper.cu. When CMake is configured to compile -//! each test as a separate binary, this file is included into each test. +//! @file +//! This file includes implementation of CUDA-specific utilities for custom Catch2 main When CMake is configured to +//! include all the tests into a single executable, this file is only included into catch2_runner_helper.cu. When CMake +//! is configured to compile each test as a separate binary, this file is included into each test. #include diff --git a/cub/test/catch2_test_block_load.cu b/cub/test/catch2_test_block_load.cu index 39bccc50c5..43fd75698f 100644 --- a/cub/test/catch2_test_block_load.cu +++ b/cub/test/catch2_test_block_load.cu @@ -28,6 +28,7 @@ #include #include #include +#include #include "catch2_test_helper.h" @@ -113,7 +114,7 @@ void block_load(InputIteratorT input, OutputIteratorT output, int num_items) using input_t = cub::detail::value_t; using block_load_t = cub::BlockLoad; using storage_t = typename block_load_t::TempStorage; - constexpr bool sufficient_resources = sizeof(storage_t) <= 1024 * 48; + constexpr bool sufficient_resources = sizeof(storage_t) <= cub::detail::max_smem_per_block; kernel <<<1, ThreadsInBlock>>>(std::integral_constant{}, input, output, num_items); diff --git a/cub/test/catch2_test_block_store.cu b/cub/test/catch2_test_block_store.cu index f157a28ea0..566dd2e828 100644 --- a/cub/test/catch2_test_block_store.cu +++ b/cub/test/catch2_test_block_store.cu @@ -29,6 +29,7 @@ #include #include #include +#include #include "catch2_test_helper.h" @@ -114,7 +115,7 @@ void block_store(InputIteratorT input, OutputIteratorT output, int num_items) using input_t = cub::detail::value_t; using block_store_t = cub::BlockStore; using storage_t = typename block_store_t::TempStorage; - constexpr bool sufficient_resources = sizeof(storage_t) <= 1024 * 48; + constexpr bool sufficient_resources = sizeof(storage_t) <= cub::detail::max_smem_per_block; kernel <<<1, ThreadsInBlock>>>(std::integral_constant{}, input, output, num_items); diff --git a/cub/test/catch2_test_device_histogram.cu b/cub/test/catch2_test_device_histogram.cu index 5d43ebbc05..c0e2ee1fa9 100644 --- a/cub/test/catch2_test_device_histogram.cu +++ b/cub/test/catch2_test_device_histogram.cu @@ -29,10 +29,9 @@ #include #include -#include -#include -#include +#include #include +#include #include #include @@ -213,7 +212,7 @@ struct bit_and_anything _CCCL_HOST_DEVICE auto operator()(const T& a, const T& b) const -> T { using U = typename cub::Traits::UnsignedBits; - return c2h::bit_cast(static_cast(c2h::bit_cast(a) & c2h::bit_cast(b))); + return ::cuda::std::bit_cast(static_cast(::cuda::std::bit_cast(a) & ::cuda::std::bit_cast(b))); } }; diff --git a/cub/test/catch2_test_device_radix_sort_keys.cu b/cub/test/catch2_test_device_radix_sort_keys.cu index 961361622d..24d60033e3 100644 --- a/cub/test/catch2_test_device_radix_sort_keys.cu +++ b/cub/test/catch2_test_device_radix_sort_keys.cu @@ -192,8 +192,8 @@ CUB_TEST("DeviceRadixSort::SortKeys: negative zero handling", "[keys][radix][sor using 
bits_t = typename cub::Traits::UnsignedBits; constexpr std::size_t num_bits = sizeof(key_t) * CHAR_BIT; - const key_t positive_zero = c2h::bit_cast(bits_t(0)); - const key_t negative_zero = c2h::bit_cast(bits_t(1) << (num_bits - 1)); + const key_t positive_zero = ::cuda::std::bit_cast(bits_t(0)); + const key_t negative_zero = ::cuda::std::bit_cast(bits_t(1) << (num_bits - 1)); constexpr std::size_t max_num_items = 1 << 18; const std::size_t num_items = GENERATE_COPY(take(1, random(max_num_items / 2, max_num_items))); diff --git a/cub/test/catch2_test_device_reduce.cu b/cub/test/catch2_test_device_reduce.cu index 1e9e08c911..bfd7c3e8a2 100644 --- a/cub/test/catch2_test_device_reduce.cu +++ b/cub/test/catch2_test_device_reduce.cu @@ -24,7 +24,6 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ - #include "insert_nested_NVTX_range_guard.h" // above header needs to be included first @@ -48,7 +47,7 @@ DECLARE_LAUNCH_WRAPPER(cub::DeviceReduce::Max, device_max); DECLARE_LAUNCH_WRAPPER(cub::DeviceReduce::ArgMax, device_arg_max); // %PARAM% TEST_LAUNCH lid 0:1:2 -// %PARAM% TEST_TYPES types 0:1:2:3 +// %PARAM% TEST_TYPES types 0:1:2:3:4 // List of types to test using custom_t = @@ -72,9 +71,13 @@ type_pair #endif #if TEST_BF_T , type_pair // testing bf16 -#endif + >; +#endif // clang-format on +#elif TEST_TYPES == 4 +// DPX SIMD instructions +using full_type_list = c2h::type_list, type_pair>; #endif /** @@ -124,6 +127,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f } auto d_in_it = thrust::raw_pointer_cast(in_items.data()); +#if TEST_TYPES != 4 SECTION("reduce") { using op_t = cub::Sum; @@ -132,7 +136,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f auto reduction_op = unwrap_op(reference_extended_fp(d_in_it), op_t{}); // Prepare verification data - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; output_t expected_result = static_cast(compute_single_problem_reference(in_items, reduction_op, accum_t{})); @@ -145,6 +149,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f // Verify result REQUIRE(expected_result == out_result[0]); } +#endif // TEST_TYPES != 4 // Skip DeviceReduce::Sum tests for extended floating-point types because of unbounded epsilon due // to pseudo associativity of the addition operation over floating point numbers @@ -152,7 +157,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f SECTION("sum") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data output_t expected_result = static_cast(compute_single_problem_reference(in_items, op_t{}, accum_t{})); @@ -197,6 +202,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f REQUIRE(expected_result == out_result[0]); } +#if TEST_TYPES != 4 SECTION("argmax") { // Prepare verification data @@ -233,4 +239,5 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f REQUIRE(expected_result[0] == gpu_value); REQUIRE((expected_result - host_items.cbegin()) == gpu_result.key); } +#endif } diff --git a/cub/test/catch2_test_device_reduce_by_key.cu b/cub/test/catch2_test_device_reduce_by_key.cu index 39f31d5e78..88c305fd36 100644 --- a/cub/test/catch2_test_device_reduce_by_key.cu +++ 
b/cub/test/catch2_test_device_reduce_by_key.cu @@ -116,7 +116,7 @@ CUB_TEST("Device reduce-by-key works", "[by_key][reduce][device]", full_type_lis auto reduction_op = unwrap_op(reference_extended_fp(d_values_it), op_t{}); // Prepare verification data - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; c2h::host_vector expected_result(num_segments); compute_segmented_problem_reference(in_values, segment_offsets, reduction_op, accum_t{}, expected_result.begin()); c2h::host_vector expected_keys = compute_unique_keys_reference(segment_keys); diff --git a/cub/test/catch2_test_device_reduce_by_key_iterators.cu b/cub/test/catch2_test_device_reduce_by_key_iterators.cu index 3637813b5f..14b7fcde9f 100644 --- a/cub/test/catch2_test_device_reduce_by_key_iterators.cu +++ b/cub/test/catch2_test_device_reduce_by_key_iterators.cu @@ -90,7 +90,7 @@ CUB_TEST("Device reduce-by-key works with iterators", "[by_key][reduce][device]" using op_t = cub::Sum; // Prepare verification data - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; c2h::host_vector expected_result(num_segments); compute_segmented_problem_reference(value_it, segment_offsets, op_t{}, accum_t{}, expected_result.begin()); c2h::host_vector expected_keys = compute_unique_keys_reference(segment_keys); diff --git a/cub/test/catch2_test_device_reduce_iterators.cu b/cub/test/catch2_test_device_reduce_iterators.cu index 7c7f74ec63..ab1dabbbb1 100644 --- a/cub/test/catch2_test_device_reduce_iterators.cu +++ b/cub/test/catch2_test_device_reduce_iterators.cu @@ -104,7 +104,7 @@ CUB_TEST("Device reduce works with fancy input iterators", "[reduce][device]", i auto reduction_op = op_t{}; // Prepare verification data - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; output_t expected_result = compute_single_problem_reference(in_it, in_it + num_items, reduction_op, accum_t{}); // Run test diff --git a/cub/test/catch2_test_device_scan.cu b/cub/test/catch2_test_device_scan.cu index 49c9aac39c..736e217b0e 100644 --- a/cub/test/catch2_test_device_scan.cu +++ b/cub/test/catch2_test_device_scan.cu @@ -127,7 +127,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_ SECTION("inclusive sum") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector host_items(in_items); @@ -155,7 +155,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_ SECTION("exclusive sum") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector host_items(in_items); @@ -184,7 +184,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_ SECTION("inclusive scan") { using op_t = cub::Min; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector host_items(in_items); @@ -213,7 +213,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_ SECTION("inclusive scan with init value") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Scan operator auto scan_op = unwrap_op(reference_extended_fp(d_in_it), op_t{}); @@ -248,7 +248,7 @@ CUB_TEST("Device scan works with all device interfaces", 
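// ::cuda::std::__accumulator_t is libcu++-internal, but the substitution made
// throughout these tests boils down to the following idea (a sketch, not the
// real trait): the accumulator type is the decayed result of applying the
// operator to an init value and an input value.
#include <functional>
#include <type_traits>
#include <utility>

template <typename Op, typename Init, typename Input>
using accum_sketch_t = std::decay_t<decltype(std::declval<Op>()(std::declval<Init>(), std::declval<Input>()))>;

static_assert(std::is_same<accum_sketch_t<std::plus<>, long, int>, long>::value,
              "int inputs accumulated against a long init widen to long");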
"[scan][device]", full_ SECTION("exclusive scan") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Scan operator auto scan_op = unwrap_op(reference_extended_fp(d_in_it), op_t{}); @@ -281,7 +281,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_ SECTION("exclusive scan with future-init value") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Scan operator auto scan_op = unwrap_op(reference_extended_fp(d_in_it), op_t{}); diff --git a/cub/test/catch2_test_device_scan.cuh b/cub/test/catch2_test_device_scan.cuh index d3644e3387..dc5b7804e8 100644 --- a/cub/test/catch2_test_device_scan.cuh +++ b/cub/test/catch2_test_device_scan.cuh @@ -61,7 +61,7 @@ template ; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; using output_t = cub::detail::value_t; accum_t acc = static_cast(init); for (; first != last; ++first) @@ -75,7 +75,7 @@ template ; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; using output_t = cub::detail::value_t; accum_t acc = static_cast(init); for (; first != last; ++first) @@ -101,7 +101,7 @@ void compute_exclusive_scan_by_key_reference( std::size_t num_items) { using value_t = cub::detail::value_t; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; using output_t = cub::detail::value_t; if (num_items > 0) @@ -152,7 +152,7 @@ void compute_inclusive_scan_by_key_reference( std::size_t num_items) { using value_t = cub::detail::value_t; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; using output_t = cub::detail::value_t; for (std::size_t i = 0; i < num_items;) diff --git a/cub/test/catch2_test_device_scan_iterators.cu b/cub/test/catch2_test_device_scan_iterators.cu index 576d0d3f74..a07397cc36 100644 --- a/cub/test/catch2_test_device_scan_iterators.cu +++ b/cub/test/catch2_test_device_scan_iterators.cu @@ -84,7 +84,7 @@ CUB_TEST("Device scan works with iterators", "[scan][device]", iterator_type_lis SECTION("inclusive sum") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector expected_result(num_items); @@ -102,7 +102,7 @@ CUB_TEST("Device scan works with iterators", "[scan][device]", iterator_type_lis SECTION("exclusive sum") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector expected_result(num_items); @@ -120,7 +120,7 @@ CUB_TEST("Device scan works with iterators", "[scan][device]", iterator_type_lis SECTION("inclusive scan") { using op_t = cub::Min; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector expected_result(num_items); @@ -139,7 +139,7 @@ CUB_TEST("Device scan works with iterators", "[scan][device]", iterator_type_lis SECTION("exclusive scan") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector expected_result(num_items); @@ -157,7 +157,7 @@ CUB_TEST("Device scan works with iterators", "[scan][device]", iterator_type_lis SECTION("exclusive scan with future-init value") { using op_t = cub::Sum; - 
using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data accum_t init_value{}; diff --git a/cub/test/catch2_test_device_scan_large_offsets.cu b/cub/test/catch2_test_device_scan_large_offsets.cu index 9d00d89e14..0c0854e21e 100644 --- a/cub/test/catch2_test_device_scan_large_offsets.cu +++ b/cub/test/catch2_test_device_scan_large_offsets.cu @@ -35,33 +35,12 @@ #include "catch2_test_helper.h" #include "catch2_test_launch_helper.h" -// TODO(elstehle) replace with DeviceScan interface once https://github.com/NVIDIA/cccl/issues/50 is addressed -// Temporary wrapper that allows specializing the DeviceScan algorithm for different offset types -template -CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_scan_wrapper( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - ScanOpT scan_op, - InitValueT init_value, - OffsetT num_items, - cudaStream_t stream = 0) -{ - using init_value_t = cub::detail::InputValue; - init_value_t init_value_wrapper{init_value}; - - return cub::DispatchScan::Dispatch( - d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value_wrapper, num_items, stream); -} - -DECLARE_LAUNCH_WRAPPER(dispatch_scan_wrapper, dispatch_exclusive_scan); +DECLARE_LAUNCH_WRAPPER(cub::DeviceScan::ExclusiveScan, device_exclusive_scan); // %PARAM% TEST_LAUNCH lid 0:1:2 -// TODO(elstehle) replace with actual offset types, once https://github.com/NVIDIA/cccl/issues/50 is addresed // List of offset types to be used for testing large number of items -using offset_types = c2h::type_list; +using offset_types = c2h::type_list; template struct expected_sum_op @@ -106,12 +85,12 @@ try offset_t num_items_max = static_cast(num_items_max_ull); offset_t num_items_min = num_items_max_ull > 10000 ? static_cast(num_items_max_ull - 10000ULL) : offset_t{0}; - // TODO(elstehle) remove single-item size, once https://github.com/NVIDIA/cccl/issues/50 is addresed - offset_t num_items = - GENERATE_COPY(values({num_items_max, static_cast(num_items_max - 1), static_cast(1)}), - take(2, random(num_items_min, num_items_max))); + offset_t num_items = GENERATE_COPY( + values( + {num_items_max, static_cast(num_items_max - 1), static_cast(1), static_cast(3)}), + take(2, random(num_items_min, num_items_max))); - // Prepare input + // Prepare input (generate a series of: 0, 1, 2, ..., , 0, 1, 2, ..., , 0, 1, ...) 
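// The input below is never materialized: a counting_iterator feeds a modulo
// functor, so dereferencing yields the repeating ramp lazily. A standalone
// sketch (the mod_op shape is assumed to match the test's helper):
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

struct mod_op_sketch
{
  long long mod;
  __host__ __device__ long long operator()(long long i) const { return i % mod; }
};

inline auto make_repeating_ramp(long long segment_size)
{
  auto index_it = thrust::make_counting_iterator(0LL);
  // for segment_size == 1000: *(it + 2500) == 500
  return thrust::make_transform_iterator(index_it, mod_op_sketch{segment_size});
}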
constexpr index_t segment_size = 1000; auto index_it = thrust::make_counting_iterator(index_t{}); auto items_it = thrust::make_transform_iterator(index_it, mod_op{segment_size}); @@ -120,8 +99,12 @@ try c2h::device_vector d_items_out(num_items); auto d_items_out_it = thrust::raw_pointer_cast(d_items_out.data()); + c2h::device_vector d_initial_value(1); + d_initial_value[0] = item_t{}; + auto future_init_value = cub::FutureValue(thrust::raw_pointer_cast(d_initial_value.data())); + // Run test - dispatch_exclusive_scan(items_it, d_items_out_it, op_t{}, item_t{}, num_items); + device_exclusive_scan(items_it, d_items_out_it, op_t{}, future_init_value, num_items); // Ensure that we created the correct output auto expected_out_it = diff --git a/cub/test/catch2_test_device_segmented_reduce.cu b/cub/test/catch2_test_device_segmented_reduce.cu index 770b85b019..5559e7e2e8 100644 --- a/cub/test/catch2_test_device_segmented_reduce.cu +++ b/cub/test/catch2_test_device_segmented_reduce.cu @@ -121,7 +121,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[segmented][reduce][ auto reduction_op = unwrap_op(reference_extended_fp(d_in_it), op_t{}); // Prepare verification data - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; c2h::host_vector expected_result(num_segments); compute_segmented_problem_reference(in_items, segment_offsets, reduction_op, accum_t{}, expected_result.begin()); @@ -142,7 +142,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[segmented][reduce][ SECTION("sum") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector expected_result(num_segments); diff --git a/cub/test/catch2_test_device_segmented_reduce_iterators.cu b/cub/test/catch2_test_device_segmented_reduce_iterators.cu index 8ab495ddc5..a81559b91e 100644 --- a/cub/test/catch2_test_device_segmented_reduce_iterators.cu +++ b/cub/test/catch2_test_device_segmented_reduce_iterators.cu @@ -93,7 +93,7 @@ CUB_TEST("Device segmented reduce works with fancy input iterators", "[reduce][d auto reduction_op = op_t{}; // Prepare verification data - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; c2h::host_vector expected_result(num_segments); compute_segmented_problem_reference(in_it, segment_offsets, reduction_op, accum_t{}, expected_result.begin()); diff --git a/cub/test/catch2_test_device_transform.cu b/cub/test/catch2_test_device_transform.cu new file mode 100644 index 0000000000..50f253ef5c --- /dev/null +++ b/cub/test/catch2_test_device_transform.cu @@ -0,0 +1,556 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
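// A sketch of the cub::FutureValue wrapper used in the test above: the init
// value lives in device memory and is read when the scan kernel runs, rather
// than being captured on the host at dispatch time.
#include <cub/util_type.cuh>
#include <thrust/device_vector.h>

inline cub::FutureValue<int> future_init_of(thrust::device_vector<int>& d_init)
{
  return cub::FutureValue<int>(thrust::raw_pointer_cast(d_init.data()));
}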
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include "c2h/custom_type.cuh" +#include "catch2_test_helper.h" +#include "catch2_test_launch_helper.h" +#include "test/test_util_vec.h" + +// %PARAM% TEST_LAUNCH lid 0:1:2 + +using cub::detail::transform::Algorithm; + +template +struct policy_hub_for_alg +{ + struct max_policy : cub::ChainedPolicy<300, max_policy, max_policy> + { + static constexpr int min_bif = 64 * 1024; + static constexpr Algorithm algorithm = Alg; + using algo_policy = + ::cuda::std::_If>; + }; +}; + +template +CUB_RUNTIME_FUNCTION static cudaError_t transform_many_with_alg_entry_point( + void* d_temp_storage, + size_t& temp_storage_bytes, + ::cuda::std::tuple inputs, + RandomAccessIteratorOut output, + Offset num_items, + TransformOp transform_op, + cudaStream_t stream = nullptr) +{ + if (d_temp_storage == nullptr) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + constexpr bool RequiresStableAddress = false; + return cub::detail::transform::dispatch_t, + RandomAccessIteratorOut, + TransformOp, + policy_hub_for_alg>{} + .dispatch(inputs, output, num_items, transform_op, stream); +} + +DECLARE_LAUNCH_WRAPPER(cub::DeviceTransform::Transform, transform_many); +DECLARE_LAUNCH_WRAPPER(cub::DeviceTransform::TransformStableArgumentAddresses, transform_many_stable); +DECLARE_TMPL_LAUNCH_WRAPPER(transform_many_with_alg_entry_point, + transform_many_with_alg, + ESCAPE_LIST(Algorithm Alg, typename Offset), + ESCAPE_LIST(Alg, Offset)); + +using algorithms = + c2h::enum_type_list; + +using offset_types = c2h::type_list; + +#ifdef _CUB_HAS_TRANSFORM_UBLKCP +# define FILTER_UBLKCP \ + if (alg == Algorithm::ublkcp && ptx_version < 900) \ + { \ + return; \ + } +#else // _CUB_HAS_TRANSFORM_UBLKCP +# define FILTER_UBLKCP +#endif // _CUB_HAS_TRANSFORM_UBLKCP + +#define FILTER_UNSUPPORTED_ALGS \ + int ptx_version = 0; \ + REQUIRE(cub::PtxVersion(ptx_version) == cudaSuccess); \ + _CCCL_DIAG_PUSH \ + _CCCL_DIAG_SUPPRESS_MSVC(4127) /* conditional expression is constant */ \ + FILTER_UBLKCP \ + _CCCL_DIAG_POP + +CUB_TEST("DeviceTransform::Transform BabelStream add", + "[device][device_transform]", + c2h::type_list, + offset_types, + algorithms) +{ + using type = typename c2h::get<0, TestType>; + using offset_t = typename c2h::get<1, TestType>; + constexpr auto alg = c2h::get<2, TestType>::value; + FILTER_UNSUPPORTED_ALGS + const int num_items = GENERATE(0, 1, 15, 16, 17, 127, 128, 129, 4095, 4096, 4097); // edge cases around 16 and 128 + CAPTURE(c2h::demangle(typeid(type).name()), c2h::demangle(typeid(offset_t).name()), alg, num_items); + + c2h::device_vector a(num_items); + c2h::device_vector b(num_items); + c2h::gen(CUB_SEED(1), a); + c2h::gen(CUB_SEED(1), b); + + c2h::device_vector result(num_items); + transform_many_with_alg( + ::cuda::std::make_tuple(a.begin(), b.begin()), result.begin(), num_items, ::cuda::std::plus{}); + + // compute reference and verify + c2h::host_vector a_h = a; + c2h::host_vector b_h = b; + c2h::host_vector reference_h(num_items); + std::transform(a_h.begin(), a_h.end(), b_h.begin(), reference_h.begin(), std::plus{}); + REQUIRE(reference_h == result); +} + +template +struct alignas(Alignment) overaligned_addable_t +{ + int value; + + overaligned_addable_t() = default; + + _CCCL_HOST_DEVICE overaligned_addable_t(int val) + : value{val} + {} + + _CCCL_HOST_DEVICE static 
void check(const overaligned_addable_t& obj)
+  {
+    if (reinterpret_cast<::cuda::std::uintptr_t>(&obj) % Alignment != 0)
+    {
+      printf("Error: object not aligned to %d: %p\n", Alignment, &obj);
+      ::cuda::std::terminate();
+    }
+  }
+
+  _CCCL_HOST_DEVICE friend auto operator==(const overaligned_addable_t& a, const overaligned_addable_t& b) -> bool
+  {
+    check(a);
+    check(b);
+    return a.value == b.value;
+  }
+
+  _CCCL_HOST_DEVICE friend auto
+  operator+(const overaligned_addable_t& a, const overaligned_addable_t& b) -> overaligned_addable_t
+  {
+    check(a);
+    check(b);
+    return overaligned_addable_t{a.value + b.value};
+  }
+
+  _CCCL_HOST friend auto operator<<(std::ostream& os, const overaligned_addable_t& obj) -> std::ostream&
+  {
+    check(obj);
+    return os << "over{" << obj.value << "}";
+  }
+};
+
+using overaligned_types =
+  c2h::type_list<overaligned_addable_t<32>
+#ifndef _CCCL_COMPILER_MSVC // error C2719: [...] formal parameter with requested alignment of 256 won't be aligned
+                 ,
+                 overaligned_addable_t<256>
+#endif // _CCCL_COMPILER_MSVC
+                 >;
+
+// test with types exceeding the memcpy_async and bulk copy alignments (16 and 128 bytes respectively)
+CUB_TEST("DeviceTransform::Transform overaligned type", "[device][device_transform]", overaligned_types)
+{
+  using type = c2h::get<0, TestType>;
+  CAPTURE(c2h::demangle(typeid(type).name()));
+
+  const int num_items = GENERATE(0, 1, 100, 1000);
+  c2h::device_vector<type> a(num_items, 3); // put some integers at the front, so SMEM has to handle different alignments
+  c2h::device_vector<type> b(num_items, 4);
+
+  c2h::device_vector<type> result(num_items);
+  // we need raw pointers here to halve the conversion sequence from device_reference -> int -> type when calling
+  // plus(...), which is too long to compile
+  transform_many(::cuda::std::make_tuple(thrust::raw_pointer_cast(a.data()), thrust::raw_pointer_cast(b.data())),
+                 result.begin(),
+                 num_items,
+                 ::cuda::std::plus{});
+
+  REQUIRE(result == c2h::device_vector<type>(num_items, 7));
+}
+
+CUB_TEST("DeviceTransform::Transform huge type", "[device][device_transform]")
+{
+  using huge_t = c2h::custom_type_t::type>;
+  static_assert(alignof(huge_t) == 8, "Need a large type with alignment < 16");
+  CAPTURE(c2h::demangle(typeid(huge_t).name()));
+
+  const int num_items = GENERATE(0, 1, 100, 1000);
+  c2h::device_vector<huge_t> a(num_items);
+  c2h::device_vector<huge_t> b(num_items);
+  c2h::gen(CUB_SEED(1), a);
+  c2h::gen(CUB_SEED(1), b);
+
+  c2h::device_vector<huge_t> result(num_items);
+  transform_many(::cuda::std::make_tuple(a.begin(), b.begin()), result.begin(), num_items, ::cuda::std::plus{});
+
+  c2h::host_vector<huge_t> a_h = a;
+  c2h::host_vector<huge_t> b_h = b;
+  c2h::host_vector<huge_t> reference_h(num_items);
+  std::transform(a_h.begin(), a_h.end(), b_h.begin(), reference_h.begin(), std::plus{});
+  REQUIRE(result == reference_h);
+}
+
+struct times_seven
+{
+  _CCCL_HOST_DEVICE auto operator()(unsigned char v) const -> char
+  {
+    return static_cast<char>(v * 7);
+  }
+};
+
+CUB_TEST("DeviceTransform::Transform with large input", "[device][device_transform]", algorithms)
+try
+{
+  using type     = unsigned char;
+  using offset_t = cuda::std::int64_t;
+  constexpr auto alg = c2h::get<0, TestType>::value;
+  FILTER_UNSUPPORTED_ALGS
+  CAPTURE(alg);
+
+  constexpr offset_t num_items = (offset_t{1} << 32) + 123456; // a few thread blocks beyond 4GiB
+  c2h::device_vector<type> input(num_items);
+  c2h::gen(CUB_SEED(1), input);
+
+  c2h::device_vector<type> result(num_items);
+  transform_many_with_alg<alg, offset_t>(
+    ::cuda::std::make_tuple(input.begin()), result.begin(), num_items, times_seven{});
+
+  // compute reference and verify
+  c2h::host_vector
input_h = input; + c2h::host_vector reference_h(num_items); + std::transform(input_h.begin(), input_h.end(), reference_h.begin(), times_seven{}); + REQUIRE((reference_h == result)); +} +catch (const std::bad_alloc&) +{ + // allocation failure is not a test failure, so we can run tests on smaller GPUs +} + +template +struct nstream_kernel +{ + static constexpr T scalar = 42; + + _CCCL_HOST_DEVICE T operator()(const T& ai, const T& bi, const T& ci) const + { + return ai + bi + scalar * ci; + } +}; + +// overwrites one input stream +CUB_TEST("DeviceTransform::Transform BabelStream nstream", + "[device][device_transform]", + c2h::type_list, + offset_types, + algorithms) +{ + using type = typename c2h::get<0, TestType>; + using offset_t = typename c2h::get<1, TestType>; + constexpr auto alg = c2h::get<2, TestType>::value; + FILTER_UNSUPPORTED_ALGS + CAPTURE(c2h::demangle(typeid(type).name()), c2h::demangle(typeid(offset_t).name()), alg); + + const int num_items = GENERATE(0, 1, 100, 1000, 10000); + c2h::device_vector a(num_items); + c2h::device_vector b(num_items); + c2h::device_vector c(num_items); + c2h::gen(CUB_SEED(1), a, type{10}, type{100}); + c2h::gen(CUB_SEED(1), b, type{10}, type{100}); + c2h::gen(CUB_SEED(1), c, type{10}, type{100}); + + // copy to host before changing + c2h::host_vector a_h = a; + c2h::host_vector b_h = b; + c2h::host_vector c_h = c; + + transform_many_with_alg( + ::cuda::std::make_tuple(a.begin(), b.begin(), c.begin()), a.begin(), num_items, nstream_kernel{}); + + // compute reference and verify + auto z = thrust::make_zip_iterator(a_h.begin(), b_h.begin(), c_h.begin()); + std::transform(z, z + num_items, a_h.begin(), thrust::make_zip_function(nstream_kernel{})); + REQUIRE(a_h == a); +} + +struct sum_five +{ + __device__ auto operator()(std::int8_t a, std::int16_t b, std::int32_t c, std::int64_t d, float e) const -> double + { + return a + b + c + d + e; + } +}; + +CUB_TEST("DeviceTransform::Transform add five streams", "[device][device_transform]", algorithms) +{ + using offset_t = int; + constexpr auto alg = c2h::get<0, TestType>::value; + FILTER_UNSUPPORTED_ALGS + + constexpr int num_items = 100; + c2h::device_vector a(num_items, 1); + c2h::device_vector b(num_items, 2); + c2h::device_vector c(num_items, 3); + c2h::device_vector d(num_items, 4); + c2h::device_vector e(num_items, 5); + + c2h::device_vector result(num_items); + transform_many_with_alg( + ::cuda::std::make_tuple(a.begin(), b.begin(), c.begin(), d.begin(), e.begin()), + result.begin(), + num_items, + sum_five{}); + + // compute reference and verify + c2h::device_vector reference(num_items, 1 + 2 + 3 + 4 + 5); + REQUIRE(reference == result); +} + +struct give_me_five +{ + __device__ auto operator()() const -> int + { + return 5; + } +}; + +CUB_TEST("DeviceTransform::Transform no streams", "[device][device_transform]") +{ + constexpr int num_items = 100; + c2h::device_vector result(num_items); + transform_many(::cuda::std::tuple<>{}, result.begin(), num_items, give_me_five{}); + + // compute reference and verify + c2h::device_vector reference(num_items, 5); + REQUIRE(reference == result); +} + +CUB_TEST("DeviceTransform::Transform fancy input iterator types", "[device][device_transform]") +{ + using type = int; + + constexpr int num_items = 100; + thrust::counting_iterator a{0}; + thrust::counting_iterator b{10}; + + c2h::device_vector result(num_items); + transform_many(::cuda::std::make_tuple(a, b), result.begin(), num_items, ::cuda::std::plus{}); + + // compute reference and verify + 
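// The host-side references in these transform tests follow one pattern; for the
// three-input nstream case above it is the zip-iterator form sketched here
// (scalar 42 as in nstream_kernel):
#include <thrust/host_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/zip_function.h>

#include <algorithm>

struct nstream_sketch
{
  int operator()(int a, int b, int c) const { return a + b + 42 * c; }
};

inline thrust::host_vector<int> nstream_reference(
  const thrust::host_vector<int>& a, const thrust::host_vector<int>& b, const thrust::host_vector<int>& c)
{
  thrust::host_vector<int> out(a.size());
  auto z = thrust::make_zip_iterator(a.begin(), b.begin(), c.begin());
  std::transform(z, z + out.size(), out.begin(), thrust::make_zip_function(nstream_sketch{}));
  return out;
}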
c2h::host_vector<type> reference_h(num_items);
+  std::transform(a, a + num_items, b, reference_h.begin(), std::plus{});
+  REQUIRE(reference_h == result);
+}
+
+CUB_TEST("DeviceTransform::Transform fancy output iterator type", "[device][device_transform]", algorithms)
+{
+  using type     = int;
+  using offset_t = int;
+  constexpr auto alg = c2h::get<0, TestType>::value;
+  FILTER_UNSUPPORTED_ALGS
+
+  constexpr int num_items = 100;
+  c2h::device_vector<type> a(num_items, 13);
+  c2h::device_vector<type> b(num_items, 35);
+  c2h::device_vector<type> result(num_items);
+
+  using thrust::placeholders::_1;
+  auto out = thrust::make_transform_output_iterator(result.begin(), _1 + 4);
+  transform_many_with_alg<alg, offset_t>(
+    ::cuda::std::make_tuple(a.begin(), b.begin()), out, num_items, ::cuda::std::plus{});
+  REQUIRE(result == c2h::device_vector<type>(num_items, (13 + 35) + 4));
+}
+
+CUB_TEST("DeviceTransform::Transform mixed input iterator types", "[device][device_transform]")
+{
+  using type = int;
+
+  constexpr int num_items = 100;
+  thrust::counting_iterator<type> a{0};
+  c2h::device_vector<type> b(num_items, 10);
+
+  c2h::device_vector<type> result(num_items);
+  transform_many(::cuda::std::make_tuple(a, b.begin()), result.begin(), num_items, ::cuda::std::plus{});
+
+  // compute reference and verify
+  c2h::host_vector<type> b_h = b;
+  c2h::host_vector<type> reference_h(num_items);
+  std::transform(a, a + num_items, b_h.begin(), reference_h.begin(), std::plus{});
+  REQUIRE(reference_h == result);
+}
+
+struct plus_needs_stable_address
+{
+  int* a;
+  int* b;
+
+  _CCCL_HOST_DEVICE int operator()(const int& v) const
+  {
+    const auto i = &v - a;
+    return v + b[i];
+  }
+};
+
+CUB_TEST("DeviceTransform::Transform address stability", "[device][device_transform]")
+{
+  using type = int;
+
+  constexpr int num_items = 100;
+  c2h::device_vector<type> a(num_items);
+  c2h::device_vector<type> b(num_items);
+  thrust::sequence(a.begin(), a.end());
+  thrust::sequence(b.begin(), b.end(), 42);
+
+  c2h::device_vector<type> result(num_items);
+  transform_many_stable(
+    ::cuda::std::make_tuple(thrust::raw_pointer_cast(a.data())),
+    result.begin(),
+    num_items,
+    plus_needs_stable_address{thrust::raw_pointer_cast(a.data()), thrust::raw_pointer_cast(b.data())});
+
+  // compute reference and verify
+  c2h::host_vector<type> a_h = a;
+  c2h::host_vector<type> b_h = b;
+  c2h::host_vector<type> reference_h(num_items);
+  std::transform(a_h.begin(), a_h.end(), b_h.begin(), reference_h.begin(), std::plus{});
+  REQUIRE(reference_h == result);
+}
+
+// Non-trivially-copyable/relocatable type which cannot be copied using std::memcpy or cudaMemcpy
+struct non_trivial
+{
+  int data;
+
+  non_trivial() = default;
+
+  _CCCL_HOST_DEVICE explicit non_trivial(int data)
+      : data(data)
+  {}
+
+  _CCCL_HOST_DEVICE non_trivial(const non_trivial& nt)
+      : data(nt.data)
+  {}
+
+  _CCCL_HOST_DEVICE auto operator=(const non_trivial& nt) -> non_trivial&
+  {
+    data = nt.data;
+    return *this;
+  }
+
+  _CCCL_HOST_DEVICE auto operator-() const -> non_trivial
+  {
+    return non_trivial{-data};
+  }
+
+  friend _CCCL_HOST_DEVICE auto operator==(non_trivial a, non_trivial b) -> bool
+  {
+    return a.data == b.data;
+  }
+};
+static_assert(!::cuda::std::is_trivially_copyable<non_trivial>::value, ""); // as required by the standard
+static_assert(!thrust::is_trivially_relocatable<non_trivial>::value, ""); // CUB uses this check internally
+
+// Note(bgruber): I gave up on writing a test that checks whether the copy ctor/assignment operator is actually called
+// (e.g. by tracking/counting invocations of those), since C++ allows (but does not guarantee) elision of these operations.
+// Also thrust algorithms perform a lot of copies in-between, so the test needs to use only raw allocations and +// iteration for setup and checking. +CUB_TEST("DeviceTransform::Transform not trivially relocatable", "[device][device_transform]") +{ + constexpr int num_items = 100; + c2h::device_vector input(num_items, non_trivial{42}); + c2h::device_vector result(num_items); + transform_many( + ::cuda::std::make_tuple(thrust::raw_pointer_cast(input.data())), result.begin(), num_items, ::cuda::std::negate<>{}); + + const auto reference = c2h::device_vector(num_items, non_trivial{-42}); + REQUIRE((reference == result)); +} + +CUB_TEST("DeviceTransform::Transform buffer start alignment", + "[device][device_transform]", + c2h::type_list) +{ + using type = typename c2h::get<0, TestType>; + + constexpr int num_items = 1000; + const int offset = GENERATE(1, 2, 4, 8, 16, 32, 64, 128); // global memory is always at least 256 byte aligned + CAPTURE(c2h::demangle(typeid(type).name()), offset); + c2h::device_vector input(num_items); + thrust::sequence(input.begin(), input.end()); + c2h::device_vector result(num_items); + using thrust::placeholders::_1; + transform_many(::cuda::std::make_tuple(input.begin() + offset), + result.begin() + offset, + num_items - offset, + _1 * 10); // FIXME(bgruber): does not work on negative + + c2h::device_vector reference(num_items); + thrust::tabulate(reference.begin() + offset, reference.end(), (_1 + offset) * 10); + REQUIRE(reference == result); +} + +namespace Catch +{ +template +struct StringMaker> +{ + static auto convert(cub::detail::transform::aligned_base_ptr abp) -> std::string + { + std::stringstream ss; + ss << "{ptr: " << abp.ptr << ", head_padding: " << abp.head_padding << "}"; + return ss.str(); + } +}; +} // namespace Catch + +// TODO(bgruber): rewrite this example using int3 +CUB_TEST("DeviceTransform::Transform aligned_base_ptr", "[device][device_transform]") +{ + alignas(128) int arr[256]; + using namespace cub::detail::transform; + CHECK(make_aligned_base_ptr(&arr[0], 128) == aligned_base_ptr{reinterpret_cast(&arr[0]), 0}); + CHECK(make_aligned_base_ptr(&arr[1], 128) == aligned_base_ptr{reinterpret_cast(&arr[0]), 4}); + CHECK(make_aligned_base_ptr(&arr[5], 128) == aligned_base_ptr{reinterpret_cast(&arr[0]), 20}); + CHECK(make_aligned_base_ptr(&arr[31], 128) == aligned_base_ptr{reinterpret_cast(&arr[0]), 124}); + CHECK(make_aligned_base_ptr(&arr[32], 128) == aligned_base_ptr{reinterpret_cast(&arr[32]), 0}); + CHECK(make_aligned_base_ptr(&arr[33], 128) == aligned_base_ptr{reinterpret_cast(&arr[32]), 4}); + CHECK(make_aligned_base_ptr(&arr[127], 128) == aligned_base_ptr{reinterpret_cast(&arr[96]), 124}); + CHECK(make_aligned_base_ptr(&arr[128], 128) == aligned_base_ptr{reinterpret_cast(&arr[128]), 0}); + CHECK(make_aligned_base_ptr(&arr[129], 128) == aligned_base_ptr{reinterpret_cast(&arr[128]), 4}); +} diff --git a/cub/test/catch2_test_device_transform_api.cu b/cub/test/catch2_test_device_transform_api.cu new file mode 100644 index 0000000000..46388ed6b2 --- /dev/null +++ b/cub/test/catch2_test_device_transform_api.cu @@ -0,0 +1,65 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +#include +#include + +#include "catch2_test_helper.h" + +// need a separate function because the ext. 
lambda needs to be enclosed by a function with external linkage on Windows +void test_transform_api() +{ + // example-begin transform-many + constexpr auto num_items = 4; + auto input1 = thrust::device_vector{0, -2, 5, 3}; + auto input2 = thrust::device_vector{5.2f, 3.1f, -1.1f, 3.0f}; + auto input3 = thrust::counting_iterator{100}; + auto op = [] __device__(int a, float b, int c) { + return (a + b) * c; + }; + + auto result = thrust::device_vector(num_items); + cub::DeviceTransform::Transform( + ::cuda::std::make_tuple(input1.begin(), input2.begin(), input3), result.begin(), num_items, op); + + const auto expected = thrust::host_vector{520, 111, 397, 618}; + // example-end transform-many + CHECK(result == expected); +} + +CUB_TEST("DeviceTransform::Transform API example", "[device][device_transform]") +{ + test_transform_api(); +} + +// need a separate function because the ext. lambda needs to be enclosed by a function with external linkage on Windows +void test_transform_stable_api() +{ + // example-begin transform-many-stable + constexpr auto num_items = 4; + auto input1 = thrust::device_vector{0, -2, 5, 3}; + auto input2 = thrust::device_vector{52, 31, -11, 30}; + + auto* input1_ptr = thrust::raw_pointer_cast(input1.data()); + auto* input2_ptr = thrust::raw_pointer_cast(input2.data()); + + auto op = [input1_ptr, input2_ptr] __device__(const int& a) -> int { + const auto i = &a - input1_ptr; // we depend on the address of a + return a + input2_ptr[i]; + }; + + auto result = thrust::device_vector(num_items); + cub::DeviceTransform::TransformStableArgumentAddresses( + ::cuda::std::make_tuple(input1_ptr), result.begin(), num_items, op); + + const auto expected = thrust::host_vector{52, 29, -6, 33}; + // example-end transform-many-stable + CHECK(result == expected); +} + +CUB_TEST("DeviceTransform::TransformStableArgumentAddresses API example", "[device][device_transform]") +{ + test_transform_stable_api(); +} diff --git a/cub/test/catch2_test_helper.h b/cub/test/catch2_test_helper.h index e62b98adf1..7689c416f3 100644 --- a/cub/test/catch2_test_helper.h +++ b/cub/test/catch2_test_helper.h @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -41,6 +42,7 @@ _CCCL_NV_DIAG_SUPPRESS(177) // catch2 may contain unused variableds #endif // nvcc-11 +#include #include #include @@ -133,8 +135,8 @@ struct bitwise_equal bool operator()(const T& a, const T& b) const { using bits_t = typename cub::Traits::UnsignedBits; - bits_t a_bits = c2h::bit_cast(a); - bits_t b_bits = c2h::bit_cast(b); + bits_t a_bits = ::cuda::std::bit_cast(a); + bits_t b_bits = ::cuda::std::bit_cast(b); return a_bits == b_bits; } }; @@ -250,10 +252,22 @@ struct Catch::StringMaker #define CUB_TEST_STR(a) #a +namespace detail +{ +inline std::size_t adjust_seed_count(std::size_t requested) +{ + // Setting this environment variable forces a fixed number of seeds to be generated, regardless of the requested + // count. Set to 1 to reduce redundant, expensive testing when using sanitizers, etc. + static const char* override_str = std::getenv("CCCL_SEED_COUNT_OVERRIDE"); + static int override = override_str ? std::atoi(override_str) : 0; + return override_str ? 
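// A sketch of the CCCL_SEED_COUNT_OVERRIDE mechanism implemented above: the
// environment is inspected once via static-local initialization, so every
// CUB_SEED expansion reuses the same decision without calling getenv again.
#include <cstddef>
#include <cstdlib>

inline std::size_t adjusted_seed_count(std::size_t requested)
{
  static const char* env = std::getenv("CCCL_SEED_COUNT_OVERRIDE"); // evaluated once
  static const std::size_t forced = env ? static_cast<std::size_t>(std::atoi(env)) : 0;
  return env ? forced : requested;
}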
override : requested; +} +} // namespace detail + #define CUB_SEED(N) \ c2h::seed_t \ { \ GENERATE_COPY(take( \ - N, \ + detail::adjust_seed_count(N), \ random(std::numeric_limits::min(), std::numeric_limits::max()))) \ } diff --git a/cub/test/catch2_test_launch_helper.h b/cub/test/catch2_test_launch_helper.h index 311fea93b1..4add1d15d1 100644 --- a/cub/test/catch2_test_launch_helper.h +++ b/cub/test/catch2_test_launch_helper.h @@ -31,7 +31,8 @@ #include "catch2_test_helper.h" -//! @file This file contains utilities for device-scope API tests +//! @file +//! This file contains utilities for device-scope API tests //! //! Device-scope API in CUB can be launched from the host or device side. //! Utilities in this file facilitate testing in both cases. @@ -73,7 +74,7 @@ //! Consult with `test/catch2_test_cdp_wrapper.cu` for more usage examples. #if !defined(TEST_LAUNCH) -# error Test file should contain %PARAM% TEST_LAUNCH lid 0:1 +# error Test file should contain %PARAM% TEST_LAUNCH lid 0:1:2 #endif #define DECLARE_INVOCABLE(API, WRAPPED_API_NAME, TMPL_HEAD_OPT, TMPL_ARGS_OPT) \ diff --git a/cub/test/catch2_test_nvrtc.cu b/cub/test/catch2_test_nvrtc.cu index 466c3fa978..8dddb38c57 100644 --- a/cub/test/catch2_test_nvrtc.cu +++ b/cub/test/catch2_test_nvrtc.cu @@ -55,6 +55,7 @@ TEST_CASE("Test nvrtc", "[test][nvrtc]") "#include \n" "#include \n" "#include \n" + "#include \n" " \n" "extern \"C\" __global__ void kernel(int *ptr, int *errors) \n" "{ \n" @@ -225,10 +226,11 @@ TEST_CASE("Test nvrtc", "[test][nvrtc]") int ptx_version{}; cub::PtxVersion(ptx_version); const std::string arch = std::string("-arch=sm_") + std::to_string(ptx_version / 10); + const std::string std = std::string("-std=c++") + std::to_string(_CCCL_STD_VER - 2000); - constexpr int num_includes = 5; + constexpr int num_includes = 6; const char* includes[num_includes] = { - NVRTC_CUB_PATH, NVRTC_THRUST_PATH, NVRTC_LIBCUDACXX_PATH, NVRTC_CTK_PATH, arch.c_str()}; + NVRTC_CUB_PATH, NVRTC_THRUST_PATH, NVRTC_LIBCUDACXX_PATH, NVRTC_CTK_PATH, arch.c_str(), std.c_str()}; std::size_t log_size{}; nvrtcResult compile_result = nvrtcCompileProgram(prog, num_includes, includes); diff --git a/cub/test/test_block_radix_rank.cu b/cub/test/test_block_radix_rank.cu index 6d36378882..8c1df1a80c 100644 --- a/cub/test/test_block_radix_rank.cu +++ b/cub/test/test_block_radix_rank.cu @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -240,7 +241,7 @@ void Test() cub::detail::block_radix_rank_t; using storage_t = typename block_radix_rank::TempStorage; - cub::Int2Type<(sizeof(storage_t) <= 48 * 1024)> fits_smem_capacity; + cub::Int2Type<(sizeof(storage_t) <= cub::detail::max_smem_per_block)> fits_smem_capacity; TestValid(fits_smem_capacity); } diff --git a/cudax/cmake/cudaxBuildCompilerTargets.cmake b/cudax/cmake/cudaxBuildCompilerTargets.cmake index 53cf7b8af4..0b2c0a2737 100644 --- a/cudax/cmake/cudaxBuildCompilerTargets.cmake +++ b/cudax/cmake/cudaxBuildCompilerTargets.cmake @@ -9,7 +9,6 @@ include("${cudax_SOURCE_DIR}/cmake/AppendOptionIfAvailable.cmake") function(cudax_build_compiler_targets) - set(cxx_compile_definitions LIBCUDACXX_ENABLE_EXCEPTIONS) set(cxx_compile_options) set(cuda_compile_options) @@ -66,6 +65,8 @@ function(cudax_build_compiler_targets) # GCC 7.3 complains about name mangling changes due to `noexcept` append_option_if_available("-Wno-noexcept-type" cxx_compile_options) + + append_option_if_available("-Wmissing-field-initializers" cxx_compile_options) endif() if ("Clang" STREQUAL 
"${CMAKE_CXX_COMPILER_ID}") diff --git a/cudax/cmake/cudaxHeaderTesting.cmake b/cudax/cmake/cudaxHeaderTesting.cmake index 29a3bd58ca..824b1a4fda 100644 --- a/cudax/cmake/cudaxHeaderTesting.cmake +++ b/cudax/cmake/cudaxHeaderTesting.cmake @@ -14,6 +14,12 @@ file(GLOB_RECURSE headers "${cudax_SOURCE_DIR}/include/*.h" ) +# The following internal headers are not required to compile independently: +list(REMOVE_ITEM headers + "cuda/experimental/__async/prologue.cuh" + "cuda/experimental/__async/epilogue.cuh" +) + set(headertest_srcs) foreach (header IN LISTS headers) set(headertest_src "headers/${header}.cu") diff --git a/cudax/cmake/header_test.in.cu b/cudax/cmake/header_test.in.cu index 771ca319db..fd2df1987d 100644 --- a/cudax/cmake/header_test.in.cu +++ b/cudax/cmake/header_test.in.cu @@ -34,7 +34,9 @@ #define I CUDAX_MACRO_CHECK('I', complex.h) // windows.h conflicts -#define small CUDAX_MACRO_CHECK('small', windows.h) +// @eniebler 2024-08-30: This test is disabled because it causes build +// failures in some configurations. +// #define small CUDAX_MACRO_CHECK('small', windows.h) // We can't enable these checks without breaking some builds -- some standard // library implementations unconditionally `#undef` these macros, which then // causes random failures later. diff --git a/cudax/include/cuda/experimental/__async/async.cuh b/cudax/include/cuda/experimental/__async/async.cuh new file mode 100644 index 0000000000..ed53717bca --- /dev/null +++ b/cudax/include/cuda/experimental/__async/async.cuh @@ -0,0 +1,48 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __CUDAX_ASYNC_DETAIL_ASYNC +#define __CUDAX_ASYNC_DETAIL_ASYNC + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// Include this first +#include + +// Include the other implementation headers: +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#endif // __CUDAX_ASYNC_DETAIL_ASYNC diff --git a/cudax/include/cuda/experimental/__async/basic_sender.cuh b/cudax/include/cuda/experimental/__async/basic_sender.cuh new file mode 100644 index 0000000000..5730078ecc --- /dev/null +++ b/cudax/include/cuda/experimental/__async/basic_sender.cuh @@ -0,0 +1,255 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_BASIC_SENDER
+#define __CUDAX_ASYNC_DETAIL_BASIC_SENDER
+
+#include 
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+# pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+# pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+# pragma system_header
+#endif // no system header
+
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+template <class _Data, class _Rcvr>
+struct __state
+{
+  _Data __data_;
+  _Rcvr __receiver_;
+};
+
+struct receiver_defaults
+{
+  using receiver_concept = __async::receiver_t;
+
+  template <class _Rcvr, class... _Args>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE static auto set_value(__ignore, _Rcvr& __rcvr, _Args&&... __args) noexcept
+    -> __async::completion_signatures<__async::set_value_t(_Args...)>
+  {
+    __async::set_value(static_cast<_Rcvr&&>(__rcvr), static_cast<_Args&&>(__args)...);
+    return {};
+  }
+
+  template <class _Rcvr, class _Error>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE static auto set_error(__ignore, _Rcvr& __rcvr, _Error&& __error) noexcept
+    -> __async::completion_signatures<__async::set_error_t(_Error)>
+  {
+    __async::set_error(static_cast<_Rcvr&&>(__rcvr), static_cast<_Error&&>(__error));
+    return {};
+  }
+
+  template <class _Rcvr>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE static auto
+  set_stopped(__ignore, _Rcvr& __rcvr) noexcept -> __async::completion_signatures<__async::set_stopped_t()>
+  {
+    __async::set_stopped(static_cast<_Rcvr&&>(__rcvr));
+    return {};
+  }
+
+  template <class _Rcvr>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE static decltype(auto) get_env(__ignore, const _Rcvr& __rcvr) noexcept
+  {
+    return __async::get_env(__rcvr);
+  }
+};
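// Aside (illustrative sketch, not part of the header above): receiver_defaults
// uses a trick worth calling out. Each member both performs the completion and
// encodes the produced completion signature in its return type, so senders can
// recover the completions via decltype. A self-contained miniature, with local
// stand-in types rather than the cudax ones:
//
//   #include <type_traits>
//
//   template <class... Sigs>
//   struct completion_signatures {};
//
//   struct set_value_t {};
//
//   template <class... Args>
//   auto default_set_value(Args&&...) noexcept
//     -> completion_signatures<set_value_t(Args...)>
//   {
//     // ... forward to the wrapped receiver here ...
//     return {};
//   }
//
//   // The signature is recoverable from the return type alone:
//   static_assert(std::is_same_v<decltype(default_set_value(1, 2.0)),
//                                completion_signatures<set_value_t(int, double)>>);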
+
+template <class _Data, class _Rcvr>
+struct basic_receiver
+{
+  using receiver_concept = __async::receiver_t;
+  using __rcvr_t = typename _Data::receiver_tag;
+  __state<_Data, _Rcvr>& __state_;
+
+  template <class... _Args>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void set_value(_Args&&... __args) noexcept
+  {
+    __rcvr_t::set_value(__state_.__data_, __state_.__receiver_, (_Args&&) __args...);
+  }
+
+  template <class _Error>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void set_error(_Error&& __error) noexcept
+  {
+    __rcvr_t::set_error(__state_.__data_, __state_.__receiver_, (_Error&&) __error);
+  }
+
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void set_stopped() noexcept
+  {
+    __rcvr_t::set_stopped(__state_.__data_, __state_.__receiver_);
+  }
+
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE decltype(auto) get_env() const noexcept
+  {
+    return __rcvr_t::get_env(__state_.__data_, __state_.__receiver_);
+  }
+};
+
+template <class _Rcvr>
+_CCCL_INLINE_VAR constexpr bool has_no_environment = _CUDA_VSTD::is_same_v<_Rcvr, receiver_archetype>;
+
+template <bool _HasStopped, class _Data, class _Rcvr>
+struct __mk_completions
+{
+  using __rcvr_t = typename _Data::receiver_tag;
+
+  template <class... _Args>
+  using __set_value_t =
+    decltype(+*__rcvr_t::set_value(__declval<_Data&>(), __declval<receiver_archetype&>(), __declval<_Args>()...));
+
+  template <class _Error>
+  using __set_error_t =
+    decltype(+*__rcvr_t::set_error(__declval<_Data&>(), __declval<receiver_archetype&>(), __declval<_Error>()));
+
+  using __set_stopped_t = __async::completion_signatures<>;
+};
+
+template <class _Data, class _Rcvr>
+struct __mk_completions<true, _Data, _Rcvr> : __mk_completions<false, _Data, _Rcvr>
+{
+  using __rcvr_t = typename _Data::receiver_tag;
+
+  using __set_stopped_t = decltype(+*__rcvr_t::set_stopped(__declval<_Data&>(), __declval<receiver_archetype&>()));
+};
+
+template <class... _Values>
+using __ignore_value_signature = __async::completion_signatures<>;
+
+template <class _Error>
+using __ignore_error_signature = __async::completion_signatures<>;
+
+template <class _Completions>
+constexpr bool __has_stopped =
+  !_CUDA_VSTD::is_same_v<__async::completion_signatures<>,
+                         __async::transform_completion_signatures<_Completions,
+                                                                  __async::completion_signatures<>,
+                                                                  __ignore_value_signature,
+                                                                  __ignore_error_signature>>;
+
+template <bool _PotentiallyThrowing, class _Rcvr>
+void set_current_exception_if([[maybe_unused]] _Rcvr& __rcvr) noexcept
+{
+  if constexpr (_PotentiallyThrowing)
+  {
+    __async::set_error(static_cast<_Rcvr&&>(__rcvr), ::std::current_exception());
+  }
+}
+
+// A generic type that holds the data for an async operation, and
+// that provides a `start` method for enqueuing the work.
+template <class _Sndr, class _Data, class _Rcvr>
+struct __basic_opstate
+{
+  using __rcvr_t = basic_receiver<_Data, _Rcvr>;
+  using __completions_t = completion_signatures_of_t<_Sndr, __rcvr_t>;
+  using __traits_t = __mk_completions<__has_stopped<__completions_t>, _Data, _Rcvr>;
+
+  using completion_signatures = //
+    transform_completion_signatures<__completions_t,
+                                    // TODO: add set_error_t(exception_ptr) if constructing
+                                    // the state or connecting the sender is potentially throwing.
+                                    __async::completion_signatures<>,
+                                    __traits_t::template __set_value_t,
+                                    __traits_t::template __set_error_t,
+                                    typename __traits_t::__set_stopped_t>;
+
+  _CCCL_HOST_DEVICE __basic_opstate(_Sndr&& __sndr, _Data __data, _Rcvr __rcvr)
+      : __state_{static_cast<_Data&&>(__data), static_cast<_Rcvr&&>(__rcvr)}
+      , __op_(__async::connect(static_cast<_Sndr&&>(__sndr), __rcvr_t{__state_}))
+  {}
+
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void start() noexcept
+  {
+    __async::start(__op_);
+  }
+
+  __state<_Data, _Rcvr> __state_;
+  __async::connect_result_t<_Sndr, __rcvr_t> __op_;
+};
+
+template <class _Sndr, class _Rcvr>
+_CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto __make_opstate(_Sndr __sndr, _Rcvr __rcvr)
+{
+  auto [__tag, __data, __child] = static_cast<_Sndr&&>(__sndr);
+  using __data_t = decltype(__data);
+  using __child_t = decltype(__child);
+  (void) __tag;
+  return __basic_opstate(
+    static_cast<__child_t&&>(__child), static_cast<__data_t&&>(__data), static_cast<_Rcvr&&>(__rcvr));
+}
+
+template <class _Data, class... _Sndrs>
+_CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto
+__get_attrs(int, const _Data& __data, const _Sndrs&... __sndrs) noexcept -> decltype(__data.get_attrs(__sndrs...))
+{
+  return __data.get_attrs(__sndrs...);
+}
+
+template <class _Data, class... _Sndrs>
+_CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto
+__get_attrs(long, const _Data&, const _Sndrs&... __sndrs) noexcept -> decltype(__async::get_env(__sndrs...))
+{
+  return __async::get_env(__sndrs...);
+}
+
+template <class _Data, class... _Sndrs>
+struct basic_sender;
+
+template <class _Data, class _Sndr>
+struct basic_sender<_Data, _Sndr>
+{
+  using sender_concept = __async::sender_t;
+  using __tag_t = typename _Data::sender_tag;
+  using __rcvr_t = typename _Data::receiver_tag;
+
+  _CCCL_NO_UNIQUE_ADDRESS __tag_t __tag_;
+  _Data __data_;
+  _Sndr __sndr_;
+
+  // Connect the sender to the receiver (the continuation) and
+  // return the state_type object for this operation.
+  template <class _Rcvr>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto connect(_Rcvr __rcvr) &&
+  {
+    return __make_opstate(static_cast<basic_sender&&>(*this), static_cast<_Rcvr&&>(__rcvr));
+  }
+
+  template <class _Rcvr>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto connect(_Rcvr __rcvr) const&
+  {
+    return __make_opstate(*this, static_cast<_Rcvr&&>(__rcvr));
+  }
+
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE decltype(auto) get_env() const noexcept
+  {
+    return __async::__get_attrs(0, __data_, __sndr_);
+  }
+};
+
+template <class _Data, class... _Sndrs>
+basic_sender(__ignore, _Data, _Sndrs...) -> basic_sender<_Data, _Sndrs...>;
+
+} // namespace cuda::experimental::__async
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
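The file above supplies the generic plumbing behind cudax's sender algorithms. For orientation, here is a dependency-free toy of the protocol it implements (illustrative only, not the cudax API): connect binds a sender to a receiver and returns an operation state, and start runs the work, which finishes through exactly one of the receiver's three completion channels.

#include <exception>
#include <iostream>

// A receiver: one callback per completion channel.
struct print_receiver
{
  void set_value(int v) noexcept { std::cout << "value: " << v << '\n'; }
  void set_error(std::exception_ptr) noexcept { std::cout << "error\n"; }
  void set_stopped() noexcept { std::cout << "stopped\n"; }
};

// The operation state owns everything the work needs; start() runs it.
template <class Rcvr>
struct just_op
{
  int value;
  Rcvr rcvr;
  void start() noexcept { rcvr.set_value(value); }
};

// A trivial sender that completes immediately with a stored value.
struct just_sender
{
  int value;
  template <class Rcvr>
  just_op<Rcvr> connect(Rcvr rcvr) const { return {value, rcvr}; }
};

int main()
{
  auto op = just_sender{42}.connect(print_receiver{});
  op.start(); // prints "value: 42"
}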
diff --git a/cudax/include/cuda/experimental/__async/completion_signatures.cuh b/cudax/include/cuda/experimental/__async/completion_signatures.cuh
new file mode 100644
index 0000000000..c4edf4b618
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/completion_signatures.cuh
@@ -0,0 +1,336 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_COMPLETION_SIGNATURES
+#define __CUDAX_ASYNC_DETAIL_COMPLETION_SIGNATURES
+
+#include 
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+# pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+# pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+# pragma system_header
+#endif // no system header
+
+#include 
+
+#include 
+#include 
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+// A typelist for completion signatures
+template <class... _Sigs>
+struct completion_signatures
+{};
+
+// A metafunction to determine if a type is a completion signature
+template <class _Sig>
+_CCCL_INLINE_VAR constexpr bool __is_valid_signature = false;
+
+template <class... _Values>
+_CCCL_INLINE_VAR constexpr bool __is_valid_signature<set_value_t(_Values...)> = true;
+
+template <class _Error>
+_CCCL_INLINE_VAR constexpr bool __is_valid_signature<set_error_t(_Error)> = true;
+
+template <>
+_CCCL_INLINE_VAR constexpr bool __is_valid_signature<set_stopped_t()> = true;
+
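// Aside (illustrative sketch, not part of the header above): the metafunction
// above works by partially specializing a variable template over function
// types whose "return type" is one of the three completion tags. A
// self-contained miniature with local tag types:
//
//   #include <exception>
//
//   struct set_value_t {};
//   struct set_error_t {};
//   struct set_stopped_t {};
//
//   template <class Sig>
//   inline constexpr bool is_valid_signature = false;
//
//   template <class... Values>
//   inline constexpr bool is_valid_signature<set_value_t(Values...)> = true;
//
//   template <class Error>
//   inline constexpr bool is_valid_signature<set_error_t(Error)> = true;
//
//   template <>
//   inline constexpr bool is_valid_signature<set_stopped_t()> = true;
//
//   static_assert(is_valid_signature<set_value_t(int, float)>);
//   static_assert(is_valid_signature<set_error_t(std::exception_ptr)>);
//   static_assert(is_valid_signature<set_stopped_t()>);
//   static_assert(!is_valid_signature<int>);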
+// The implementation of transform_completion_signatures starts here
+template <class _Sig, template <class...> class _Vy, template <class> class _Ey, class _Sy>
+extern __undefined<_Sig> __transform_sig;
+
+template <class... _Values, template <class...> class _Vy, template <class> class _Ey, class _Sy>
+extern __fn_t<_Vy<_Values...>>* __transform_sig<set_value_t(_Values...), _Vy, _Ey, _Sy>;
+
+template <class _Error, template <class...> class _Vy, template <class> class _Ey, class _Sy>
+extern __fn_t<_Ey<_Error>>* __transform_sig<set_error_t(_Error), _Vy, _Ey, _Sy>;
+
+template