From fe27d99255f43eacef77ccf9d308234d3532eafd Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Mon, 5 Aug 2024 16:27:15 +0200 Subject: [PATCH 01/33] Fix the `clang-format` path in the devcontainers (#2194) In the devcontainers `clang-format` is now installed into `/usr/bin/clang-format` --- .devcontainer/cuda11.1-gcc6/devcontainer.json | 2 +- .devcontainer/cuda11.1-gcc7/devcontainer.json | 2 +- .devcontainer/cuda11.1-gcc8/devcontainer.json | 2 +- .devcontainer/cuda11.1-gcc9/devcontainer.json | 2 +- .devcontainer/cuda11.1-llvm9/devcontainer.json | 2 +- .devcontainer/cuda11.8-gcc11/devcontainer.json | 2 +- .devcontainer/cuda12.0-gcc10/devcontainer.json | 2 +- .devcontainer/cuda12.0-gcc11/devcontainer.json | 2 +- .devcontainer/cuda12.0-gcc12/devcontainer.json | 2 +- .devcontainer/cuda12.0-gcc9/devcontainer.json | 2 +- .devcontainer/cuda12.0-llvm10/devcontainer.json | 2 +- .devcontainer/cuda12.0-llvm11/devcontainer.json | 2 +- .devcontainer/cuda12.0-llvm12/devcontainer.json | 2 +- .devcontainer/cuda12.0-llvm13/devcontainer.json | 2 +- .devcontainer/cuda12.0-llvm14/devcontainer.json | 2 +- .devcontainer/cuda12.0-llvm9/devcontainer.json | 2 +- .devcontainer/cuda12.5-gcc10/devcontainer.json | 2 +- .devcontainer/cuda12.5-gcc11/devcontainer.json | 2 +- .devcontainer/cuda12.5-gcc12/devcontainer.json | 2 +- .devcontainer/cuda12.5-gcc13/devcontainer.json | 2 +- .devcontainer/cuda12.5-gcc7/devcontainer.json | 2 +- .devcontainer/cuda12.5-gcc8/devcontainer.json | 2 +- .devcontainer/cuda12.5-gcc9/devcontainer.json | 2 +- .devcontainer/cuda12.5-llvm10/devcontainer.json | 2 +- .devcontainer/cuda12.5-llvm11/devcontainer.json | 2 +- .devcontainer/cuda12.5-llvm12/devcontainer.json | 2 +- .devcontainer/cuda12.5-llvm13/devcontainer.json | 2 +- .devcontainer/cuda12.5-llvm14/devcontainer.json | 2 +- .devcontainer/cuda12.5-llvm15/devcontainer.json | 2 +- .devcontainer/cuda12.5-llvm16/devcontainer.json | 2 +- .devcontainer/cuda12.5-llvm17/devcontainer.json | 2 +- .devcontainer/cuda12.5-llvm9/devcontainer.json | 2 +- .devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json | 2 +- .devcontainer/devcontainer.json | 2 +- 34 files changed, 34 insertions(+), 34 deletions(-) diff --git a/.devcontainer/cuda11.1-gcc6/devcontainer.json b/.devcontainer/cuda11.1-gcc6/devcontainer.json index ed345016ec..cd810c13dd 100644 --- a/.devcontainer/cuda11.1-gcc6/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc6/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda11.1-gcc7/devcontainer.json b/.devcontainer/cuda11.1-gcc7/devcontainer.json index b1ff078547..9db4454383 100644 --- a/.devcontainer/cuda11.1-gcc7/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc7/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda11.1-gcc8/devcontainer.json b/.devcontainer/cuda11.1-gcc8/devcontainer.json index f480d0003a..143b42abdf 100644 --- a/.devcontainer/cuda11.1-gcc8/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc8/devcontainer.json @@ -38,7 +38,7 @@ "settings": { 
"editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda11.1-gcc9/devcontainer.json b/.devcontainer/cuda11.1-gcc9/devcontainer.json index a622e14519..e5aaa70339 100644 --- a/.devcontainer/cuda11.1-gcc9/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc9/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda11.1-llvm9/devcontainer.json b/.devcontainer/cuda11.1-llvm9/devcontainer.json index 3eaa29a8b8..ccf1bd9a81 100644 --- a/.devcontainer/cuda11.1-llvm9/devcontainer.json +++ b/.devcontainer/cuda11.1-llvm9/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda11.8-gcc11/devcontainer.json b/.devcontainer/cuda11.8-gcc11/devcontainer.json index 4d03dc2de0..e8d2c3d94e 100644 --- a/.devcontainer/cuda11.8-gcc11/devcontainer.json +++ b/.devcontainer/cuda11.8-gcc11/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.0-gcc10/devcontainer.json b/.devcontainer/cuda12.0-gcc10/devcontainer.json index 1371a181a9..9c1c1c3328 100644 --- a/.devcontainer/cuda12.0-gcc10/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc10/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.0-gcc11/devcontainer.json b/.devcontainer/cuda12.0-gcc11/devcontainer.json index 2096821c11..c86d5cba2d 100644 --- a/.devcontainer/cuda12.0-gcc11/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc11/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.0-gcc12/devcontainer.json b/.devcontainer/cuda12.0-gcc12/devcontainer.json index e99c8debae..af192d3938 100644 --- a/.devcontainer/cuda12.0-gcc12/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc12/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git 
a/.devcontainer/cuda12.0-gcc9/devcontainer.json b/.devcontainer/cuda12.0-gcc9/devcontainer.json index 3154808232..434b1b69f2 100644 --- a/.devcontainer/cuda12.0-gcc9/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc9/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.0-llvm10/devcontainer.json b/.devcontainer/cuda12.0-llvm10/devcontainer.json index b4bf89b341..15f4c622f9 100644 --- a/.devcontainer/cuda12.0-llvm10/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm10/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.0-llvm11/devcontainer.json b/.devcontainer/cuda12.0-llvm11/devcontainer.json index b87d457cb7..f3bc6a558a 100644 --- a/.devcontainer/cuda12.0-llvm11/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm11/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.0-llvm12/devcontainer.json b/.devcontainer/cuda12.0-llvm12/devcontainer.json index 829ec1cb2e..032c783fef 100644 --- a/.devcontainer/cuda12.0-llvm12/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm12/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.0-llvm13/devcontainer.json b/.devcontainer/cuda12.0-llvm13/devcontainer.json index 60abc033be..eb700e0615 100644 --- a/.devcontainer/cuda12.0-llvm13/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm13/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.0-llvm14/devcontainer.json b/.devcontainer/cuda12.0-llvm14/devcontainer.json index a48b0bcd0c..935275ed5b 100644 --- a/.devcontainer/cuda12.0-llvm14/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm14/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.0-llvm9/devcontainer.json b/.devcontainer/cuda12.0-llvm9/devcontainer.json index 465478e431..7127d6c2db 100644 --- a/.devcontainer/cuda12.0-llvm9/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm9/devcontainer.json @@ -38,7 +38,7 @@ "settings": { 
"editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-gcc10/devcontainer.json b/.devcontainer/cuda12.5-gcc10/devcontainer.json index 5a59153bf3..b16f5b5d4d 100644 --- a/.devcontainer/cuda12.5-gcc10/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc10/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-gcc11/devcontainer.json b/.devcontainer/cuda12.5-gcc11/devcontainer.json index 42b668abf1..c3c5ca3199 100644 --- a/.devcontainer/cuda12.5-gcc11/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc11/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-gcc12/devcontainer.json b/.devcontainer/cuda12.5-gcc12/devcontainer.json index d807d4cd30..f3996dac8e 100644 --- a/.devcontainer/cuda12.5-gcc12/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc12/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-gcc13/devcontainer.json b/.devcontainer/cuda12.5-gcc13/devcontainer.json index 01364fdbc2..74031d3657 100644 --- a/.devcontainer/cuda12.5-gcc13/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc13/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-gcc7/devcontainer.json b/.devcontainer/cuda12.5-gcc7/devcontainer.json index a632769505..88f0060a87 100644 --- a/.devcontainer/cuda12.5-gcc7/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc7/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-gcc8/devcontainer.json b/.devcontainer/cuda12.5-gcc8/devcontainer.json index f0aff7ba7b..9f8b6020c5 100644 --- a/.devcontainer/cuda12.5-gcc8/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc8/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git 
a/.devcontainer/cuda12.5-gcc9/devcontainer.json b/.devcontainer/cuda12.5-gcc9/devcontainer.json index e050d23303..422a20c62b 100644 --- a/.devcontainer/cuda12.5-gcc9/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc9/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-llvm10/devcontainer.json b/.devcontainer/cuda12.5-llvm10/devcontainer.json index 0cda7b0a66..028509f6ef 100644 --- a/.devcontainer/cuda12.5-llvm10/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm10/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-llvm11/devcontainer.json b/.devcontainer/cuda12.5-llvm11/devcontainer.json index 1a513873f1..5f4d3f4c1d 100644 --- a/.devcontainer/cuda12.5-llvm11/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm11/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-llvm12/devcontainer.json b/.devcontainer/cuda12.5-llvm12/devcontainer.json index a11a351e30..2b9ecc320b 100644 --- a/.devcontainer/cuda12.5-llvm12/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm12/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-llvm13/devcontainer.json b/.devcontainer/cuda12.5-llvm13/devcontainer.json index 0136655f0c..933ad59af7 100644 --- a/.devcontainer/cuda12.5-llvm13/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm13/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-llvm14/devcontainer.json b/.devcontainer/cuda12.5-llvm14/devcontainer.json index dd9d6a62f0..72e7e0275d 100644 --- a/.devcontainer/cuda12.5-llvm14/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm14/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-llvm15/devcontainer.json b/.devcontainer/cuda12.5-llvm15/devcontainer.json index 51fd6a1466..c3086986e9 100644 --- a/.devcontainer/cuda12.5-llvm15/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm15/devcontainer.json @@ -38,7 +38,7 @@ "settings": 
{ "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-llvm16/devcontainer.json b/.devcontainer/cuda12.5-llvm16/devcontainer.json index 882025ddaf..2db6386576 100644 --- a/.devcontainer/cuda12.5-llvm16/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm16/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-llvm17/devcontainer.json b/.devcontainer/cuda12.5-llvm17/devcontainer.json index 55fa86ff53..44fb4cbec7 100644 --- a/.devcontainer/cuda12.5-llvm17/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm17/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-llvm9/devcontainer.json b/.devcontainer/cuda12.5-llvm9/devcontainer.json index 3b2a328c2e..f95daf28f0 100644 --- a/.devcontainer/cuda12.5-llvm9/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm9/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json b/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json index 5e4b04e19b..92e692ae14 100644 --- a/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json +++ b/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 01364fdbc2..74031d3657 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], From d1e7c1cc82df61d3a59569c8995fbc652c3f1f7c Mon Sep 17 00:00:00 2001 From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com> Date: Mon, 5 Aug 2024 11:11:29 -0700 Subject: [PATCH 02/33] Mount a build directory for CCCL projects if WSL is detected (#2035) Co-authored-by: Michael Schellenberger Costa --- .devcontainer/cuda11.1-gcc6/devcontainer.json | 6 ++++-- .devcontainer/cuda11.1-gcc7/devcontainer.json | 6 ++++-- .devcontainer/cuda11.1-gcc8/devcontainer.json | 6 ++++-- .devcontainer/cuda11.1-gcc9/devcontainer.json | 6 ++++-- .devcontainer/cuda11.1-llvm9/devcontainer.json | 
6 ++++-- .devcontainer/cuda11.8-gcc11/devcontainer.json | 6 ++++-- .devcontainer/cuda12.0-gcc10/devcontainer.json | 6 ++++-- .devcontainer/cuda12.0-gcc11/devcontainer.json | 6 ++++-- .devcontainer/cuda12.0-gcc12/devcontainer.json | 6 ++++-- .devcontainer/cuda12.0-gcc9/devcontainer.json | 6 ++++-- .devcontainer/cuda12.0-llvm10/devcontainer.json | 6 ++++-- .devcontainer/cuda12.0-llvm11/devcontainer.json | 6 ++++-- .devcontainer/cuda12.0-llvm12/devcontainer.json | 6 ++++-- .devcontainer/cuda12.0-llvm13/devcontainer.json | 6 ++++-- .devcontainer/cuda12.0-llvm14/devcontainer.json | 6 ++++-- .devcontainer/cuda12.0-llvm9/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-gcc10/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-gcc11/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-gcc12/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-gcc13/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-gcc7/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-gcc8/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-gcc9/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-llvm10/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-llvm11/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-llvm12/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-llvm13/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-llvm14/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-llvm15/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-llvm16/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-llvm17/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-llvm9/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json | 6 ++++-- .devcontainer/devcontainer.json | 6 ++++-- 34 files changed, 136 insertions(+), 68 deletions(-) diff --git a/.devcontainer/cuda11.1-gcc6/devcontainer.json b/.devcontainer/cuda11.1-gcc6/devcontainer.json index cd810c13dd..401a33ba59 100644 --- a/.devcontainer/cuda11.1-gcc6/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc6/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda11.1-gcc7/devcontainer.json b/.devcontainer/cuda11.1-gcc7/devcontainer.json index 9db4454383..5298b39143 100644 --- a/.devcontainer/cuda11.1-gcc7/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc7/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; 
else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda11.1-gcc8/devcontainer.json b/.devcontainer/cuda11.1-gcc8/devcontainer.json index 143b42abdf..27ca3c28a0 100644 --- a/.devcontainer/cuda11.1-gcc8/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc8/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda11.1-gcc9/devcontainer.json b/.devcontainer/cuda11.1-gcc9/devcontainer.json index e5aaa70339..ff592b79f5 100644 --- a/.devcontainer/cuda11.1-gcc9/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc9/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda11.1-llvm9/devcontainer.json b/.devcontainer/cuda11.1-llvm9/devcontainer.json index ccf1bd9a81..e8a167e1c9 100644 --- a/.devcontainer/cuda11.1-llvm9/devcontainer.json +++ b/.devcontainer/cuda11.1-llvm9/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p 
${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda11.8-gcc11/devcontainer.json b/.devcontainer/cuda11.8-gcc11/devcontainer.json index e8d2c3d94e..cdb8a4250b 100644 --- a/.devcontainer/cuda11.8-gcc11/devcontainer.json +++ b/.devcontainer/cuda11.8-gcc11/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.0-gcc10/devcontainer.json b/.devcontainer/cuda12.0-gcc10/devcontainer.json index 9c1c1c3328..4d081b3125 100644 --- a/.devcontainer/cuda12.0-gcc10/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc10/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.0-gcc11/devcontainer.json b/.devcontainer/cuda12.0-gcc11/devcontainer.json 
index c86d5cba2d..4dd297c412 100644 --- a/.devcontainer/cuda12.0-gcc11/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc11/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.0-gcc12/devcontainer.json b/.devcontainer/cuda12.0-gcc12/devcontainer.json index af192d3938..660e98109f 100644 --- a/.devcontainer/cuda12.0-gcc12/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc12/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.0-gcc9/devcontainer.json b/.devcontainer/cuda12.0-gcc9/devcontainer.json index 434b1b69f2..1f781d5852 100644 --- a/.devcontainer/cuda12.0-gcc9/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc9/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + 
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.0-llvm10/devcontainer.json b/.devcontainer/cuda12.0-llvm10/devcontainer.json index 15f4c622f9..e716b5eb6c 100644 --- a/.devcontainer/cuda12.0-llvm10/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm10/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.0-llvm11/devcontainer.json b/.devcontainer/cuda12.0-llvm11/devcontainer.json index f3bc6a558a..399b306075 100644 --- a/.devcontainer/cuda12.0-llvm11/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm11/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.0-llvm12/devcontainer.json b/.devcontainer/cuda12.0-llvm12/devcontainer.json index 032c783fef..d977c15ff3 100644 --- a/.devcontainer/cuda12.0-llvm12/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm12/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ 
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.0-llvm13/devcontainer.json b/.devcontainer/cuda12.0-llvm13/devcontainer.json index eb700e0615..016695d62c 100644 --- a/.devcontainer/cuda12.0-llvm13/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm13/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.0-llvm14/devcontainer.json b/.devcontainer/cuda12.0-llvm14/devcontainer.json index 935275ed5b..1fc144da6f 100644 --- a/.devcontainer/cuda12.0-llvm14/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm14/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.0-llvm9/devcontainer.json b/.devcontainer/cuda12.0-llvm9/devcontainer.json index 7127d6c2db..8bd0756dd4 100644 --- a/.devcontainer/cuda12.0-llvm9/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm9/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume 
create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-gcc10/devcontainer.json b/.devcontainer/cuda12.5-gcc10/devcontainer.json index b16f5b5d4d..61459a25fc 100644 --- a/.devcontainer/cuda12.5-gcc10/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc10/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-gcc11/devcontainer.json b/.devcontainer/cuda12.5-gcc11/devcontainer.json index c3c5ca3199..184de8734c 100644 --- a/.devcontainer/cuda12.5-gcc11/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc11/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-gcc12/devcontainer.json b/.devcontainer/cuda12.5-gcc12/devcontainer.json index f3996dac8e..1d16b6aa61 100644 --- a/.devcontainer/cuda12.5-gcc12/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc12/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir 
-m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-gcc13/devcontainer.json b/.devcontainer/cuda12.5-gcc13/devcontainer.json index 74031d3657..0f3fbb36f5 100644 --- a/.devcontainer/cuda12.5-gcc13/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc13/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-gcc7/devcontainer.json b/.devcontainer/cuda12.5-gcc7/devcontainer.json index 88f0060a87..9d5d356ad5 100644 --- a/.devcontainer/cuda12.5-gcc7/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc7/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-gcc8/devcontainer.json 
b/.devcontainer/cuda12.5-gcc8/devcontainer.json index 9f8b6020c5..10b44d31f1 100644 --- a/.devcontainer/cuda12.5-gcc8/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc8/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-gcc9/devcontainer.json b/.devcontainer/cuda12.5-gcc9/devcontainer.json index 422a20c62b..333c11b3cc 100644 --- a/.devcontainer/cuda12.5-gcc9/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc9/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-llvm10/devcontainer.json b/.devcontainer/cuda12.5-llvm10/devcontainer.json index 028509f6ef..8e3e19d4fc 100644 --- a/.devcontainer/cuda12.5-llvm10/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm10/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + 
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-llvm11/devcontainer.json b/.devcontainer/cuda12.5-llvm11/devcontainer.json index 5f4d3f4c1d..a216720e5d 100644 --- a/.devcontainer/cuda12.5-llvm11/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm11/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-llvm12/devcontainer.json b/.devcontainer/cuda12.5-llvm12/devcontainer.json index 2b9ecc320b..e1cbc4ecb7 100644 --- a/.devcontainer/cuda12.5-llvm12/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm12/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-llvm13/devcontainer.json b/.devcontainer/cuda12.5-llvm13/devcontainer.json index 933ad59af7..6fbbf56b79 100644 --- a/.devcontainer/cuda12.5-llvm13/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm13/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ 
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-llvm14/devcontainer.json b/.devcontainer/cuda12.5-llvm14/devcontainer.json index 72e7e0275d..b8528e989f 100644 --- a/.devcontainer/cuda12.5-llvm14/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm14/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-llvm15/devcontainer.json b/.devcontainer/cuda12.5-llvm15/devcontainer.json index c3086986e9..768d3163ee 100644 --- a/.devcontainer/cuda12.5-llvm15/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm15/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-llvm16/devcontainer.json b/.devcontainer/cuda12.5-llvm16/devcontainer.json index 2db6386576..8ba700fa4e 100644 --- a/.devcontainer/cuda12.5-llvm16/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm16/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker 
volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-llvm17/devcontainer.json b/.devcontainer/cuda12.5-llvm17/devcontainer.json index 44fb4cbec7..0de5689fdc 100644 --- a/.devcontainer/cuda12.5-llvm17/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm17/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-llvm9/devcontainer.json b/.devcontainer/cuda12.5-llvm9/devcontainer.json index f95daf28f0..d34ae01844 100644 --- a/.devcontainer/cuda12.5-llvm9/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm9/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json b/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json index 92e692ae14..a530527cac 100644 --- a/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json +++ b/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json @@ -7,7 +7,8 @@ 
"initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 74031d3657..0f3fbb36f5 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { From 75929cb688b244c2089a4c2ddf1b406d086c2ad9 Mon Sep 17 00:00:00 2001 From: pciolkosz Date: Mon, 5 Aug 2024 23:51:34 -0700 Subject: [PATCH 03/33] 2118 [CUDAX] Change the RAII device swapper to use driver API and add it in places where it was missing (#2192) * Change __scoped_device to use driver API * Switch to use driver API based dev setter * Remove constexpr from operator device() * Fix comments and includes * Fallback to non-versioned get entry point pre 12.5 We need to use versioned version to get correct cuStreamGetCtx. 
There is v2 version of it in 12.5, fortunatelly the versioned get entry point is available there too * Fix unused local variable * Fix warnings in ensure_current_device test * Move ensure current device out of detail * Add LIBCUDACXX_ENABLE_EXCEPTIONS to tests cmake --- .../cuda/experimental/__device/device.cuh | 32 ++++- .../cuda/experimental/__device/device_ref.cuh | 64 --------- .../cuda/experimental/__event/event.cuh | 10 +- .../cuda/experimental/__event/event_ref.cuh | 5 +- .../cuda/experimental/__event/timed_event.cuh | 2 +- .../cuda/experimental/__launch/launch.cuh | 7 + .../cuda/experimental/__stream/stream.cuh | 17 ++- .../experimental/__utility/driver_api.cuh | 70 ++++++++- .../__utility/ensure_current_device.cuh | 80 +++++++++++ cudax/test/CMakeLists.txt | 2 + cudax/test/common/utility.cuh | 28 ++++ cudax/test/device/device_smoke.cu | 7 +- cudax/test/launch/configuration.cu | 1 - cudax/test/launch/launch_smoke.cu | 1 - cudax/test/stream/get_stream.cu | 1 - cudax/test/stream/stream_smoke.cu | 1 - cudax/test/utility/driver_api.cu | 49 +++++-- cudax/test/utility/ensure_current_device.cu | 135 ++++++++++++++++++ 18 files changed, 412 insertions(+), 100 deletions(-) create mode 100644 cudax/include/cuda/experimental/__utility/ensure_current_device.cuh create mode 100644 cudax/test/utility/ensure_current_device.cu diff --git a/cudax/include/cuda/experimental/__device/device.cuh b/cudax/include/cuda/experimental/__device/device.cuh index f91b0089d5..35e0cfe2d4 100644 --- a/cudax/include/cuda/experimental/__device/device.cuh +++ b/cudax/include/cuda/experimental/__device/device.cuh @@ -21,7 +21,13 @@ # pragma system_header #endif // no system header +#include + #include +#include + +#include +#include namespace cuda::experimental { @@ -33,7 +39,7 @@ struct __emplace_device { int __id_; - _CCCL_NODISCARD constexpr operator device() const noexcept; + _CCCL_NODISCARD operator device() const noexcept; _CCCL_NODISCARD constexpr const __emplace_device* operator->() const noexcept; }; @@ -56,6 +62,24 @@ public: # endif #endif + CUcontext primary_context() const + { + ::std::call_once(__init_once, [this]() { + __device = detail::driver::deviceGet(__id_); + __primary_ctx = detail::driver::primaryCtxRetain(__device); + }); + assert(__primary_ctx != nullptr); + return __primary_ctx; + } + + ~device() + { + if (__primary_ctx) + { + detail::driver::primaryCtxRelease(__device); + } + } + private: // TODO: put a mutable thread-safe (or thread_local) cache of device // properties here. @@ -63,6 +87,10 @@ private: friend class device_ref; friend struct detail::__emplace_device; + mutable CUcontext __primary_ctx = nullptr; + mutable CUdevice __device{}; + mutable ::std::once_flag __init_once; + explicit constexpr device(int __id) noexcept : device_ref(__id) {} @@ -76,7 +104,7 @@ private: namespace detail { -_CCCL_NODISCARD inline constexpr __emplace_device::operator device() const noexcept +_CCCL_NODISCARD inline __emplace_device::operator device() const noexcept { return device(__id_); } diff --git a/cudax/include/cuda/experimental/__device/device_ref.cuh b/cudax/include/cuda/experimental/__device/device_ref.cuh index f5945914da..7f2635611f 100644 --- a/cudax/include/cuda/experimental/__device/device_ref.cuh +++ b/cudax/include/cuda/experimental/__device/device_ref.cuh @@ -22,7 +22,6 @@ #endif // no system header #include -#include namespace cuda::experimental { @@ -103,69 +102,6 @@ public: } }; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -//! 
@brief RAII helper which saves the current device and switches to the -//! specified device on construction and switches to the saved device on -//! destruction. -//! -struct __scoped_device -{ -private: - // The original device ordinal, or -1 if the device was not changed. - int const __old_device; - - //! @brief Returns the current device ordinal. - //! - //! @throws cuda_error if the device query fails. - static int __current_device() - { - int device = -1; - _CCCL_TRY_CUDA_API(cudaGetDevice, "failed to get the current device", &device); - return device; - } - - explicit __scoped_device(int new_device, int old_device) noexcept - : __old_device(new_device == old_device ? -1 : old_device) - {} - -public: - //! @brief Construct a new `__scoped_device` object and switch to the specified - //! device. - //! - //! @param new_device The device to switch to - //! - //! @throws cuda_error if the device switch fails - explicit __scoped_device(device_ref new_device) - : __scoped_device(new_device.get(), __current_device()) - { - if (__old_device != -1) - { - _CCCL_TRY_CUDA_API(cudaSetDevice, "failed to set the current device", new_device.get()); - } - } - - __scoped_device(__scoped_device&&) = delete; - __scoped_device(__scoped_device const&) = delete; - __scoped_device& operator=(__scoped_device&&) = delete; - __scoped_device& operator=(__scoped_device const&) = delete; - - //! @brief Destroy the `__scoped_device` object and switch back to the original - //! device. - //! - //! @throws cuda_error if the device switch fails. If the destructor is called - //! during stack unwinding, the program is automatically terminated. - ~__scoped_device() noexcept(false) - { - if (__old_device != -1) - { - _CCCL_TRY_CUDA_API(cudaSetDevice, "failed to restore the current device", __old_device); - } - } -}; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - } // namespace cuda::experimental #endif // _CUDAX__DEVICE_DEVICE_REF diff --git a/cudax/include/cuda/experimental/__event/event.cuh b/cudax/include/cuda/experimental/__event/event.cuh index 0b6b7802b2..3ce997c55c 100644 --- a/cudax/include/cuda/experimental/__event/event.cuh +++ b/cudax/include/cuda/experimental/__event/event.cuh @@ -30,6 +30,7 @@ #include #include +#include namespace cuda::experimental { @@ -54,7 +55,7 @@ public: //! //! @throws cuda_error if the event creation fails. 
explicit event(stream_ref __stream, flags __flags = flags::none) - : event(static_cast(__flags) | cudaEventDisableTiming) + : event(__stream, static_cast(__flags) | cudaEventDisableTiming) { record(__stream); } @@ -85,7 +86,9 @@ public: { if (__event_ != nullptr) { - [[maybe_unused]] auto __status = ::cudaEventDestroy(__event_); + // Needs to call driver API in case current device is not set, runtime version would set dev 0 current + // Alternative would be to store the device and push/pop here + [[maybe_unused]] auto __status = detail::driver::eventDestroy(__event_); } } @@ -144,9 +147,10 @@ private: : event_ref(__evnt) {} - explicit event(unsigned int __flags) + explicit event(stream_ref __stream, unsigned int __flags) : event_ref(::cudaEvent_t{}) { + [[maybe_unused]] __ensure_current_device __dev_setter(__stream); _CCCL_TRY_CUDA_API( ::cudaEventCreateWithFlags, "Failed to create CUDA event", &__event_, static_cast(__flags)); } diff --git a/cudax/include/cuda/experimental/__event/event_ref.cuh b/cudax/include/cuda/experimental/__event/event_ref.cuh index b795d46a77..3b0ccc6dbc 100644 --- a/cudax/include/cuda/experimental/__event/event_ref.cuh +++ b/cudax/include/cuda/experimental/__event/event_ref.cuh @@ -30,6 +30,8 @@ #include #include +#include + namespace cuda::experimental { class event; @@ -74,7 +76,8 @@ public: { assert(__event_ != nullptr); assert(__stream.get() != nullptr); - _CCCL_TRY_CUDA_API(::cudaEventRecord, "Failed to record CUDA event", __event_, __stream.get()); + // Need to use driver API, cudaEventRecord will push dev 0 if stack is empty + detail::driver::eventRecord(__event_, __stream.get()); } //! @brief Waits until all the work in the stream prior to the record of the diff --git a/cudax/include/cuda/experimental/__event/timed_event.cuh b/cudax/include/cuda/experimental/__event/timed_event.cuh index debcbcd26e..48b9b0f1a5 100644 --- a/cudax/include/cuda/experimental/__event/timed_event.cuh +++ b/cudax/include/cuda/experimental/__event/timed_event.cuh @@ -42,7 +42,7 @@ public: //! //! @throws cuda_error if the event creation fails. explicit timed_event(stream_ref __stream, flags __flags = flags::none) - : event(static_cast(__flags)) + : event(__stream, static_cast(__flags)) { record(__stream); } diff --git a/cudax/include/cuda/experimental/__launch/launch.cuh b/cudax/include/cuda/experimental/__launch/launch.cuh index 790af2a9d5..1a49cafa40 100644 --- a/cudax/include/cuda/experimental/__launch/launch.cuh +++ b/cudax/include/cuda/experimental/__launch/launch.cuh @@ -16,6 +16,7 @@ #include #include +#include #if _CCCL_STD_VER >= 2017 namespace cuda::experimental @@ -119,6 +120,7 @@ template & conf, const Kernel& kernel, Args... args) { + [[maybe_unused]] __ensure_current_device __dev_setter(stream); cudaError_t status; if constexpr (::cuda::std::is_invocable_v, Args...>) { @@ -181,6 +183,7 @@ void launch( template void launch(::cuda::stream_ref stream, const hierarchy_dimensions& dims, const Kernel& kernel, Args... args) { + [[maybe_unused]] __ensure_current_device __dev_setter(stream); cudaError_t status; if constexpr (::cuda::std::is_invocable_v, Args...>) { @@ -245,6 +248,7 @@ void launch(::cuda::stream_ref stream, void (*kernel)(kernel_config, ExpArgs...), ActArgs&&... args) { + [[maybe_unused]] __ensure_current_device __dev_setter(stream); cudaError_t status = [&](ExpArgs... 
args) { return detail::launch_impl(stream, conf, kernel, conf, args...); }(std::forward(args)...); @@ -299,6 +303,7 @@ void launch(::cuda::stream_ref stream, void (*kernel)(hierarchy_dimensions, ExpArgs...), ActArgs&&... args) { + [[maybe_unused]] __ensure_current_device __dev_setter(stream); cudaError_t status = [&](ExpArgs... args) { return detail::launch_impl(stream, kernel_config(dims), kernel, dims, args...); }(std::forward(args)...); @@ -354,6 +359,7 @@ void launch(::cuda::stream_ref stream, void (*kernel)(ExpArgs...), ActArgs&&... args) { + [[maybe_unused]] __ensure_current_device __dev_setter(stream); cudaError_t status = [&](ExpArgs... args) { return detail::launch_impl(stream, conf, kernel, args...); }(std::forward(args)...); @@ -406,6 +412,7 @@ template void launch( ::cuda::stream_ref stream, const hierarchy_dimensions& dims, void (*kernel)(ExpArgs...), ActArgs&&... args) { + [[maybe_unused]] __ensure_current_device __dev_setter(stream); cudaError_t status = [&](ExpArgs... args) { return detail::launch_impl(stream, kernel_config(dims), kernel, args...); }(std::forward(args)...); diff --git a/cudax/include/cuda/experimental/__stream/stream.cuh b/cudax/include/cuda/experimental/__stream/stream.cuh index 4859e9fabc..0ba125269b 100644 --- a/cudax/include/cuda/experimental/__stream/stream.cuh +++ b/cudax/include/cuda/experimental/__stream/stream.cuh @@ -27,6 +27,7 @@ #include #include +#include namespace cuda::experimental { @@ -51,7 +52,7 @@ struct stream : stream_ref //! @throws cuda_error if stream creation fails explicit stream(device_ref __dev, int __priority = default_priority) { - __scoped_device dev_setter(__dev); + [[maybe_unused]] __ensure_current_device __dev_setter(__dev); _CCCL_TRY_CUDA_API( ::cudaStreamCreateWithPriority, "Failed to create a stream", &__stream, cudaStreamDefault, __priority); } @@ -89,7 +90,9 @@ struct stream : stream_ref { if (__stream != detail::invalid_stream) { - [[maybe_unused]] auto status = ::cudaStreamDestroy(__stream); + // Needs to call driver API in case current device is not set, runtime version would set dev 0 current + // Alternative would be to store the device and push/pop here + [[maybe_unused]] auto status = detail::driver::streamDestroy(__stream); } } @@ -139,18 +142,20 @@ struct stream : stream_ref void wait(event_ref __ev) const { assert(__ev.get() != nullptr); - _CCCL_TRY_CUDA_API(::cudaStreamWaitEvent, "Failed to make a stream wait for an event", get(), __ev.get()); + // Need to use driver API, cudaStreamWaitEvent would push dev 0 if stack was empty + detail::driver::streamWaitEvent(get(), __ev.get()); } - //! @brief Make all future work submitted into this stream depend on completion of all work from the specified stream + //! @brief Make all future work submitted into this stream depend on completion of all work from the specified + //! stream //! //! @param __other Stream that this stream should wait for //! //! 
@throws cuda_error if inserting the dependency fails void wait(stream_ref __other) const { - // TODO consider an optimization to not create an event every time and instead have one persistent event or one per - // stream + // TODO consider an optimization to not create an event every time and instead have one persistent event or one + // per stream assert(__stream != detail::invalid_stream); event __tmp(__other); wait(__tmp); diff --git a/cudax/include/cuda/experimental/__utility/driver_api.cuh b/cudax/include/cuda/experimental/__utility/driver_api.cuh index 21b8c4d742..8a52dd89fc 100644 --- a/cudax/include/cuda/experimental/__utility/driver_api.cuh +++ b/cudax/include/cuda/experimental/__utility/driver_api.cuh @@ -25,7 +25,13 @@ inline void* get_driver_entry_point(const char* name) { void* fn; cudaDriverEntryPointQueryResult result; +#if CUDART_VERSION >= 12050 + // For minor version compatibility request the 12.0 version of everything for now + cudaGetDriverEntryPointByVersion(name, &fn, 12000, cudaEnableDefault, &result); +#else + // Versioned get entry point not available before 12.5, but we don't need anything versioned before that cudaGetDriverEntryPoint(name, &fn, cudaEnableDefault, &result); +#endif if (result != cudaDriverEntryPointSuccess) { if (result == cudaDriverEntryPointVersionNotSufficent) @@ -56,11 +62,12 @@ inline void ctxPush(CUcontext ctx) call_driver_fn(driver_fn, "Failed to push context", ctx); } -inline void ctxPop() +inline CUcontext ctxPop() { static auto driver_fn = CUDAX_GET_DRIVER_FUNCTION(cuCtxPopCurrent); - CUcontext dummy; - call_driver_fn(driver_fn, "Failed to pop context", &dummy); + CUcontext result; + call_driver_fn(driver_fn, "Failed to pop context", &result); + return result; } inline CUcontext ctxGetCurrent() @@ -71,6 +78,38 @@ inline CUcontext ctxGetCurrent() return result; } +inline CUdevice deviceGet(int ordinal) +{ + static auto driver_fn = CUDAX_GET_DRIVER_FUNCTION(cuDeviceGet); + CUdevice result; + call_driver_fn(driver_fn, "Failed to get device", &result, ordinal); + return result; +} + +inline CUcontext primaryCtxRetain(CUdevice dev) +{ + static auto driver_fn = CUDAX_GET_DRIVER_FUNCTION(cuDevicePrimaryCtxRetain); + CUcontext result; + call_driver_fn(driver_fn, "Failed to retain context for a device", &result, dev); + return result; +} + +inline void primaryCtxRelease(CUdevice dev) +{ + static auto driver_fn = CUDAX_GET_DRIVER_FUNCTION(cuDevicePrimaryCtxRelease); + // TODO we might need to ignore failure here + call_driver_fn(driver_fn, "Failed to release context for a device", dev); +} + +inline bool isPrimaryCtxActive(CUdevice dev) +{ + static auto driver_fn = CUDAX_GET_DRIVER_FUNCTION(cuDevicePrimaryCtxGetState); + int result; + unsigned int dummy; + call_driver_fn(driver_fn, "Failed to check the primary ctx state", dev, &dummy, &result); + return result == 1; +} + inline CUcontext streamGetCtx(CUstream stream) { static auto driver_fn = CUDAX_GET_DRIVER_FUNCTION(cuStreamGetCtx); @@ -78,6 +117,31 @@ inline CUcontext streamGetCtx(CUstream stream) call_driver_fn(driver_fn, "Failed to get context from a stream", stream, &result); return result; } + +inline void streamWaitEvent(CUstream stream, CUevent event) +{ + static auto driver_fn = CUDAX_GET_DRIVER_FUNCTION(cuStreamWaitEvent); + call_driver_fn(driver_fn, "Failed to make a stream wait for an event", stream, event, CU_EVENT_WAIT_DEFAULT); +} + +inline void eventRecord(CUevent event, CUstream stream) +{ + static auto driver_fn = CUDAX_GET_DRIVER_FUNCTION(cuEventRecord); + 
call_driver_fn(driver_fn, "Failed to record CUDA event", event, stream); +} + +// Destroy calls return error codes to let the calling code decide if the error should be ignored +inline cudaError_t streamDestroy(CUstream stream) +{ + static auto driver_fn = CUDAX_GET_DRIVER_FUNCTION(cuStreamDestroy); + return static_cast(driver_fn(stream)); +} + +inline cudaError_t eventDestroy(CUevent event) +{ + static auto driver_fn = CUDAX_GET_DRIVER_FUNCTION(cuEventDestroy); + return static_cast(driver_fn(event)); +} } // namespace cuda::experimental::detail::driver #undef CUDAX_GET_DRIVER_FUNCTION diff --git a/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh b/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh new file mode 100644 index 0000000000..2431d02818 --- /dev/null +++ b/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh @@ -0,0 +1,80 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX__UTILITY_ENSURE_CURRENT_DEVICE +#define _CUDAX__UTILITY_ENSURE_CURRENT_DEVICE + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include +#include + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +namespace cuda::experimental +{ +//! @brief RAII helper which on construction sets the current device to the specified one or one a +//! stream was created under. It sets the state back on destruction. +//! +struct __ensure_current_device +{ + //! @brief Construct a new `__ensure_current_device` object and switch to the specified + //! device. + //! + //! @param new_device The device to switch to + //! + //! @throws cuda_error if the device switch fails + explicit __ensure_current_device(device_ref new_device) + { + auto ctx = devices[new_device.get()].primary_context(); + detail::driver::ctxPush(ctx); + } + + //! @brief Construct a new `__ensure_current_device` object and switch to the device + //! under which the specified stream was created. + //! + //! @param stream Stream indicating the device to switch to + //! + //! @throws cuda_error if the device switch fails + explicit __ensure_current_device(stream_ref stream) + { + auto ctx = detail::driver::streamGetCtx(stream.get()); + detail::driver::ctxPush(ctx); + } + + __ensure_current_device(__ensure_current_device&&) = delete; + __ensure_current_device(__ensure_current_device const&) = delete; + __ensure_current_device& operator=(__ensure_current_device&&) = delete; + __ensure_current_device& operator=(__ensure_current_device const&) = delete; + + //! @brief Destroy the `__ensure_current_device` object and switch back to the original + //! device. + //! + //! @throws cuda_error if the device switch fails. If the destructor is called + //! during stack unwinding, the program is automatically terminated. 
+ ~__ensure_current_device() noexcept(false) + { + // TODO would it make sense to assert here that we pushed and popped the same thing? + detail::driver::ctxPop(); + } +}; +} // namespace cuda::experimental +#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CUDAX__UTILITY_ENSURE_CURRENT_DEVICE diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index bb8a7d7c54..4752f8b964 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -29,6 +29,7 @@ function(cudax_add_catch2_test target_name_var test_name cn_target) # ARGN=test target_link_libraries(${test_target} PRIVATE ${cn_target} Catch2::Catch2 catch2_main) target_link_libraries(${test_target} PRIVATE ${cn_target} cudax::Thrust) target_compile_options(${test_target} PRIVATE "-DLIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE") + target_compile_options(${test_target} PRIVATE "-DLIBCUDACXX_ENABLE_EXCEPTIONS") target_compile_options(${test_target} PRIVATE $<$:--extended-lambda>) cudax_clone_target_properties(${test_target} ${cn_target}) set_target_properties(${test_target} PROPERTIES @@ -80,6 +81,7 @@ foreach(cn_target IN LISTS cudax_TARGETS) cudax_add_catch2_test(test_target misc_tests ${cn_target} utility/driver_api.cu + utility/ensure_current_device.cu ) cudax_add_catch2_test(test_target containers ${cn_target} diff --git a/cudax/test/common/utility.cuh b/cudax/test/common/utility.cuh index 2d7254c069..64a54e1b48 100644 --- a/cudax/test/common/utility.cuh +++ b/cudax/test/common/utility.cuh @@ -137,6 +137,11 @@ struct spin_until_80 } }; +struct empty_kernel +{ + __device__ void operator()() const noexcept {} +}; + /// A kernel that takes a callable object and invokes it with a set of arguments template __global__ void invokernel(Fn fn, Args... args) @@ -144,5 +149,28 @@ __global__ void invokernel(Fn fn, Args... 
args) fn(args...); } +inline int count_driver_stack() +{ + if (cudax::detail::driver::ctxGetCurrent() != nullptr) + { + auto ctx = cudax::detail::driver::ctxPop(); + auto result = 1 + count_driver_stack(); + cudax::detail::driver::ctxPush(ctx); + return result; + } + else + { + return 0; + } +} + +inline void empty_driver_stack() +{ + while (cudax::detail::driver::ctxGetCurrent() != nullptr) + { + cudax::detail::driver::ctxPop(); + } +} + } // namespace test } // namespace diff --git a/cudax/test/device/device_smoke.cu b/cudax/test/device/device_smoke.cu index 86c9625e21..6f772de08a 100644 --- a/cudax/test/device/device_smoke.cu +++ b/cudax/test/device/device_smoke.cu @@ -8,7 +8,6 @@ // //===----------------------------------------------------------------------===// -#define LIBCUDACXX_ENABLE_EXCEPTIONS #include #include "../hierarchy/testing_common.cuh" @@ -260,9 +259,9 @@ TEST_CASE("global devices vector", "[device]") CUDAX_REQUIRE(1 == std::next(cudax::devices.begin())->get()); CUDAX_REQUIRE(1 == cudax::devices.begin()[1].get()); - CUDAX_REQUIRE(0 == (*std::prev(cudax::devices.end())).get()); - CUDAX_REQUIRE(0 == std::prev(cudax::devices.end())->get()); - CUDAX_REQUIRE(0 == cudax::devices.end()[-1].get()); + CUDAX_REQUIRE(cudax::devices.size() - 1 == (*std::prev(cudax::devices.end())).get()); + CUDAX_REQUIRE(cudax::devices.size() - 1 == std::prev(cudax::devices.end())->get()); + CUDAX_REQUIRE(cudax::devices.size() - 1 == cudax::devices.end()[-1].get()); } try diff --git a/cudax/test/launch/configuration.cu b/cudax/test/launch/configuration.cu index a47eea2590..9e7f98df1b 100644 --- a/cudax/test/launch/configuration.cu +++ b/cudax/test/launch/configuration.cu @@ -8,7 +8,6 @@ // //===----------------------------------------------------------------------===// -#define LIBCUDACXX_ENABLE_EXCEPTIONS // Test translation of launch function arguments to cudaLaunchConfig_t sent to cudaLaunchKernelEx internally // We replace cudaLaunchKernelEx with a test function here through a macro to intercept the cudaLaunchConfig_t #define cudaLaunchKernelEx cudaLaunchKernelExTestReplacement diff --git a/cudax/test/launch/launch_smoke.cu b/cudax/test/launch/launch_smoke.cu index 554cabd015..810e65c390 100644 --- a/cudax/test/launch/launch_smoke.cu +++ b/cudax/test/launch/launch_smoke.cu @@ -7,7 +7,6 @@ // SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
// //===----------------------------------------------------------------------===// -#define LIBCUDACXX_ENABLE_EXCEPTIONS #include #include diff --git a/cudax/test/stream/get_stream.cu b/cudax/test/stream/get_stream.cu index 277a10246a..0654c3be39 100644 --- a/cudax/test/stream/get_stream.cu +++ b/cudax/test/stream/get_stream.cu @@ -8,7 +8,6 @@ // //===----------------------------------------------------------------------===// -#define LIBCUDACXX_ENABLE_EXCEPTIONS #include #include "../common/utility.cuh" diff --git a/cudax/test/stream/stream_smoke.cu b/cudax/test/stream/stream_smoke.cu index e6b86ccf16..cbee352080 100644 --- a/cudax/test/stream/stream_smoke.cu +++ b/cudax/test/stream/stream_smoke.cu @@ -8,7 +8,6 @@ // //===----------------------------------------------------------------------===// -#define LIBCUDACXX_ENABLE_EXCEPTIONS #include #include diff --git a/cudax/test/utility/driver_api.cu b/cudax/test/utility/driver_api.cu index 513d6476eb..e5fd64d14f 100644 --- a/cudax/test/utility/driver_api.cu +++ b/cudax/test/utility/driver_api.cu @@ -7,14 +7,14 @@ // SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// -#define LIBCUDACXX_ENABLE_EXCEPTIONS #include #include "../hierarchy/testing_common.cuh" -TEST_CASE("Call each one", "[driver api]") +TEST_CASE("Call each driver api", "[utility]") { + namespace driver = cuda::experimental::detail::driver; cudaStream_t stream; // Assumes the ctx stack was empty or had one ctx, should be the case unless some other // test leaves 2+ ctxs on the stack @@ -22,23 +22,48 @@ TEST_CASE("Call each one", "[driver api]") // Pushes the primary context if the stack is empty CUDART(cudaStreamCreate(&stream)); - auto ctx = cuda::experimental::detail::driver::ctxGetCurrent(); + auto ctx = driver::ctxGetCurrent(); CUDAX_REQUIRE(ctx != nullptr); - cuda::experimental::detail::driver::ctxPop(); - CUDAX_REQUIRE(cuda::experimental::detail::driver::ctxGetCurrent() == nullptr); + // Confirm pop will leave the stack empty + driver::ctxPop(); + CUDAX_REQUIRE(driver::ctxGetCurrent() == nullptr); - cuda::experimental::detail::driver::ctxPush(ctx); - CUDAX_REQUIRE(cuda::experimental::detail::driver::ctxGetCurrent() == ctx); + // Confirm we can push multiple times + driver::ctxPush(ctx); + CUDAX_REQUIRE(driver::ctxGetCurrent() == ctx); - cuda::experimental::detail::driver::ctxPush(ctx); - CUDAX_REQUIRE(cuda::experimental::detail::driver::ctxGetCurrent() == ctx); + driver::ctxPush(ctx); + CUDAX_REQUIRE(driver::ctxGetCurrent() == ctx); - cuda::experimental::detail::driver::ctxPop(); - CUDAX_REQUIRE(cuda::experimental::detail::driver::ctxGetCurrent() == ctx); + driver::ctxPop(); + CUDAX_REQUIRE(driver::ctxGetCurrent() == ctx); - auto stream_ctx = cuda::experimental::detail::driver::streamGetCtx(stream); + // Confirm stream ctx match + auto stream_ctx = driver::streamGetCtx(stream); CUDAX_REQUIRE(ctx == stream_ctx); CUDART(cudaStreamDestroy(stream)); + + CUDAX_REQUIRE(driver::deviceGet(0) == 0); + + // Confirm we can retain the primary ctx that cudart retained first + auto primary_ctx = driver::primaryCtxRetain(0); + CUDAX_REQUIRE(ctx == primary_ctx); + + driver::ctxPop(); + CUDAX_REQUIRE(driver::ctxGetCurrent() == nullptr); + + CUDAX_REQUIRE(driver::isPrimaryCtxActive(0)); + // Confirm we can reset the primary context with double release + driver::primaryCtxRelease(0); + driver::primaryCtxRelease(0); + + CUDAX_REQUIRE(!driver::isPrimaryCtxActive(0)); + + // Confirm 
cudart can recover + CUDART(cudaStreamCreate(&stream)); + CUDAX_REQUIRE(driver::ctxGetCurrent() == ctx); + + CUDART(driver::streamDestroy(stream)); } diff --git a/cudax/test/utility/ensure_current_device.cu b/cudax/test/utility/ensure_current_device.cu new file mode 100644 index 0000000000..89efc7d4f6 --- /dev/null +++ b/cudax/test/utility/ensure_current_device.cu @@ -0,0 +1,135 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include "../common/utility.cuh" + +namespace driver = cuda::experimental::detail::driver; + +void recursive_check_device_setter(int id) +{ + int cudart_id; + cudax::__ensure_current_device setter(cudax::device_ref{id}); + CUDAX_REQUIRE(test::count_driver_stack() == cudax::devices.size() - id); + auto ctx = driver::ctxGetCurrent(); + CUDART(cudaGetDevice(&cudart_id)); + CUDAX_REQUIRE(cudart_id == id); + + if (id != 0) + { + recursive_check_device_setter(id - 1); + + CUDAX_REQUIRE(test::count_driver_stack() == cudax::devices.size() - id); + CUDAX_REQUIRE(ctx == driver::ctxGetCurrent()); + CUDART(cudaGetDevice(&cudart_id)); + CUDAX_REQUIRE(cudart_id == id); + } +} + +TEST_CASE("ensure current device", "[device]") +{ + test::empty_driver_stack(); + // If possible use something different than CUDART default 0 + int target_device = static_cast(cudax::devices.size() - 1); + int dev_id = 0; + + SECTION("device setter") + { + recursive_check_device_setter(target_device); + + CUDAX_REQUIRE(test::count_driver_stack() == 0); + } + + SECTION("stream interactions with driver stack") + { + { + cudax::stream stream(target_device); + CUDAX_REQUIRE(test::count_driver_stack() == 0); + { + cudax::__ensure_current_device setter(cudax::device_ref{target_device}); + CUDAX_REQUIRE(driver::ctxGetCurrent() == driver::streamGetCtx(stream.get())); + } + { + auto ev = stream.record_event(); + CUDAX_REQUIRE(test::count_driver_stack() == 0); + } + CUDAX_REQUIRE(test::count_driver_stack() == 0); + { + auto ev = stream.record_timed_event(); + CUDAX_REQUIRE(test::count_driver_stack() == 0); + } + { + auto lambda = [&](int dev_id) { + cudax::stream another_stream(dev_id); + CUDAX_REQUIRE(test::count_driver_stack() == 0); + stream.wait(another_stream); + CUDAX_REQUIRE(test::count_driver_stack() == 0); + another_stream.wait(stream); + CUDAX_REQUIRE(test::count_driver_stack() == 0); + }; + lambda(target_device); + if (cudax::devices.size() > 1) + { + lambda(0); + } + } + + cudax::__ensure_current_device setter(stream); + CUDAX_REQUIRE(test::count_driver_stack() == 1); + CUDART(cudaGetDevice(&dev_id)); + CUDAX_REQUIRE(dev_id == target_device); + CUDAX_REQUIRE(driver::ctxGetCurrent() == driver::streamGetCtx(stream.get())); + } + + CHECK(test::count_driver_stack() == 0); + + { + // Check NULL stream ref is handled ok + cudax::__ensure_current_device setter1(cudax::device_ref{target_device}); + cudaStream_t null_stream = nullptr; + auto ref = cuda::stream_ref(null_stream); + auto ctx = driver::ctxGetCurrent(); + CUDAX_REQUIRE(test::count_driver_stack() == 1); + + cudax::__ensure_current_device setter2(ref); + 
CUDAX_REQUIRE(test::count_driver_stack() == 2); + CUDAX_REQUIRE(ctx == driver::ctxGetCurrent()); + CUDART(cudaGetDevice(&dev_id)); + CUDAX_REQUIRE(dev_id == target_device); + } + } + + SECTION("event interactions with driver stack") + { + { + cudax::stream stream(target_device); + CUDAX_REQUIRE(test::count_driver_stack() == 0); + + cudax::event event(stream); + CUDAX_REQUIRE(test::count_driver_stack() == 0); + + event.record(stream); + CUDAX_REQUIRE(test::count_driver_stack() == 0); + } + CUDAX_REQUIRE(test::count_driver_stack() == 0); + } + + SECTION("launch interactions with driver stack") + { + cudax::stream stream(target_device); + CUDAX_REQUIRE(test::count_driver_stack() == 0); + cudax::launch(stream, cudax::make_hierarchy(cudax::block_dims<1>(), cudax::grid_dims<1>()), test::empty_kernel{}); + CUDAX_REQUIRE(test::count_driver_stack() == 0); + } +} From 1b6dbd40509f96e16e9f34749bbbf9068f9ea9e1 Mon Sep 17 00:00:00 2001 From: Bryce Adelstein Lelbach aka wash Date: Tue, 6 Aug 2024 12:56:41 -0400 Subject: [PATCH 04/33] Fix singular vs plural typo in thread scope documentation. (#2198) * Fix singular vs plural typo in thread scope documentation. * Better grammar fix. --- docs/libcudacxx/extended_api/memory_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/libcudacxx/extended_api/memory_model.rst b/docs/libcudacxx/extended_api/memory_model.rst index ff9f9ef44c..dfb6ed6789 100644 --- a/docs/libcudacxx/extended_api/memory_model.rst +++ b/docs/libcudacxx/extended_api/memory_model.rst @@ -17,7 +17,7 @@ semantics of standard C++ by default. Thread Scopes ------------- -A **thread scope** specifies the kind of threads that can synchronize with each other using synchronization primitive such +A **thread scope** specifies the kind of threads that can synchronize with each other using a synchronization primitive such as :ref:`atomic ` or :ref:`barrier `. 
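As a quick illustration of the thread-scope sentence fixed above, here is a minimal sketch. It is not taken from these patches; the kernel name, launch shape, and output buffer are made up for illustration, while the libcu++ <cuda/atomic> header, cuda::atomic, and cuda::thread_scope_block are the real primitives the documentation refers to.

#include <cuda/atomic>

// Each thread of a block bumps a counter whose scope is the block; no wider
// scope is needed because only threads of this one block synchronize through it.
__global__ void block_scoped_count(int* out)
{
  __shared__ cuda::atomic<int, cuda::thread_scope_block> counter;
  if (threadIdx.x == 0)
  {
    counter = 0; // shared memory is uninitialized, so set the counter before use
  }
  __syncthreads();

  counter.fetch_add(1); // atomic with respect to the other threads of this block

  __syncthreads();
  if (threadIdx.x == 0)
  {
    out[blockIdx.x] = counter.load(); // equals blockDim.x
  }
}

// Hypothetical launch: block_scoped_count<<<1, 128>>>(d_out);

A wider scope such as cuda::thread_scope_device would also be correct here, but it is stronger than this kernel needs; picking the narrowest sufficient scope is exactly the choice the thread-scope documentation describes.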
From 2db4fa7232e3250bdd0539a2afee4f1d32a7ab30 Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Tue, 6 Aug 2024 11:13:17 -0700 Subject: [PATCH 05/33] [CUDAX] fixing some minor issues with device attribute queries (#2183) * [cudax] give the `cudaDevAttrMemoryPoolSupportedHandleTypes` attribute the correct type * move attribute definitions from `device_ref` to `device` --- .../cuda/experimental/__device/attributes.cuh | 414 ++++++++++-------- .../cuda/experimental/__device/device.cuh | 17 + .../cuda/experimental/__device/device_ref.cuh | 48 +- cudax/test/device/device_smoke.cu | 26 +- 4 files changed, 274 insertions(+), 231 deletions(-) diff --git a/cudax/include/cuda/experimental/__device/attributes.cuh b/cudax/include/cuda/experimental/__device/attributes.cuh index 1c02cc19c9..5a873f6ebb 100644 --- a/cudax/include/cuda/experimental/__device/attributes.cuh +++ b/cudax/include/cuda/experimental/__device/attributes.cuh @@ -24,668 +24,694 @@ #include #include -#include +#include namespace cuda::experimental { namespace detail { +template <::cudaDeviceAttr _Attr> +struct __dev_attr +{ + using type = int; + + _CCCL_NODISCARD constexpr operator ::cudaDeviceAttr() const noexcept + { + return _Attr; + } + + _CCCL_NODISCARD type operator()(device_ref __dev_id) const + { + return __dev_id.attr<_Attr>(); + } +}; + template <::cudaDeviceAttr _Attr, typename _Type> -struct __attr_with_type +struct __dev_attr_with_type { using type = _Type; - constexpr operator ::cudaDeviceAttr() const noexcept + _CCCL_NODISCARD constexpr operator ::cudaDeviceAttr() const noexcept { return _Attr; } - _CCCL_NODISCARD type operator()(device_ref __dev) const + _CCCL_NODISCARD type operator()(device_ref __dev_id) const { - return __dev.attr<_Attr>(); + return __dev_id.attr<_Attr>(); } }; -} // namespace detail // TODO: give this a strong type for kilohertz template <> -struct device_ref::__attr<::cudaDevAttrClockRate> // - : detail::__attr_with_type<::cudaDevAttrClockRate, int> +struct __dev_attr<::cudaDevAttrClockRate> // + : __dev_attr_with_type<::cudaDevAttrClockRate, int> {}; template <> -struct device_ref::__attr<::cudaDevAttrGpuOverlap> // - : detail::__attr_with_type<::cudaDevAttrGpuOverlap, bool> +struct __dev_attr<::cudaDevAttrGpuOverlap> // + : __dev_attr_with_type<::cudaDevAttrGpuOverlap, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrKernelExecTimeout> // - : detail::__attr_with_type<::cudaDevAttrKernelExecTimeout, bool> +struct __dev_attr<::cudaDevAttrKernelExecTimeout> // + : __dev_attr_with_type<::cudaDevAttrKernelExecTimeout, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrIntegrated> // - : detail::__attr_with_type<::cudaDevAttrIntegrated, bool> +struct __dev_attr<::cudaDevAttrIntegrated> // + : __dev_attr_with_type<::cudaDevAttrIntegrated, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrCanMapHostMemory> // - : detail::__attr_with_type<::cudaDevAttrCanMapHostMemory, bool> +struct __dev_attr<::cudaDevAttrCanMapHostMemory> // + : __dev_attr_with_type<::cudaDevAttrCanMapHostMemory, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrComputeMode> // - : detail::__attr_with_type<::cudaDevAttrComputeMode, ::cudaComputeMode> +struct __dev_attr<::cudaDevAttrComputeMode> // + : __dev_attr_with_type<::cudaDevAttrComputeMode, ::cudaComputeMode> { static constexpr type default_mode = cudaComputeModeDefault; static constexpr type prohibited_mode = cudaComputeModeProhibited; static constexpr type exclusive_process_mode = cudaComputeModeExclusiveProcess; }; template <> 
-struct device_ref::__attr<::cudaDevAttrConcurrentKernels> // - : detail::__attr_with_type<::cudaDevAttrConcurrentKernels, bool> +struct __dev_attr<::cudaDevAttrConcurrentKernels> // + : __dev_attr_with_type<::cudaDevAttrConcurrentKernels, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrEccEnabled> // - : detail::__attr_with_type<::cudaDevAttrEccEnabled, bool> +struct __dev_attr<::cudaDevAttrEccEnabled> // + : __dev_attr_with_type<::cudaDevAttrEccEnabled, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrTccDriver> // - : detail::__attr_with_type<::cudaDevAttrTccDriver, bool> +struct __dev_attr<::cudaDevAttrTccDriver> // + : __dev_attr_with_type<::cudaDevAttrTccDriver, bool> {}; // TODO: give this a strong type for kilohertz template <> -struct device_ref::__attr<::cudaDevAttrMemoryClockRate> // - : detail::__attr_with_type<::cudaDevAttrMemoryClockRate, int> +struct __dev_attr<::cudaDevAttrMemoryClockRate> // + : __dev_attr_with_type<::cudaDevAttrMemoryClockRate, int> {}; // TODO: give this a strong type for bits template <> -struct device_ref::__attr<::cudaDevAttrGlobalMemoryBusWidth> // - : detail::__attr_with_type<::cudaDevAttrGlobalMemoryBusWidth, int> +struct __dev_attr<::cudaDevAttrGlobalMemoryBusWidth> // + : __dev_attr_with_type<::cudaDevAttrGlobalMemoryBusWidth, int> {}; // TODO: give this a strong type for bytes template <> -struct device_ref::__attr<::cudaDevAttrL2CacheSize> // - : detail::__attr_with_type<::cudaDevAttrL2CacheSize, int> +struct __dev_attr<::cudaDevAttrL2CacheSize> // + : __dev_attr_with_type<::cudaDevAttrL2CacheSize, int> {}; template <> -struct device_ref::__attr<::cudaDevAttrUnifiedAddressing> // - : detail::__attr_with_type<::cudaDevAttrUnifiedAddressing, bool> +struct __dev_attr<::cudaDevAttrUnifiedAddressing> // + : __dev_attr_with_type<::cudaDevAttrUnifiedAddressing, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrStreamPrioritiesSupported> // - : detail::__attr_with_type<::cudaDevAttrStreamPrioritiesSupported, bool> +struct __dev_attr<::cudaDevAttrStreamPrioritiesSupported> // + : __dev_attr_with_type<::cudaDevAttrStreamPrioritiesSupported, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrGlobalL1CacheSupported> // - : detail::__attr_with_type<::cudaDevAttrGlobalL1CacheSupported, bool> +struct __dev_attr<::cudaDevAttrGlobalL1CacheSupported> // + : __dev_attr_with_type<::cudaDevAttrGlobalL1CacheSupported, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrLocalL1CacheSupported> // - : detail::__attr_with_type<::cudaDevAttrLocalL1CacheSupported, bool> +struct __dev_attr<::cudaDevAttrLocalL1CacheSupported> // + : __dev_attr_with_type<::cudaDevAttrLocalL1CacheSupported, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrManagedMemory> // - : detail::__attr_with_type<::cudaDevAttrManagedMemory, bool> +struct __dev_attr<::cudaDevAttrManagedMemory> // + : __dev_attr_with_type<::cudaDevAttrManagedMemory, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrIsMultiGpuBoard> // - : detail::__attr_with_type<::cudaDevAttrIsMultiGpuBoard, bool> +struct __dev_attr<::cudaDevAttrIsMultiGpuBoard> // + : __dev_attr_with_type<::cudaDevAttrIsMultiGpuBoard, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrHostNativeAtomicSupported> // - : detail::__attr_with_type<::cudaDevAttrHostNativeAtomicSupported, bool> +struct __dev_attr<::cudaDevAttrHostNativeAtomicSupported> // + : __dev_attr_with_type<::cudaDevAttrHostNativeAtomicSupported, bool> {}; template <> -struct 
device_ref::__attr<::cudaDevAttrPageableMemoryAccess> // - : detail::__attr_with_type<::cudaDevAttrPageableMemoryAccess, bool> +struct __dev_attr<::cudaDevAttrPageableMemoryAccess> // + : __dev_attr_with_type<::cudaDevAttrPageableMemoryAccess, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrConcurrentManagedAccess> // - : detail::__attr_with_type<::cudaDevAttrConcurrentManagedAccess, bool> +struct __dev_attr<::cudaDevAttrConcurrentManagedAccess> // + : __dev_attr_with_type<::cudaDevAttrConcurrentManagedAccess, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrComputePreemptionSupported> // - : detail::__attr_with_type<::cudaDevAttrComputePreemptionSupported, bool> +struct __dev_attr<::cudaDevAttrComputePreemptionSupported> // + : __dev_attr_with_type<::cudaDevAttrComputePreemptionSupported, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrCanUseHostPointerForRegisteredMem> // - : detail::__attr_with_type<::cudaDevAttrCanUseHostPointerForRegisteredMem, bool> +struct __dev_attr<::cudaDevAttrCanUseHostPointerForRegisteredMem> // + : __dev_attr_with_type<::cudaDevAttrCanUseHostPointerForRegisteredMem, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrCooperativeLaunch> // - : detail::__attr_with_type<::cudaDevAttrCooperativeLaunch, bool> +struct __dev_attr<::cudaDevAttrCooperativeLaunch> // + : __dev_attr_with_type<::cudaDevAttrCooperativeLaunch, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrCooperativeMultiDeviceLaunch> // - : detail::__attr_with_type<::cudaDevAttrCooperativeMultiDeviceLaunch, bool> +struct __dev_attr<::cudaDevAttrCooperativeMultiDeviceLaunch> // + : __dev_attr_with_type<::cudaDevAttrCooperativeMultiDeviceLaunch, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrCanFlushRemoteWrites> // - : detail::__attr_with_type<::cudaDevAttrCanFlushRemoteWrites, bool> +struct __dev_attr<::cudaDevAttrCanFlushRemoteWrites> // + : __dev_attr_with_type<::cudaDevAttrCanFlushRemoteWrites, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrHostRegisterSupported> // - : detail::__attr_with_type<::cudaDevAttrHostRegisterSupported, bool> +struct __dev_attr<::cudaDevAttrHostRegisterSupported> // + : __dev_attr_with_type<::cudaDevAttrHostRegisterSupported, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrDirectManagedMemAccessFromHost> // - : detail::__attr_with_type<::cudaDevAttrDirectManagedMemAccessFromHost, bool> +struct __dev_attr<::cudaDevAttrDirectManagedMemAccessFromHost> // + : __dev_attr_with_type<::cudaDevAttrDirectManagedMemAccessFromHost, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrSparseCudaArraySupported> // - : detail::__attr_with_type<::cudaDevAttrSparseCudaArraySupported, bool> +struct __dev_attr<::cudaDevAttrSparseCudaArraySupported> // + : __dev_attr_with_type<::cudaDevAttrSparseCudaArraySupported, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrMemoryPoolsSupported> // - : detail::__attr_with_type<::cudaDevAttrMemoryPoolsSupported, bool> +struct __dev_attr<::cudaDevAttrMemoryPoolsSupported> // + : __dev_attr_with_type<::cudaDevAttrMemoryPoolsSupported, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrGPUDirectRDMASupported> // - : detail::__attr_with_type<::cudaDevAttrGPUDirectRDMASupported, bool> +struct __dev_attr<::cudaDevAttrGPUDirectRDMASupported> // + : __dev_attr_with_type<::cudaDevAttrGPUDirectRDMASupported, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrDeferredMappingCudaArraySupported> // - : 
detail::__attr_with_type<::cudaDevAttrDeferredMappingCudaArraySupported, bool> +struct __dev_attr<::cudaDevAttrDeferredMappingCudaArraySupported> // + : __dev_attr_with_type<::cudaDevAttrDeferredMappingCudaArraySupported, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrIpcEventSupport> // - : detail::__attr_with_type<::cudaDevAttrIpcEventSupport, bool> +struct __dev_attr<::cudaDevAttrIpcEventSupport> // + : __dev_attr_with_type<::cudaDevAttrIpcEventSupport, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrPageableMemoryAccessUsesHostPageTables> - : detail::__attr_with_type<::cudaDevAttrPageableMemoryAccessUsesHostPageTables, bool> +struct __dev_attr<::cudaDevAttrPageableMemoryAccessUsesHostPageTables> + : __dev_attr_with_type<::cudaDevAttrPageableMemoryAccessUsesHostPageTables, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrHostRegisterReadOnlySupported> // - : detail::__attr_with_type<::cudaDevAttrHostRegisterReadOnlySupported, bool> +struct __dev_attr<::cudaDevAttrHostRegisterReadOnlySupported> // + : __dev_attr_with_type<::cudaDevAttrHostRegisterReadOnlySupported, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrGPUDirectRDMAFlushWritesOptions> // - : detail::__attr_with_type<::cudaDevAttrGPUDirectRDMAFlushWritesOptions, ::cudaFlushGPUDirectRDMAWritesOptions> +struct __dev_attr<::cudaDevAttrGPUDirectRDMAFlushWritesOptions> // + : __dev_attr_with_type<::cudaDevAttrGPUDirectRDMAFlushWritesOptions, ::cudaFlushGPUDirectRDMAWritesOptions> { static constexpr type host = ::cudaFlushGPUDirectRDMAWritesOptionHost; static constexpr type mem_ops = ::cudaFlushGPUDirectRDMAWritesOptionMemOps; }; template <> -struct device_ref::__attr<::cudaDevAttrGPUDirectRDMAWritesOrdering> // - : detail::__attr_with_type<::cudaDevAttrGPUDirectRDMAWritesOrdering, ::cudaGPUDirectRDMAWritesOrdering> +struct __dev_attr<::cudaDevAttrGPUDirectRDMAWritesOrdering> // + : __dev_attr_with_type<::cudaDevAttrGPUDirectRDMAWritesOrdering, ::cudaGPUDirectRDMAWritesOrdering> { static constexpr type none = ::cudaGPUDirectRDMAWritesOrderingNone; static constexpr type owner = ::cudaGPUDirectRDMAWritesOrderingOwner; static constexpr type all_devices = ::cudaGPUDirectRDMAWritesOrderingAllDevices; }; -// TODO: This is a bitmask. What are the possible values? 
template <> -struct device_ref::__attr<::cudaDevAttrMemoryPoolSupportedHandleTypes> // - : detail::__attr_with_type<::cudaDevAttrMemoryPoolSupportedHandleTypes, unsigned int> -{}; +struct __dev_attr<::cudaDevAttrMemoryPoolSupportedHandleTypes> // + : __dev_attr_with_type<::cudaDevAttrMemoryPoolSupportedHandleTypes, ::cudaMemAllocationHandleType> +{ + static constexpr type none = ::cudaMemHandleTypeNone; + static constexpr type posix_file_descriptor = ::cudaMemHandleTypePosixFileDescriptor; + static constexpr type win32 = ::cudaMemHandleTypeWin32; + static constexpr type win32_kmt = ::cudaMemHandleTypeWin32Kmt; +#if CUDART_VERSION >= 12040 + static constexpr type fabric = ::cudaMemHandleTypeFabric; +#else + static constexpr type fabric = static_cast<::cudaMemAllocationHandleType>(0x8); +#endif +}; #if CUDART_VERSION >= 12020 template <> -struct device_ref::__attr<::cudaDevAttrNumaConfig> // - : detail::__attr_with_type<::cudaDevAttrNumaConfig, ::cudaDeviceNumaConfig> +struct __dev_attr<::cudaDevAttrNumaConfig> // + : __dev_attr_with_type<::cudaDevAttrNumaConfig, ::cudaDeviceNumaConfig> { static constexpr type none = ::cudaDeviceNumaConfigNone; static constexpr type numa_node = ::cudaDeviceNumaConfigNumaNode; }; #endif +} // namespace detail -struct device_ref::attrs +struct device::attrs { // Maximum number of threads per block - using max_threads_per_block_t = __attr<::cudaDevAttrMaxThreadsPerBlock>; + using max_threads_per_block_t = detail::__dev_attr<::cudaDevAttrMaxThreadsPerBlock>; static constexpr max_threads_per_block_t max_threads_per_block{}; // Maximum x-dimension of a block - using max_block_dim_x_t = __attr<::cudaDevAttrMaxBlockDimX>; + using max_block_dim_x_t = detail::__dev_attr<::cudaDevAttrMaxBlockDimX>; static constexpr max_block_dim_x_t max_block_dim_x{}; // Maximum y-dimension of a block - using max_block_dim_y_t = __attr<::cudaDevAttrMaxBlockDimY>; + using max_block_dim_y_t = detail::__dev_attr<::cudaDevAttrMaxBlockDimY>; static constexpr max_block_dim_y_t max_block_dim_y{}; // Maximum z-dimension of a block - using max_block_dim_z_t = __attr<::cudaDevAttrMaxBlockDimZ>; + using max_block_dim_z_t = detail::__dev_attr<::cudaDevAttrMaxBlockDimZ>; static constexpr max_block_dim_z_t max_block_dim_z{}; // Maximum x-dimension of a grid - using max_grid_dim_x_t = __attr<::cudaDevAttrMaxGridDimX>; + using max_grid_dim_x_t = detail::__dev_attr<::cudaDevAttrMaxGridDimX>; static constexpr max_grid_dim_x_t max_grid_dim_x{}; // Maximum y-dimension of a grid - using max_grid_dim_y_t = __attr<::cudaDevAttrMaxGridDimY>; + using max_grid_dim_y_t = detail::__dev_attr<::cudaDevAttrMaxGridDimY>; static constexpr max_grid_dim_y_t max_grid_dim_y{}; // Maximum z-dimension of a grid - using max_grid_dim_z_t = __attr<::cudaDevAttrMaxGridDimZ>; + using max_grid_dim_z_t = detail::__dev_attr<::cudaDevAttrMaxGridDimZ>; static constexpr max_grid_dim_z_t max_grid_dim_z{}; // Maximum amount of shared memory available to a thread block in bytes - using max_shared_memory_per_block_t = __attr<::cudaDevAttrMaxSharedMemoryPerBlock>; + using max_shared_memory_per_block_t = detail::__dev_attr<::cudaDevAttrMaxSharedMemoryPerBlock>; static constexpr max_shared_memory_per_block_t max_shared_memory_per_block{}; // Memory available on device for __constant__ variables in a CUDA C kernel in bytes - using total_constant_memory_t = __attr<::cudaDevAttrTotalConstantMemory>; + using total_constant_memory_t = detail::__dev_attr<::cudaDevAttrTotalConstantMemory>; static constexpr total_constant_memory_t 
total_constant_memory{}; // Warp size in threads - using warp_size_t = __attr<::cudaDevAttrWarpSize>; + using warp_size_t = detail::__dev_attr<::cudaDevAttrWarpSize>; static constexpr warp_size_t warp_size{}; // Maximum pitch in bytes allowed by the memory copy functions that involve // memory regions allocated through cudaMallocPitch() - using max_pitch_t = __attr<::cudaDevAttrMaxPitch>; + using max_pitch_t = detail::__dev_attr<::cudaDevAttrMaxPitch>; static constexpr max_pitch_t max_pitch{}; // Maximum 1D texture width - using max_texture_1d_width_t = __attr<::cudaDevAttrMaxTexture1DWidth>; + using max_texture_1d_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture1DWidth>; static constexpr max_texture_1d_width_t max_texture_1d_width{}; // Maximum width for a 1D texture bound to linear memory - using max_texture_1d_linear_width_t = __attr<::cudaDevAttrMaxTexture1DLinearWidth>; + using max_texture_1d_linear_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture1DLinearWidth>; static constexpr max_texture_1d_linear_width_t max_texture_1d_linear_width{}; // Maximum mipmapped 1D texture width - using max_texture_1d_mipmapped_width_t = __attr<::cudaDevAttrMaxTexture1DMipmappedWidth>; + using max_texture_1d_mipmapped_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture1DMipmappedWidth>; static constexpr max_texture_1d_mipmapped_width_t max_texture_1d_mipmapped_width{}; // Maximum 2D texture width - using max_texture_2d_width_t = __attr<::cudaDevAttrMaxTexture2DWidth>; + using max_texture_2d_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DWidth>; static constexpr max_texture_2d_width_t max_texture_2d_width{}; // Maximum 2D texture height - using max_texture_2d_height_t = __attr<::cudaDevAttrMaxTexture2DHeight>; + using max_texture_2d_height_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DHeight>; static constexpr max_texture_2d_height_t max_texture_2d_height{}; // Maximum width for a 2D texture bound to linear memory - using max_texture_2d_linear_width_t = __attr<::cudaDevAttrMaxTexture2DLinearWidth>; + using max_texture_2d_linear_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DLinearWidth>; static constexpr max_texture_2d_linear_width_t max_texture_2d_linear_width{}; // Maximum height for a 2D texture bound to linear memory - using max_texture_2d_linear_height_t = __attr<::cudaDevAttrMaxTexture2DLinearHeight>; + using max_texture_2d_linear_height_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DLinearHeight>; static constexpr max_texture_2d_linear_height_t max_texture_2d_linear_height{}; // Maximum pitch in bytes for a 2D texture bound to linear memory - using max_texture_2d_linear_pitch_t = __attr<::cudaDevAttrMaxTexture2DLinearPitch>; + using max_texture_2d_linear_pitch_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DLinearPitch>; static constexpr max_texture_2d_linear_pitch_t max_texture_2d_linear_pitch{}; // Maximum mipmapped 2D texture width - using max_texture_2d_mipmapped_width_t = __attr<::cudaDevAttrMaxTexture2DMipmappedWidth>; + using max_texture_2d_mipmapped_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DMipmappedWidth>; static constexpr max_texture_2d_mipmapped_width_t max_texture_2d_mipmapped_width{}; // Maximum mipmapped 2D texture height - using max_texture_2d_mipmapped_height_t = __attr<::cudaDevAttrMaxTexture2DMipmappedHeight>; + using max_texture_2d_mipmapped_height_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DMipmappedHeight>; static constexpr max_texture_2d_mipmapped_height_t max_texture_2d_mipmapped_height{}; // Maximum 3D texture width - using 
max_texture_3d_width_t = __attr<::cudaDevAttrMaxTexture3DWidth>; + using max_texture_3d_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture3DWidth>; static constexpr max_texture_3d_width_t max_texture_3d_width{}; // Maximum 3D texture height - using max_texture_3d_height_t = __attr<::cudaDevAttrMaxTexture3DHeight>; + using max_texture_3d_height_t = detail::__dev_attr<::cudaDevAttrMaxTexture3DHeight>; static constexpr max_texture_3d_height_t max_texture_3d_height{}; // Maximum 3D texture depth - using max_texture_3d_depth_t = __attr<::cudaDevAttrMaxTexture3DDepth>; + using max_texture_3d_depth_t = detail::__dev_attr<::cudaDevAttrMaxTexture3DDepth>; static constexpr max_texture_3d_depth_t max_texture_3d_depth{}; // Alternate maximum 3D texture width, 0 if no alternate maximum 3D texture size is supported - using max_texture_3d_width_alt_t = __attr<::cudaDevAttrMaxTexture3DWidthAlt>; + using max_texture_3d_width_alt_t = detail::__dev_attr<::cudaDevAttrMaxTexture3DWidthAlt>; static constexpr max_texture_3d_width_alt_t max_texture_3d_width_alt{}; // Alternate maximum 3D texture height, 0 if no alternate maximum 3D texture size is supported - using max_texture_3d_height_alt_t = __attr<::cudaDevAttrMaxTexture3DHeightAlt>; + using max_texture_3d_height_alt_t = detail::__dev_attr<::cudaDevAttrMaxTexture3DHeightAlt>; static constexpr max_texture_3d_height_alt_t max_texture_3d_height_alt{}; // Alternate maximum 3D texture depth, 0 if no alternate maximum 3D texture size is supported - using max_texture_3d_depth_alt_t = __attr<::cudaDevAttrMaxTexture3DDepthAlt>; + using max_texture_3d_depth_alt_t = detail::__dev_attr<::cudaDevAttrMaxTexture3DDepthAlt>; static constexpr max_texture_3d_depth_alt_t max_texture_3d_depth_alt{}; // Maximum cubemap texture width or height - using max_texture_cubemap_width_t = __attr<::cudaDevAttrMaxTextureCubemapWidth>; + using max_texture_cubemap_width_t = detail::__dev_attr<::cudaDevAttrMaxTextureCubemapWidth>; static constexpr max_texture_cubemap_width_t max_texture_cubemap_width{}; // Maximum 1D layered texture width - using max_texture_1d_layered_width_t = __attr<::cudaDevAttrMaxTexture1DLayeredWidth>; + using max_texture_1d_layered_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture1DLayeredWidth>; static constexpr max_texture_1d_layered_width_t max_texture_1d_layered_width{}; // Maximum layers in a 1D layered texture - using max_texture_1d_layered_layers_t = __attr<::cudaDevAttrMaxTexture1DLayeredLayers>; + using max_texture_1d_layered_layers_t = detail::__dev_attr<::cudaDevAttrMaxTexture1DLayeredLayers>; static constexpr max_texture_1d_layered_layers_t max_texture_1d_layered_layers{}; // Maximum 2D layered texture width - using max_texture_2d_layered_width_t = __attr<::cudaDevAttrMaxTexture2DLayeredWidth>; + using max_texture_2d_layered_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DLayeredWidth>; static constexpr max_texture_2d_layered_width_t max_texture_2d_layered_width{}; // Maximum 2D layered texture height - using max_texture_2d_layered_height_t = __attr<::cudaDevAttrMaxTexture2DLayeredHeight>; + using max_texture_2d_layered_height_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DLayeredHeight>; static constexpr max_texture_2d_layered_height_t max_texture_2d_layered_height{}; // Maximum layers in a 2D layered texture - using max_texture_2d_layered_layers_t = __attr<::cudaDevAttrMaxTexture2DLayeredLayers>; + using max_texture_2d_layered_layers_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DLayeredLayers>; static constexpr max_texture_2d_layered_layers_t 
max_texture_2d_layered_layers{}; // Maximum cubemap layered texture width or height - using max_texture_cubemap_layered_width_t = __attr<::cudaDevAttrMaxTextureCubemapLayeredWidth>; + using max_texture_cubemap_layered_width_t = detail::__dev_attr<::cudaDevAttrMaxTextureCubemapLayeredWidth>; static constexpr max_texture_cubemap_layered_width_t max_texture_cubemap_layered_width{}; // Maximum layers in a cubemap layered texture - using max_texture_cubemap_layered_layers_t = __attr<::cudaDevAttrMaxTextureCubemapLayeredLayers>; + using max_texture_cubemap_layered_layers_t = detail::__dev_attr<::cudaDevAttrMaxTextureCubemapLayeredLayers>; static constexpr max_texture_cubemap_layered_layers_t max_texture_cubemap_layered_layers{}; // Maximum 1D surface width - using max_surface_1d_width_t = __attr<::cudaDevAttrMaxSurface1DWidth>; + using max_surface_1d_width_t = detail::__dev_attr<::cudaDevAttrMaxSurface1DWidth>; static constexpr max_surface_1d_width_t max_surface_1d_width{}; // Maximum 2D surface width - using max_surface_2d_width_t = __attr<::cudaDevAttrMaxSurface2DWidth>; + using max_surface_2d_width_t = detail::__dev_attr<::cudaDevAttrMaxSurface2DWidth>; static constexpr max_surface_2d_width_t max_surface_2d_width{}; // Maximum 2D surface height - using max_surface_2d_height_t = __attr<::cudaDevAttrMaxSurface2DHeight>; + using max_surface_2d_height_t = detail::__dev_attr<::cudaDevAttrMaxSurface2DHeight>; static constexpr max_surface_2d_height_t max_surface_2d_height{}; // Maximum 3D surface width - using max_surface_3d_width_t = __attr<::cudaDevAttrMaxSurface3DWidth>; + using max_surface_3d_width_t = detail::__dev_attr<::cudaDevAttrMaxSurface3DWidth>; static constexpr max_surface_3d_width_t max_surface_3d_width{}; // Maximum 3D surface height - using max_surface_3d_height_t = __attr<::cudaDevAttrMaxSurface3DHeight>; + using max_surface_3d_height_t = detail::__dev_attr<::cudaDevAttrMaxSurface3DHeight>; static constexpr max_surface_3d_height_t max_surface_3d_height{}; // Maximum 3D surface depth - using max_surface_3d_depth_t = __attr<::cudaDevAttrMaxSurface3DDepth>; + using max_surface_3d_depth_t = detail::__dev_attr<::cudaDevAttrMaxSurface3DDepth>; static constexpr max_surface_3d_depth_t max_surface_3d_depth{}; // Maximum 1D layered surface width - using max_surface_1d_layered_width_t = __attr<::cudaDevAttrMaxSurface1DLayeredWidth>; + using max_surface_1d_layered_width_t = detail::__dev_attr<::cudaDevAttrMaxSurface1DLayeredWidth>; static constexpr max_surface_1d_layered_width_t max_surface_1d_layered_width{}; // Maximum layers in a 1D layered surface - using max_surface_1d_layered_layers_t = __attr<::cudaDevAttrMaxSurface1DLayeredLayers>; + using max_surface_1d_layered_layers_t = detail::__dev_attr<::cudaDevAttrMaxSurface1DLayeredLayers>; static constexpr max_surface_1d_layered_layers_t max_surface_1d_layered_layers{}; // Maximum 2D layered surface width - using max_surface_2d_layered_width_t = __attr<::cudaDevAttrMaxSurface2DLayeredWidth>; + using max_surface_2d_layered_width_t = detail::__dev_attr<::cudaDevAttrMaxSurface2DLayeredWidth>; static constexpr max_surface_2d_layered_width_t max_surface_2d_layered_width{}; // Maximum 2D layered surface height - using max_surface_2d_layered_height_t = __attr<::cudaDevAttrMaxSurface2DLayeredHeight>; + using max_surface_2d_layered_height_t = detail::__dev_attr<::cudaDevAttrMaxSurface2DLayeredHeight>; static constexpr max_surface_2d_layered_height_t max_surface_2d_layered_height{}; // Maximum layers in a 2D layered surface - using 
max_surface_2d_layered_layers_t = __attr<::cudaDevAttrMaxSurface2DLayeredLayers>; + using max_surface_2d_layered_layers_t = detail::__dev_attr<::cudaDevAttrMaxSurface2DLayeredLayers>; static constexpr max_surface_2d_layered_layers_t max_surface_2d_layered_layers{}; // Maximum cubemap surface width - using max_surface_cubemap_width_t = __attr<::cudaDevAttrMaxSurfaceCubemapWidth>; + using max_surface_cubemap_width_t = detail::__dev_attr<::cudaDevAttrMaxSurfaceCubemapWidth>; static constexpr max_surface_cubemap_width_t max_surface_cubemap_width{}; // Maximum cubemap layered surface width - using max_surface_cubemap_layered_width_t = __attr<::cudaDevAttrMaxSurfaceCubemapLayeredWidth>; + using max_surface_cubemap_layered_width_t = detail::__dev_attr<::cudaDevAttrMaxSurfaceCubemapLayeredWidth>; static constexpr max_surface_cubemap_layered_width_t max_surface_cubemap_layered_width{}; // Maximum layers in a cubemap layered surface - using max_surface_cubemap_layered_layers_t = __attr<::cudaDevAttrMaxSurfaceCubemapLayeredLayers>; + using max_surface_cubemap_layered_layers_t = detail::__dev_attr<::cudaDevAttrMaxSurfaceCubemapLayeredLayers>; static constexpr max_surface_cubemap_layered_layers_t max_surface_cubemap_layered_layers{}; // Maximum number of 32-bit registers available to a thread block - using max_registers_per_block_t = __attr<::cudaDevAttrMaxRegistersPerBlock>; + using max_registers_per_block_t = detail::__dev_attr<::cudaDevAttrMaxRegistersPerBlock>; static constexpr max_registers_per_block_t max_registers_per_block{}; // Peak clock frequency in kilohertz - using clock_rate_t = __attr<::cudaDevAttrClockRate>; + using clock_rate_t = detail::__dev_attr<::cudaDevAttrClockRate>; static constexpr clock_rate_t clock_rate{}; // Alignment requirement; texture base addresses aligned to textureAlign bytes // do not need an offset applied to texture fetches - using texture_alignment_t = __attr<::cudaDevAttrTextureAlignment>; + using texture_alignment_t = detail::__dev_attr<::cudaDevAttrTextureAlignment>; static constexpr texture_alignment_t texture_alignment{}; // Pitch alignment requirement for 2D texture references bound to pitched memory - using texture_pitch_alignment_t = __attr<::cudaDevAttrTexturePitchAlignment>; + using texture_pitch_alignment_t = detail::__dev_attr<::cudaDevAttrTexturePitchAlignment>; static constexpr texture_pitch_alignment_t texture_pitch_alignment{}; // true if the device can concurrently copy memory between host and device // while executing a kernel, or false if not - using gpu_overlap_t = __attr<::cudaDevAttrGpuOverlap>; + using gpu_overlap_t = detail::__dev_attr<::cudaDevAttrGpuOverlap>; static constexpr gpu_overlap_t gpu_overlap{}; // Number of multiprocessors on the device - using multi_processor_count_t = __attr<::cudaDevAttrMultiProcessorCount>; + using multi_processor_count_t = detail::__dev_attr<::cudaDevAttrMultiProcessorCount>; static constexpr multi_processor_count_t multi_processor_count{}; // true if there is a run time limit for kernels executed on the device, or // false if not - using kernel_exec_timeout_t = __attr<::cudaDevAttrKernelExecTimeout>; + using kernel_exec_timeout_t = detail::__dev_attr<::cudaDevAttrKernelExecTimeout>; static constexpr kernel_exec_timeout_t kernel_exec_timeout{}; // true if the device is integrated with the memory subsystem, or false if not - using integrated_t = __attr<::cudaDevAttrIntegrated>; + using integrated_t = detail::__dev_attr<::cudaDevAttrIntegrated>; static constexpr integrated_t integrated{}; // true if the d - 
using can_map_host_memory_t = __attr<::cudaDevAttrCanMapHostMemory>; + using can_map_host_memory_t = detail::__dev_attr<::cudaDevAttrCanMapHostMemory>; static constexpr can_map_host_memory_t can_map_host_memory{}; // Compute mode is the compute mode that the device is currently in. - using compute_mode_t = __attr<::cudaDevAttrComputeMode>; + using compute_mode_t = detail::__dev_attr<::cudaDevAttrComputeMode>; static constexpr compute_mode_t compute_mode{}; // true if the device supports executing multiple kernels within the same // context simultaneously, or false if not. It is not guaranteed that multiple // kernels will be resident on the device concurrently so this feature should // not be relied upon for correctness. - using concurrent_kernels_t = __attr<::cudaDevAttrConcurrentKernels>; + using concurrent_kernels_t = detail::__dev_attr<::cudaDevAttrConcurrentKernels>; static constexpr concurrent_kernels_t concurrent_kernels{}; // true if error correction is enabled on the device, 0 if error correction is // disabled or not supported by the device - using ecc_enabled_t = __attr<::cudaDevAttrEccEnabled>; + using ecc_enabled_t = detail::__dev_attr<::cudaDevAttrEccEnabled>; static constexpr ecc_enabled_t ecc_enabled{}; // PCI bus identifier of the device - using pci_bus_id_t = __attr<::cudaDevAttrPciBusId>; + using pci_bus_id_t = detail::__dev_attr<::cudaDevAttrPciBusId>; static constexpr pci_bus_id_t pci_bus_id{}; // PCI device (also known as slot) identifier of the device - using pci_device_id_t = __attr<::cudaDevAttrPciDeviceId>; + using pci_device_id_t = detail::__dev_attr<::cudaDevAttrPciDeviceId>; static constexpr pci_device_id_t pci_device_id{}; // true if the device is using a TCC driver. TCC is only available on Tesla // hardware running Windows Vista or later. - using tcc_driver_t = __attr<::cudaDevAttrTccDriver>; + using tcc_driver_t = detail::__dev_attr<::cudaDevAttrTccDriver>; static constexpr tcc_driver_t tcc_driver{}; // Peak memory clock frequency in kilohertz - using memory_clock_rate_t = __attr<::cudaDevAttrMemoryClockRate>; + using memory_clock_rate_t = detail::__dev_attr<::cudaDevAttrMemoryClockRate>; static constexpr memory_clock_rate_t memory_clock_rate{}; // Global memory bus width in bits - using global_memory_bus_width_t = __attr<::cudaDevAttrGlobalMemoryBusWidth>; + using global_memory_bus_width_t = detail::__dev_attr<::cudaDevAttrGlobalMemoryBusWidth>; static constexpr global_memory_bus_width_t global_memory_bus_width{}; // Size of L2 cache in bytes. 0 if the device doesn't have L2 cache. 
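// Each of the *_t aliases in this struct is an empty tag type; its constexpr instance can be
// passed to device_ref::attr(), or the attribute can be queried by enumerator directly.
// An illustrative sketch, not part of this patch (assumptions: a valid device ordinal 0, an
// initialized CUDA runtime, and the cudax device headers changed in this patch included):
inline void example_query_basic_attributes()
{
  cuda::experimental::device_ref dev{0};
  // Tag-object form: the result type is taken from the attribute's __dev_attr specialization.
  int max_threads = dev.attr(cuda::experimental::device::attrs::max_threads_per_block);
  // Enumerator form: equivalent, spelled with the raw ::cudaDeviceAttr value.
  int sm_count = dev.attr<::cudaDevAttrMultiProcessorCount>();
  (void) max_threads;
  (void) sm_count;
}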
- using l2_cache_size_t = __attr<::cudaDevAttrL2CacheSize>; + using l2_cache_size_t = detail::__dev_attr<::cudaDevAttrL2CacheSize>; static constexpr l2_cache_size_t l2_cache_size{}; // Maximum resident threads per multiprocessor - using max_threads_per_multi_processor_t = __attr<::cudaDevAttrMaxThreadsPerMultiProcessor>; + using max_threads_per_multi_processor_t = detail::__dev_attr<::cudaDevAttrMaxThreadsPerMultiProcessor>; static constexpr max_threads_per_multi_processor_t max_threads_per_multi_processor{}; // true if the device shares a unified address space with the host, or false // if not - using unified_addressing_t = __attr<::cudaDevAttrUnifiedAddressing>; + using unified_addressing_t = detail::__dev_attr<::cudaDevAttrUnifiedAddressing>; static constexpr unified_addressing_t unified_addressing{}; // Major compute capability version number - using compute_capability_major_t = __attr<::cudaDevAttrComputeCapabilityMajor>; + using compute_capability_major_t = detail::__dev_attr<::cudaDevAttrComputeCapabilityMajor>; static constexpr compute_capability_major_t compute_capability_major{}; // Minor compute capability version number - using compute_capability_minor_t = __attr<::cudaDevAttrComputeCapabilityMinor>; + using compute_capability_minor_t = detail::__dev_attr<::cudaDevAttrComputeCapabilityMinor>; static constexpr compute_capability_minor_t compute_capability_minor{}; // true if the device supports stream priorities, or false if not - using stream_priorities_supported_t = __attr<::cudaDevAttrStreamPrioritiesSupported>; + using stream_priorities_supported_t = detail::__dev_attr<::cudaDevAttrStreamPrioritiesSupported>; static constexpr stream_priorities_supported_t stream_priorities_supported{}; // true if device supports caching globals in L1 cache, false if not - using global_l1_cache_supported_t = __attr<::cudaDevAttrGlobalL1CacheSupported>; + using global_l1_cache_supported_t = detail::__dev_attr<::cudaDevAttrGlobalL1CacheSupported>; static constexpr global_l1_cache_supported_t global_l1_cache_supported{}; // true if device supports caching locals in L1 cache, false if not - using local_l1_cache_supported_t = __attr<::cudaDevAttrLocalL1CacheSupported>; + using local_l1_cache_supported_t = detail::__dev_attr<::cudaDevAttrLocalL1CacheSupported>; static constexpr local_l1_cache_supported_t local_l1_cache_supported{}; // Maximum amount of shared memory available to a multiprocessor in bytes; // this amount is shared by all thread blocks simultaneously resident on a // multiprocessor - using max_shared_memory_per_multiprocessor_t = __attr<::cudaDevAttrMaxSharedMemoryPerMultiprocessor>; + using max_shared_memory_per_multiprocessor_t = detail::__dev_attr<::cudaDevAttrMaxSharedMemoryPerMultiprocessor>; static constexpr max_shared_memory_per_multiprocessor_t max_shared_memory_per_multiprocessor{}; // Maximum number of 32-bit registers available to a multiprocessor; this // number is shared by all thread blocks simultaneously resident on a // multiprocessor - using max_registers_per_multiprocessor_t = __attr<::cudaDevAttrMaxRegistersPerMultiprocessor>; + using max_registers_per_multiprocessor_t = detail::__dev_attr<::cudaDevAttrMaxRegistersPerMultiprocessor>; static constexpr max_registers_per_multiprocessor_t max_registers_per_multiprocessor{}; // true if device supports allocating managed memory, false if not - using managed_memory_t = __attr<::cudaDevAttrManagedMemory>; + using managed_memory_t = detail::__dev_attr<::cudaDevAttrManagedMemory>; static constexpr managed_memory_t 
managed_memory{}; // true if device is on a multi-GPU board, false if not - using is_multi_gpu_board_t = __attr<::cudaDevAttrIsMultiGpuBoard>; + using is_multi_gpu_board_t = detail::__dev_attr<::cudaDevAttrIsMultiGpuBoard>; static constexpr is_multi_gpu_board_t is_multi_gpu_board{}; // Unique identifier for a group of devices on the same multi-GPU board - using multi_gpu_board_group_id_t = __attr<::cudaDevAttrMultiGpuBoardGroupID>; + using multi_gpu_board_group_id_t = detail::__dev_attr<::cudaDevAttrMultiGpuBoardGroupID>; static constexpr multi_gpu_board_group_id_t multi_gpu_board_group_id{}; // true if the link between the device and the host supports native atomic // operations - using host_native_atomic_supported_t = __attr<::cudaDevAttrHostNativeAtomicSupported>; + using host_native_atomic_supported_t = detail::__dev_attr<::cudaDevAttrHostNativeAtomicSupported>; static constexpr host_native_atomic_supported_t host_native_atomic_supported{}; // Ratio of single precision performance (in floating-point operations per // second) to double precision performance - using single_to_double_precision_perf_ratio_t = __attr<::cudaDevAttrSingleToDoublePrecisionPerfRatio>; + using single_to_double_precision_perf_ratio_t = detail::__dev_attr<::cudaDevAttrSingleToDoublePrecisionPerfRatio>; static constexpr single_to_double_precision_perf_ratio_t single_to_double_precision_perf_ratio{}; // true if the device supports coherently accessing pageable memory without // calling cudaHostRegister on it, and false otherwise - using pageable_memory_access_t = __attr<::cudaDevAttrPageableMemoryAccess>; + using pageable_memory_access_t = detail::__dev_attr<::cudaDevAttrPageableMemoryAccess>; static constexpr pageable_memory_access_t pageable_memory_access{}; // true if the device can coherently access managed memory concurrently with // the CPU, and false otherwise - using concurrent_managed_access_t = __attr<::cudaDevAttrConcurrentManagedAccess>; + using concurrent_managed_access_t = detail::__dev_attr<::cudaDevAttrConcurrentManagedAccess>; static constexpr concurrent_managed_access_t concurrent_managed_access{}; // true if the device supports Compute Preemption, false if not - using compute_preemption_supported_t = __attr<::cudaDevAttrComputePreemptionSupported>; + using compute_preemption_supported_t = detail::__dev_attr<::cudaDevAttrComputePreemptionSupported>; static constexpr compute_preemption_supported_t compute_preemption_supported{}; // true if the device can access host registered memory at the same virtual // address as the CPU, and false otherwise - using can_use_host_pointer_for_registered_mem_t = __attr<::cudaDevAttrCanUseHostPointerForRegisteredMem>; + using can_use_host_pointer_for_registered_mem_t = detail::__dev_attr<::cudaDevAttrCanUseHostPointerForRegisteredMem>; static constexpr can_use_host_pointer_for_registered_mem_t can_use_host_pointer_for_registered_mem{}; // true if the device supports launching cooperative kernels via // cudaLaunchCooperativeKernel, and false otherwise - using cooperative_launch_t = __attr<::cudaDevAttrCooperativeLaunch>; + using cooperative_launch_t = detail::__dev_attr<::cudaDevAttrCooperativeLaunch>; static constexpr cooperative_launch_t cooperative_launch{}; // true if the device supports launching cooperative kernels via // cudaLaunchCooperativeKernelMultiDevice, and false otherwise - using cooperative_multi_device_launch_t = __attr<::cudaDevAttrCooperativeMultiDeviceLaunch>; + using cooperative_multi_device_launch_t = 
detail::__dev_attr<::cudaDevAttrCooperativeMultiDeviceLaunch>; static constexpr cooperative_multi_device_launch_t cooperative_multi_device_launch{}; // true if the device supports flushing of outstanding remote writes, and // false otherwise - using can_flush_remote_writes_t = __attr<::cudaDevAttrCanFlushRemoteWrites>; + using can_flush_remote_writes_t = detail::__dev_attr<::cudaDevAttrCanFlushRemoteWrites>; static constexpr can_flush_remote_writes_t can_flush_remote_writes{}; // true if the device supports host memory registration via cudaHostRegister, // and false otherwise - using host_register_supported_t = __attr<::cudaDevAttrHostRegisterSupported>; + using host_register_supported_t = detail::__dev_attr<::cudaDevAttrHostRegisterSupported>; static constexpr host_register_supported_t host_register_supported{}; // true if the device accesses pageable memory via the host's page tables, and // false otherwise - using pageable_memory_access_uses_host_page_tables_t = __attr<::cudaDevAttrPageableMemoryAccessUsesHostPageTables>; + using pageable_memory_access_uses_host_page_tables_t = + detail::__dev_attr<::cudaDevAttrPageableMemoryAccessUsesHostPageTables>; static constexpr pageable_memory_access_uses_host_page_tables_t pageable_memory_access_uses_host_page_tables{}; // true if the host can directly access managed memory on the device without // migration, and false otherwise - using direct_managed_mem_access_from_host_t = __attr<::cudaDevAttrDirectManagedMemAccessFromHost>; + using direct_managed_mem_access_from_host_t = detail::__dev_attr<::cudaDevAttrDirectManagedMemAccessFromHost>; static constexpr direct_managed_mem_access_from_host_t direct_managed_mem_access_from_host{}; // Maximum per block shared memory size on the device. This value can be opted // into when using cudaFuncSetAttribute - using max_shared_memory_per_block_optin_t = __attr<::cudaDevAttrMaxSharedMemoryPerBlockOptin>; + using max_shared_memory_per_block_optin_t = detail::__dev_attr<::cudaDevAttrMaxSharedMemoryPerBlockOptin>; static constexpr max_shared_memory_per_block_optin_t max_shared_memory_per_block_optin{}; // Maximum number of thread blocks that can reside on a multiprocessor - using max_blocks_per_multiprocessor_t = __attr<::cudaDevAttrMaxBlocksPerMultiprocessor>; + using max_blocks_per_multiprocessor_t = detail::__dev_attr<::cudaDevAttrMaxBlocksPerMultiprocessor>; static constexpr max_blocks_per_multiprocessor_t max_blocks_per_multiprocessor{}; // Maximum L2 persisting lines capacity setting in bytes - using max_persisting_l2_cache_size_t = __attr<::cudaDevAttrMaxPersistingL2CacheSize>; + using max_persisting_l2_cache_size_t = detail::__dev_attr<::cudaDevAttrMaxPersistingL2CacheSize>; static constexpr max_persisting_l2_cache_size_t max_persisting_l2_cache_size{}; // Maximum value of cudaAccessPolicyWindow::num_bytes - using max_access_policy_window_size_t = __attr<::cudaDevAttrMaxAccessPolicyWindowSize>; + using max_access_policy_window_size_t = detail::__dev_attr<::cudaDevAttrMaxAccessPolicyWindowSize>; static constexpr max_access_policy_window_size_t max_access_policy_window_size{}; // Shared memory reserved by CUDA driver per block in bytes - using reserved_shared_memory_per_block_t = __attr<::cudaDevAttrReservedSharedMemoryPerBlock>; + using reserved_shared_memory_per_block_t = detail::__dev_attr<::cudaDevAttrReservedSharedMemoryPerBlock>; static constexpr reserved_shared_memory_per_block_t reserved_shared_memory_per_block{}; // true if the device supports sparse CUDA arrays and sparse CUDA mipmapped arrays. 
- using sparse_cuda_array_supported_t = __attr<::cudaDevAttrSparseCudaArraySupported>; + using sparse_cuda_array_supported_t = detail::__dev_attr<::cudaDevAttrSparseCudaArraySupported>; static constexpr sparse_cuda_array_supported_t sparse_cuda_array_supported{}; // Device supports using the cudaHostRegister flag cudaHostRegisterReadOnly to // register memory that must be mapped as read-only to the GPU - using host_register_read_only_supported_t = __attr<::cudaDevAttrHostRegisterReadOnlySupported>; + using host_register_read_only_supported_t = detail::__dev_attr<::cudaDevAttrHostRegisterReadOnlySupported>; static constexpr host_register_read_only_supported_t host_register_read_only_supported{}; // true if the device supports using the cudaMallocAsync and cudaMemPool // family of APIs, and false otherwise - using memory_pools_supported_t = __attr<::cudaDevAttrMemoryPoolsSupported>; + using memory_pools_supported_t = detail::__dev_attr<::cudaDevAttrMemoryPoolsSupported>; static constexpr memory_pools_supported_t memory_pools_supported{}; // true if the device supports GPUDirect RDMA APIs, and false otherwise - using gpu_direct_rdma_supported_t = __attr<::cudaDevAttrGPUDirectRDMASupported>; + using gpu_direct_rdma_supported_t = detail::__dev_attr<::cudaDevAttrGPUDirectRDMASupported>; static constexpr gpu_direct_rdma_supported_t gpu_direct_rdma_supported{}; // bitmask to be interpreted according to the // cudaFlushGPUDirectRDMAWritesOptions enum - using gpu_direct_rdma_flush_writes_options_t = __attr<::cudaDevAttrGPUDirectRDMAFlushWritesOptions>; + using gpu_direct_rdma_flush_writes_options_t = detail::__dev_attr<::cudaDevAttrGPUDirectRDMAFlushWritesOptions>; static constexpr gpu_direct_rdma_flush_writes_options_t gpu_direct_rdma_flush_writes_options{}; // see the cudaGPUDirectRDMAWritesOrdering enum for numerical values - using gpu_direct_rdma_writes_ordering_t = __attr<::cudaDevAttrGPUDirectRDMAWritesOrdering>; + using gpu_direct_rdma_writes_ordering_t = detail::__dev_attr<::cudaDevAttrGPUDirectRDMAWritesOrdering>; static constexpr gpu_direct_rdma_writes_ordering_t gpu_direct_rdma_writes_ordering{}; // Bitmask of handle types supported with mempool based IPC - using memory_pool_supported_handle_types_t = __attr<::cudaDevAttrMemoryPoolSupportedHandleTypes>; + using memory_pool_supported_handle_types_t = detail::__dev_attr<::cudaDevAttrMemoryPoolSupportedHandleTypes>; static constexpr memory_pool_supported_handle_types_t memory_pool_supported_handle_types{}; // true if the device supports deferred mapping CUDA arrays and CUDA mipmapped // arrays. - using deferred_mapping_cuda_array_supported_t = __attr<::cudaDevAttrDeferredMappingCudaArraySupported>; + using deferred_mapping_cuda_array_supported_t = detail::__dev_attr<::cudaDevAttrDeferredMappingCudaArraySupported>; static constexpr deferred_mapping_cuda_array_supported_t deferred_mapping_cuda_array_supported{}; // true if the device supports IPC Events, false otherwise. 
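// memory_pool_supported_handle_types is a bitmask of ::cudaMemAllocationHandleType values, so
// individual handle kinds are tested with bitwise AND against the constants defined on the tag
// type above. An illustrative sketch mirroring the smoke test added later in this patch
// (assumes a valid device ordinal 0; the function name is hypothetical):
inline bool example_supports_posix_fd_export()
{
  auto supported = cuda::experimental::device_ref(0).attr(
    cuda::experimental::device::attrs::memory_pool_supported_handle_types);
  // Unscoped enum values promote to int, so a plain bitwise test works here.
  return (supported
          & cuda::experimental::device::attrs::memory_pool_supported_handle_types.posix_file_descriptor)
      != 0;
}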
- using ipc_event_support_t = __attr<::cudaDevAttrIpcEventSupport>; + using ipc_event_support_t = detail::__dev_attr<::cudaDevAttrIpcEventSupport>; static constexpr ipc_event_support_t ipc_event_support{}; #if CUDART_VERSION >= 12020 // NUMA configuration of a device: value is of type cudaDeviceNumaConfig enum - using numa_config_t = __attr<::cudaDevAttrNumaConfig>; + using numa_config_t = detail::__dev_attr<::cudaDevAttrNumaConfig>; static constexpr numa_config_t numa_config{}; // NUMA node ID of the GPU memory - using numa_id_t = __attr<::cudaDevAttrNumaId>; + using numa_id_t = detail::__dev_attr<::cudaDevAttrNumaId>; static constexpr numa_id_t numa_id{}; #endif // CUDART_VERSION >= 12020 diff --git a/cudax/include/cuda/experimental/__device/device.cuh b/cudax/include/cuda/experimental/__device/device.cuh index 35e0cfe2d4..5532e8f59b 100644 --- a/cudax/include/cuda/experimental/__device/device.cuh +++ b/cudax/include/cuda/experimental/__device/device.cuh @@ -52,6 +52,20 @@ struct __emplace_device class device : public device_ref { public: + struct attrs; + + //! @brief For a given attribute, returns the type of the attribute value. + //! + //! @par Example + //! @code + //! using threads_per_block_t = device::attr_result_t; + //! static_assert(std::is_same_v); + //! @endcode + //! + //! @sa device::attrs + template <::cudaDeviceAttr _Attr> + using attr_result_t = typename detail::__dev_attr<_Attr>::type; + #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document # if defined(_CCCL_COMPILER_MSVC) // When __EDG__ is defined, std::construct_at will not permit constructing @@ -91,6 +105,9 @@ private: mutable CUdevice __device{}; mutable ::std::once_flag __init_once; + // TODO: put a mutable thread-safe (or thread_local) cache of device + // properties here. + explicit constexpr device(int __id) noexcept : device_ref(__id) {} diff --git a/cudax/include/cuda/experimental/__device/device_ref.cuh b/cudax/include/cuda/experimental/__device/device_ref.cuh index 7f2635611f..91e4e90caa 100644 --- a/cudax/include/cuda/experimental/__device/device_ref.cuh +++ b/cudax/include/cuda/experimental/__device/device_ref.cuh @@ -27,6 +27,12 @@ namespace cuda::experimental { class device; +namespace detail +{ +template <::cudaDeviceAttr _Attr> +struct __dev_attr; +} // namespace detail + //! @brief A non-owning representation of a CUDA device class device_ref { @@ -34,37 +40,7 @@ class device_ref int __id_ = 0; - template <::cudaDeviceAttr _Attr> - struct __attr - { - using type = int; - - _CCCL_NODISCARD constexpr operator ::cudaDeviceAttr() const noexcept - { - return _Attr; - } - - _CCCL_NODISCARD type operator()(device_ref __dev) const - { - return __dev.attr<_Attr>(); - } - }; - public: - struct attrs; - - //! @brief For a given attribute, returns the type of the attribute value. - //! - //! @par Example - //! @code - //! using threads_per_block_t = device_ref::attr_result_t; - //! static_assert(std::is_same_v); - //! @endcode - //! - //! @sa device_ref::attrs - template <::cudaDeviceAttr _Attr> - using attr_result_t = typename __attr<_Attr>::type; - //! @brief Create a `device_ref` object from a native device ordinal. /*implicit*/ constexpr device_ref(int __id) noexcept : __id_(__id) @@ -78,27 +54,27 @@ public: return __id_; } - //! @brief Retrieve the specified attribute for the `device_ref` + //! @brief Retrieve the specified attribute for the device //! - //! @param __attr The attribute to query. See `device_ref::attrs` for the available + //! @param __attr The attribute to query. 
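// attr_result_t maps an attribute enumerator to the C++ type returned by the query, which is
// handy for declaring variables or writing static_asserts up front. A sketch restating the
// documentation example above with its template arguments spelled out (assumes C++17,
// <type_traits>, and the cudax device headers):
using threads_per_block_t = cuda::experimental::device::attr_result_t<::cudaDevAttrMaxThreadsPerBlock>;
static_assert(std::is_same_v<threads_per_block_t, int>, "plain integral attribute");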
See `device::attrs` for the available //! attributes. //! //! @throws cuda_error if the attribute query fails //! - //! @sa device_ref::attrs + //! @sa device::attrs template <::cudaDeviceAttr _Attr> - _CCCL_NODISCARD auto attr([[maybe_unused]] device_ref::__attr<_Attr> __attr) const + _CCCL_NODISCARD auto attr([[maybe_unused]] detail::__dev_attr<_Attr> __attr) const { int __value = 0; _CCCL_TRY_CUDA_API(::cudaDeviceGetAttribute, "failed to get device attribute", &__value, _Attr, get()); - return static_cast::type>(__value); + return static_cast::type>(__value); } //! @overload template <::cudaDeviceAttr _Attr> _CCCL_NODISCARD auto attr() const { - return attr(__attr<_Attr>()); + return attr(detail::__dev_attr<_Attr>()); } }; diff --git a/cudax/test/device/device_smoke.cu b/cudax/test/device/device_smoke.cu index 6f772de08a..b98d05fc3b 100644 --- a/cudax/test/device/device_smoke.cu +++ b/cudax/test/device/device_smoke.cu @@ -177,7 +177,7 @@ TEST_CASE("Smoke", "[device]") ::cudaGPUDirectRDMAWritesOrdering>(); ::test_device_attribute(); + ::cudaMemAllocationHandleType>(); ::test_device_attribute(); @@ -225,6 +225,30 @@ TEST_CASE("Smoke", "[device]") ordering == device::attrs::gpu_direct_rdma_writes_ordering.all_devices)); } + SECTION("memory_pool_supported_handle_types") + { + STATIC_REQUIRE(::cudaMemHandleTypeNone == device::attrs::memory_pool_supported_handle_types.none); + STATIC_REQUIRE(::cudaMemHandleTypePosixFileDescriptor + == device::attrs::memory_pool_supported_handle_types.posix_file_descriptor); + STATIC_REQUIRE(::cudaMemHandleTypeWin32 == device::attrs::memory_pool_supported_handle_types.win32); + STATIC_REQUIRE(::cudaMemHandleTypeWin32Kmt == device::attrs::memory_pool_supported_handle_types.win32_kmt); +#if CUDART_VERSION >= 12040 + STATIC_REQUIRE(::cudaMemHandleTypeFabric == 0x8); + STATIC_REQUIRE(::cudaMemHandleTypeFabric == device::attrs::memory_pool_supported_handle_types.fabric); +#else + STATIC_REQUIRE(0x8 == device::attrs::memory_pool_supported_handle_types.fabric); +#endif + + constexpr int all_handle_types = + device::attrs::memory_pool_supported_handle_types.none + | device::attrs::memory_pool_supported_handle_types.posix_file_descriptor + | device::attrs::memory_pool_supported_handle_types.win32 + | device::attrs::memory_pool_supported_handle_types.win32_kmt + | device::attrs::memory_pool_supported_handle_types.fabric; + auto handle_types = device_ref(0).attr(device::attrs::memory_pool_supported_handle_types); + CUDAX_REQUIRE(handle_types <= all_handle_types); + } + #if CUDART_VERSION >= 12020 SECTION("numa_config") { From b0e09d04b070447bc7c5e13ecd6c962b3c6773ca Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Tue, 6 Aug 2024 20:15:38 -0700 Subject: [PATCH 06/33] Integrate Python docs (#2196) * pass docs build options to repo.sh * Integrate Python docs * update CI * Apply suggestions from code review Co-authored-by: Georgii Evtushenko --------- Co-authored-by: Georgii Evtushenko --- .github/actions/docs-build/action.yml | 2 + docs/cpp.rst | 52 +++++++++++++++++++++ docs/{pycuda => cuda_cooperative}/index.rst | 4 +- docs/gen_docs.bash | 7 +-- docs/index.rst | 27 ++++------- docs/python.rst | 15 ++++++ docs/repo.toml | 10 ++-- 7 files changed, 88 insertions(+), 29 deletions(-) create mode 100644 docs/cpp.rst rename docs/{pycuda => cuda_cooperative}/index.rst (89%) create mode 100644 docs/python.rst diff --git a/.github/actions/docs-build/action.yml b/.github/actions/docs-build/action.yml index 78af3d04a7..8b997f4741 100644 --- 
a/.github/actions/docs-build/action.yml +++ b/.github/actions/docs-build/action.yml @@ -36,6 +36,8 @@ runs: cp -rf ./docs/_build/docs/thrust/latest/* _site/thrust mkdir _site/cudax cp -rf ./docs/_build/docs/cudax/latest/* _site/cudax + mkdir _site/cuda_cooperative + cp -rf ./docs/_build/docs/cuda_cooperative/latest/* _site/cuda_cooperative ./docs/scrape_docs.bash ./_site # Update docs as workflow artifact: diff --git a/docs/cpp.rst b/docs/cpp.rst new file mode 100644 index 0000000000..453ab1e2f7 --- /dev/null +++ b/docs/cpp.rst @@ -0,0 +1,52 @@ +.. _cccl-cpp-libraries: + +CUDA C++ Core Libraries +======================= + +.. toctree:: + :hidden: + :maxdepth: 3 + + libcu++ + CUB + Thrust + Cuda Experimental + +Welcome to the CUDA Core Compute Libraries (CCCL) libraries for C++. + +The concept for the CCCL C++ libraries grew organically out of the Thrust, +CUB, and libcudacxx projects that were developed independently over the years +with a similar goal: to provide high-quality, high-performance, and +easy-to-use C++ abstractions for CUDA developers. Naturally, there was a lot +of overlap among the three projects, and it became clear the community would +be better served by unifying them into a single repository. + +- `libcu++ `__ + is the CUDA C++ Standard Library. It provides an implementation of the C++ + Standard Library that works in both host and device code. Additionally, it + provides abstractions for CUDA-specific hardware features like + synchronization primitives, cache control, atomics, and more. + +- `CUB `__ + is a lower-level, CUDA-specific library designed for speed-of-light parallel + algorithms across all GPU architectures. In addition to device-wide + algorithms, it provides *cooperative algorithms* like block-wide reduction + and warp-wide scan, providing CUDA kernel developers with building blocks to + create speed-of-light, custom kernels. + +- `Thrust `__ + is the C++ parallel algorithms library which inspired the introduction of + parallel algorithms to the C++ Standard Library. Thrust's high-level + interface greatly enhances programmer productivity while enabling performance + portability between GPUs and multicore CPUs via configurable backends that + allow using multiple parallel programming frameworks (such as CUDA, TBB, and + OpenMP). + +- `Cuda Experimental `__ + is a library of experimental features that are still in the design process. + +The main goal of the CCCL C++ libraries is to fill a similar role that the +Standard C++ Library fills for Standard C++: provide general-purpose, +speed-of-light tools to CUDA C++ developers, allowing them to focus on +solving the problems that matter. Unifying these projects is the first step +towards realizing that goal. diff --git a/docs/pycuda/index.rst b/docs/cuda_cooperative/index.rst similarity index 89% rename from docs/pycuda/index.rst rename to docs/cuda_cooperative/index.rst index c9c0e227b8..0eb41f3c3d 100644 --- a/docs/pycuda/index.rst +++ b/docs/cuda_cooperative/index.rst @@ -1,6 +1,6 @@ -.. _pycudax-module: +.. _cuda_cooperative-module: -CUDA +CUDA Cooperative ================================================== ..
warning:: diff --git a/docs/gen_docs.bash b/docs/gen_docs.bash index d879463741..bd2ddae6f8 100755 --- a/docs/gen_docs.bash +++ b/docs/gen_docs.bash @@ -1,7 +1,8 @@ #!/usr/bin/env bash -## This script just wraps launching a docs build within a container -## Tag is passed on as the first argument ${1} +# This script just wraps launching a repo docs build within a container +# +# Additional options, e.g --stage sphinx will be passed on to repo.sh set -e @@ -36,7 +37,7 @@ if [ ! -n "$(find img -name '*.png')" ]; then done fi -if ! ./repo.sh docs; then +if ! ./repo.sh docs "$@"; then echo "!!! There were errors while generating" exit 1 fi diff --git a/docs/index.rst b/docs/index.rst index a639b68b2b..1862ecb563 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,27 +1,16 @@ -CUDA C++ Core Libraries -======================= +CUDA Core Compute Libraries +=========================== .. toctree:: :hidden: :maxdepth: 3 - libcu++ - CUB - Thrust - Cuda Experimental + cpp + python -Welcome to the CUDA C++ Core Libraries (CCCL) where our mission is to make CUDA C++ more delightful. +Welcome to the CUDA Core Compute Libraries (CCCL) where our mission is to +make CUDA C++ and Python more delightful. -The concept for the CUDA C++ Core Libraries (CCCL) grew organically out of the Thrust, CUB, and libcudacxx projects that were developed independently over the years with a similar goal: to provide high-quality, high-performance, and easy-to-use C++ abstractions for CUDA developers. -Naturally, there was a lot of overlap among the three projects, and it became clear the community would be better served by unifying them into a single repository. +- :ref:`cccl-cpp-libraries` -- `libcu++ `__ is the CUDA C++ Standard Library. It provides an implementation of the C++ Standard Library that works in both host and device code. Additionally, it provides abstractions for CUDA-specific hardware features like synchronization primitives, cache control, atomics, and more. - -- `CUB `__ is a lower-level, CUDA-specific library designed for speed-of-light parallel algorithms across all GPU architectures. In addition to device-wide algorithms, it provides *cooperative algorithms* like block-wide reduction and warp-wide scan, providing CUDA kernel developers with building blocks to create speed-of-light, custom kernels. - -- `Thrust `__ is the C++ parallel algorithms library which inspired the introduction of parallel algorithms to the C++ Standard Library. Thrust's high-level interface greatly enhances programmer productivity while enabling performance portability between GPUs and multicore CPUs via configurable backends that allow using multiple parallel programming frameworks (such as CUDA, TBB, and OpenMP). - -- `Cuda Experimental `__ is a library of exerimental features that are still in the design process. - -The main goal of CCCL is to fill a similar role that the Standard C++ Library fills for Standard C++: provide general-purpose, speed-of-light tools to CUDA C++ developers, allowing them to focus on solving the problems that matter. -Unifying these projects is the first step towards realizing that goal. +- :ref:`cccl-python-libraries` diff --git a/docs/python.rst b/docs/python.rst new file mode 100644 index 0000000000..b0b9c5b73f --- /dev/null +++ b/docs/python.rst @@ -0,0 +1,15 @@ +.. _cccl-python-libraries: + +CUDA Python Core Libraries +========================== + +.. toctree:: + :hidden: + :maxdepth: 3 + + cuda.cooperative + +Welcome to the CUDA Core Compute Libraries (CCCL) libraries for Python. 
+ +- `cuda.cooperative `__ + is a still-experimental library exposing cooperative algorithms to Python. diff --git a/docs/repo.toml b/docs/repo.toml index 74ebb0be7d..0741089ceb 100644 --- a/docs/repo.toml +++ b/docs/repo.toml @@ -25,14 +25,14 @@ sphinx_exclude_patterns = [ "VERSION.md", ] -project_build_order = [ "libcudacxx", "cudax", "cub", "thrust", "cccl", "pycuda" ] +project_build_order = [ "libcudacxx", "cudax", "cub", "thrust", "cccl", "cuda_cooperative" ] # deps can be used to link to other projects' documentation deps = [ [ "libcudacxx", "_build/docs/libcudacxx/latest" ], [ "cub", "_build/docs/cub/latest" ], [ "thrust", "_build/docs/thrust/latest" ], - [ "pycuda", "_build/docs/pycuda/latest" ], + [ "cuda_cooperative", "_build/docs/cuda_cooperative/latest" ], ] [repo_docs.projects.libcudacxx] @@ -281,9 +281,9 @@ doxygen_conf_extra = """ STRIP_FROM_PATH = ../../thrust """ -[repo_docs.projects.pycuda] -name = "pycuda" -docs_root = "pycuda" +[repo_docs.projects.cuda_cooperative] +name = "cuda.cooperative" +docs_root = "cuda_cooperative" logo = "../img/logo.png" repo_url = "https://github.com/NVIDIA/cccl/python/cuda" From 62336adbce4ee527a7bcd3b086b649822f8c95d4 Mon Sep 17 00:00:00 2001 From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com> Date: Wed, 7 Aug 2024 01:24:30 -0700 Subject: [PATCH 07/33] [FEA] Atomics codegen refactor (#1993) * Initial draft of new atomics backend * Change atomic fetch ops back to tag dispatch * Save wip * Add load/store and support for MMIO * Begin working on exch * Enable formatting exchange * Several signed-ness fixes * Make atomics ptx tests build. Lit tests are a WIP. * Fix load/store, some volatileness, and min/max * Formatting and enabled codegen in all builds * Make integral.pass.cpp pass * Make the rest of the atomics tests pass * Use 128b ld/st instead of vector load as it is not atomic across the whole atom * Fix copy-paste mistake in load/store * Whitespace fixup * Fix 128b .exch using .cas operands * Make codegen link fmt as PRIVATE Co-authored-by: Allison Piper * Simplify MMIO down to a static array. Co-authored-by: Bernhard Manfred Gruber * Static -> Inline for codegen functions. Replace endl with '\n'. * Supply the output stream directly to `fmt::format` * Update fmtlib. * Revert `fmt::format(out...)` changes. They don't work on MSVC. 
* Fixup libcudacxx codegen CMake stuff * Remove sneaky cstdef include that was auto-added * [pre-commit.ci] auto code formatting --------- Co-authored-by: Allison Piper Co-authored-by: Bernhard Manfred Gruber Co-authored-by: Michael Schellenberger Costa Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- CMakePresets.json | 4 +- libcudacxx/CMakeLists.txt | 2 +- libcudacxx/codegen/CMakeLists.txt | 28 +- libcudacxx/codegen/codegen.cpp | 527 +- .../codegen/generators/compare_and_swap.h | 175 + libcudacxx/codegen/generators/definitions.h | 193 + libcudacxx/codegen/generators/exchange.h | 173 + libcudacxx/codegen/generators/fence.h | 110 + libcudacxx/codegen/generators/fetch_ops.h | 217 + libcudacxx/codegen/generators/header.h | 80 + libcudacxx/codegen/generators/ld_st.h | 353 + .../cuda/std/__atomic/functions/common.h | 54 + .../std/__atomic/functions/cuda_ptx_derived.h | 514 +- .../__atomic/functions/cuda_ptx_generated.h | 10729 ++++++---------- .../functions/cuda_ptx_generated_helper.h | 155 + .../cuda/std/__atomic/functions/host.h | 25 +- libcudacxx/include/cuda/std/__atomic/scopes.h | 2 + libcudacxx/test/atomic_codegen/CMakeLists.txt | 8 +- .../atomic_codegen/atomic_add_non_volatile.cu | 6 +- .../atomic_store_non_volatile.cu | 2 +- .../atomic_codegen/atomic_sub_non_volatile.cu | 6 +- 21 files changed, 5876 insertions(+), 7487 deletions(-) create mode 100644 libcudacxx/codegen/generators/compare_and_swap.h create mode 100644 libcudacxx/codegen/generators/definitions.h create mode 100644 libcudacxx/codegen/generators/exchange.h create mode 100644 libcudacxx/codegen/generators/fence.h create mode 100644 libcudacxx/codegen/generators/fetch_ops.h create mode 100644 libcudacxx/codegen/generators/header.h create mode 100644 libcudacxx/codegen/generators/ld_st.h create mode 100644 libcudacxx/include/cuda/std/__atomic/functions/common.h create mode 100644 libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h diff --git a/CMakePresets.json b/CMakePresets.json index 004d57b2ba..61cb88eca8 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -478,7 +478,7 @@ ], "filter": { "exclude": { - "name": "^libcudacxx\\.test\\.lit$" + "name": "^libcudacxx\\.test\\.(lit|atomics\\.codegen\\.diff)$" } } }, @@ -487,7 +487,7 @@ "configurePreset": "libcudacxx-codegen", "filter": { "include": { - "name": "^libcudacxx\\.atomics\\.codegen.*$" + "name": "^libcudacxx\\.test\\.atomics\\.codegen.*$" } } }, diff --git a/libcudacxx/CMakeLists.txt b/libcudacxx/CMakeLists.txt index 989fd642f0..7883110573 100644 --- a/libcudacxx/CMakeLists.txt +++ b/libcudacxx/CMakeLists.txt @@ -33,7 +33,7 @@ include(CTest) enable_testing() # Add codegen module -option(libcudacxx_ENABLE_CODEGEN "Enable ctest-based testing." OFF) +option(libcudacxx_ENABLE_CODEGEN "Enable libcudacxx's atomics backend codegen and tests." 
OFF) if (libcudacxx_ENABLE_CODEGEN) add_subdirectory(codegen) endif() diff --git a/libcudacxx/codegen/CMakeLists.txt b/libcudacxx/codegen/CMakeLists.txt index 3477f988af..05821a4eca 100644 --- a/libcudacxx/codegen/CMakeLists.txt +++ b/libcudacxx/codegen/CMakeLists.txt @@ -1,11 +1,11 @@ ## Codegen adds the following build targets # libcudacxx.atomics.codegen -# libcudacxx.atomics.codegen.execute # libcudacxx.atomics.codegen.install ## Test targets: -# libcudacxx.atomics.codegen.diff +# libcudacxx.test.atomics.codegen.diff -add_custom_target(libcudacxx.atomics.codegen) +include(${CMAKE_SOURCE_DIR}/cub/cmake/CPM.cmake) +CPMAddPackage("gh:fmtlib/fmt#11.0.1") add_executable( codegen @@ -13,32 +13,32 @@ add_executable( codegen.cpp ) -target_compile_features( - codegen PRIVATE cxx_std_14 -) +target_link_libraries(codegen PRIVATE fmt) -add_dependencies(libcudacxx.atomics.codegen codegen) +set_property(TARGET codegen PROPERTY CXX_STANDARD 17) set(atomic_generated_output "${libcudacxx_BINARY_DIR}/codegen/cuda_ptx_generated.h") set(atomic_install_location "${libcudacxx_SOURCE_DIR}/include/cuda/std/__atomic/functions") add_custom_target( - libcudacxx.atomics.codegen.execute - COMMAND codegen + libcudacxx.atomics.codegen + COMMAND codegen "${atomic_generated_output}" BYPRODUCTS "${atomic_generated_output}" ) -add_dependencies(libcudacxx.atomics.codegen libcudacxx.atomics.codegen.execute) - add_custom_target( libcudacxx.atomics.codegen.install COMMAND ${CMAKE_COMMAND} -E copy "${atomic_generated_output}" "${atomic_install_location}/cuda_ptx_generated.h" + DEPENDS libcudacxx.atomics.codegen BYPRODUCTS "${atomic_install_location}/cuda_ptx_generated.h" ) -add_dependencies(libcudacxx.atomics.codegen.install libcudacxx.atomics.codegen.execute) - add_test( - NAME libcudacxx.atomics.codegen.diff + NAME libcudacxx.test.atomics.codegen.diff COMMAND ${CMAKE_COMMAND} -E compare_files "${atomic_install_location}/cuda_ptx_generated.h" "${atomic_generated_output}" ) + +set_tests_properties( + libcudacxx.test.atomics.codegen.diff + PROPERTIES REQUIRED_FILES "${atomic_generated_output}" +) diff --git a/libcudacxx/codegen/codegen.cpp b/libcudacxx/codegen/codegen.cpp index 0d4a7a7296..5a5e36454f 100644 --- a/libcudacxx/codegen/codegen.cpp +++ b/libcudacxx/codegen/codegen.cpp @@ -4,521 +4,42 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
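// For orientation, the removed implementation below emitted per-size/order/scope helpers of
// roughly the following shape into cuda_ptx_generated.h. This is a reconstruction from its
// format strings for one fence/load combination (block scope, 32-bit, acquire), not the
// verbatim output of either the old or the refactored generator:
static inline _CCCL_DEVICE void __cuda_membar_block()
{
  asm volatile("membar.cta;" ::: "memory");
}
static inline _CCCL_DEVICE void __cuda_fence_sc_block()
{
  asm volatile("fence.sc.cta;" ::: "memory");
}
template <class _CUDA_A, class _CUDA_B>
static inline _CCCL_DEVICE void __cuda_load_acquire_32_block(_CUDA_A __ptr, _CUDA_B& __dst)
{
  asm volatile("ld.acquire.cta.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory");
}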
// //===----------------------------------------------------------------------===// #include -#include -#include -#include +#include +#include + +#include "generators/compare_and_swap.h" +#include "generators/exchange.h" +#include "generators/fence.h" +#include "generators/fetch_ops.h" +#include "generators/header.h" +#include "generators/ld_st.h" using namespace std::string_literals; -int main() +int main(int argc, char** argv) { - std::map scopes{{"system", ".sys"}, {"device", ".gpu"}, {"block", ".cta"}}; - - std::map membar_scopes{{"system", ".sys"}, {"device", ".gl"}, {"block", ".cta"}}; - - std::map fence_semantics{{"sc", ".sc"}, {"acq_rel", ".acq_rel"}}; - - bool const ld_as_atom = false; - - std::vector ld_sizes{ - // 8, - // 16, - 32, - 64}; - std::map ld_semantics{ - {"relaxed", ".relaxed"}, {"acquire", ".acquire"}, {"volatile", ".volatile"}}; - - std::vector st_sizes{ - // 8, - // 16, - 32, - 64}; - std::map st_semantics{ - {"relaxed", ".relaxed"}, {"release", ".release"}, {"volatile", ".volatile"}}; - - std::vector rmw_sizes{32, 64}; - std::map rmw_semantics{ - {"relaxed", ".relaxed"}, - {"acquire", ".acquire"}, - {"release", ".release"}, - {"acq_rel", ".acq_rel"}, - {"volatile", ""}}; - std::vector rmw_classes{"bitwise", "arithmetic"}; - std::map> rmw_operations{ - {"bitwise", std::map{{"fetch_and", ".and"}, {"fetch_or", ".or"}, {"fetch_xor", ".xor"}}}, - {"arithmetic", - std::map{ - {"exchange", ".exch"}, - {"compare_exchange", ".cas"}, - {"fetch_add", ".add"}, - {"fetch_sub", ".add"}, - {"fetch_max", ".max"}, - {"fetch_min", ".min"}}}}; - std::map> rmw_types{ - {"bitwise", std::map{{"", ".b"}}}, - {"arithmetic", std::map{{"u", ".u"}, {"s", ".s"}, {"f", ".f"}}}}; - - std::vector cv_qualifier{"volatile ", ""}; + std::fstream filestream; - std::ofstream out("cuda_ptx_generated.h"); - - out << R"XXX(//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -// This is an autogenerated file, we want to ensure that it contains exactly the contents we want to generate -// clang-format off - -#ifndef _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H -#define _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include -#include - -#include -#include -#include - -#include -#include - -_LIBCUDACXX_BEGIN_NAMESPACE_STD - -#if defined(_CCCL_CUDA_COMPILER) - -)XXX"; - - auto scopenametag = [&](auto scope) { - return "__thread_scope_" + scope + "_tag"; - }; - auto fencename = [&](auto sem, auto scope) { - return "__cuda_fence_" + sem + "_" + scope; - }; - auto registers = [&](auto type_literal, auto type_size) { - if (type_literal == "f") - { - return (type_size == 32) ? "f" : "d"; - } - else - { - return (type_size == 32) ? 
"r" : "l"; - } - }; - - for (auto& s : scopes) + if (argc == 2) { - out << "static inline _CCCL_DEVICE void __cuda_membar_" << s.first << "() { asm volatile(\"membar" - << membar_scopes[s.first] << ";\":::\"memory\"); }\n"; - for (auto& sem : fence_semantics) - { - out << "static inline _CCCL_DEVICE void " << fencename(sem.first, s.first) << "() { asm volatile(\"fence" - << sem.second << s.second << ";\":::\"memory\"); }\n"; - } - out << "static inline _CCCL_DEVICE void __atomic_thread_fence_cuda(int __memorder, " << scopenametag(s.first) - << ") {\n"; - out << " NV_DISPATCH_TARGET(\n"; - out << " NV_PROVIDES_SM_70, (\n"; - out << " switch (__memorder) {\n"; - out << " case __ATOMIC_SEQ_CST: " << fencename("sc"s, s.first) << "(); break;\n"; - out << " case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQUIRE: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQ_REL: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_RELEASE: " << fencename("acq_rel"s, s.first) << "(); break;\n"; - out << " case __ATOMIC_RELAXED: break;\n"; - out << " default: assert(0);\n"; - out << " }\n"; - out << " ),\n"; - out << " NV_IS_DEVICE, (\n"; - out << " switch (__memorder) {\n"; - out << " case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQUIRE: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQ_REL: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_RELEASE: __cuda_membar_" << s.first << "(); break;\n"; - out << " case __ATOMIC_RELAXED: break;\n"; - out << " default: assert(0);\n"; - out << " }\n"; - out << " )\n"; - out << " )\n"; - out << "}\n"; - for (auto& sz : ld_sizes) - { - for (auto& sem : ld_semantics) - { - out << "template "; - out << "static inline _CCCL_DEVICE void __cuda_load_" << sem.first << "_" << sz << "_" << s.first - << "(_CUDA_A __ptr, _CUDA_B& __dst) {"; - if (ld_as_atom) - { - out << "asm volatile(\"atom.add" << (sem.first == "volatile" ? "" : sem.second.c_str()) << s.second << ".u" - << sz << " %0, [%1], 0;\" : "; - } - else - { - out << "asm volatile(\"ld" << sem.second << (sem.first == "volatile" ? 
"" : s.second.c_str()) << ".b" << sz - << " %0,[%1];\" : "; - } - out << "\"=" << registers("b", sz) << "\"(__dst) : \"l\"(__ptr)"; - out << " : \"memory\"); }\n"; - } - for (auto& cv : cv_qualifier) - { - out << "template = 0>\n"; - out << "_CCCL_DEVICE void __atomic_load_cuda(const " << cv << "_Type *__ptr, _Type *__ret, int __memorder, " - << scopenametag(s.first) << ") {\n"; - out << " uint" << sz << "_t __tmp = 0;\n"; - out << " NV_DISPATCH_TARGET(\n"; - out << " NV_PROVIDES_SM_70, (\n"; - out << " switch (__memorder) {\n"; - out << " case __ATOMIC_SEQ_CST: " << fencename("sc"s, s.first) << "(); _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQUIRE: __cuda_load_acquire_" << sz << "_" << s.first - << "(__ptr, __tmp); break;\n"; - out << " case __ATOMIC_RELAXED: __cuda_load_relaxed_" << sz << "_" << s.first - << "(__ptr, __tmp); break;\n"; - out << " default: assert(0);\n"; - out << " }\n"; - out << " ),\n"; - out << " NV_IS_DEVICE, (\n"; - out << " switch (__memorder) {\n"; - out << " case __ATOMIC_SEQ_CST: __cuda_membar_" << s.first << "(); _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQUIRE: __cuda_load_volatile_" << sz << "_" << s.first - << "(__ptr, __tmp); __cuda_membar_" << s.first << "(); break;\n"; - out << " case __ATOMIC_RELAXED: __cuda_load_volatile_" << sz << "_" << s.first - << "(__ptr, __tmp); break;\n"; - out << " default: assert(0);\n"; - out << " }\n"; - out << " )\n"; - out << " )\n"; - out << " memcpy(__ret, &__tmp, " << sz / 8 << ");\n"; - out << "}\n"; - } - } - for (auto& sz : st_sizes) - { - for (auto& sem : st_semantics) - { - out << "template "; - out << "static inline _CCCL_DEVICE void __cuda_store_" << sem.first << "_" << sz << "_" << s.first - << "(_CUDA_A __ptr, _CUDA_B __src) { "; - out << "asm volatile(\"st" << sem.second << (sem.first == "volatile" ? 
"" : s.second.c_str()) << ".b" << sz - << " [%0], %1;\" :: "; - out << "\"l\"(__ptr),\"" << registers("b", sz) << "\"(__src)"; - out << " : \"memory\"); }\n"; - } - for (auto& cv : cv_qualifier) - { - out << "template = 0>\n"; - out << "_CCCL_DEVICE void __atomic_store_cuda(" << cv << "_Type *__ptr, _Type *__val, int __memorder, " - << scopenametag(s.first) << ") {\n"; - out << " uint" << sz << "_t __tmp = 0;\n"; - out << " memcpy(&__tmp, __val, " << sz / 8 << ");\n"; - out << " NV_DISPATCH_TARGET(\n"; - out << " NV_PROVIDES_SM_70, (\n"; - out << " switch (__memorder) {\n"; - out << " case __ATOMIC_RELEASE: __cuda_store_release_" << sz << "_" << s.first - << "(__ptr, __tmp); break;\n"; - out << " case __ATOMIC_SEQ_CST: " << fencename("sc"s, s.first) << "(); _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_RELAXED: __cuda_store_relaxed_" << sz << "_" << s.first - << "(__ptr, __tmp); break;\n"; - out << " default: assert(0);\n"; - out << " }\n"; - out << " ),\n"; - out << " NV_IS_DEVICE, (\n"; - out << " switch (__memorder) {\n"; - out << " case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_SEQ_CST: __cuda_membar_" << s.first << "(); _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_RELAXED: __cuda_store_volatile_" << sz << "_" << s.first - << "(__ptr, __tmp); break;\n"; - out << " default: assert(0);\n"; - out << " }\n"; - out << " )\n"; - out << " )\n"; - out << "}\n"; - } - } - for (auto& sz : rmw_sizes) - { - for (auto& cl : rmw_classes) - { - for (auto& rmw : rmw_operations[cl]) - { - for (auto& type : rmw_types[cl]) - { - // fetch_min/fetch_max for fp types are derived functions - if (type.first == "f" && (rmw.first == "fetch_max" || rmw.first == "fetch_min")) - { - continue; - } - if (type.first == "s" - && (rmw.first == "fetch_add" || rmw.first == "fetch_sub" || rmw.first == "compare_exchange" - || rmw.first == "exchange")) - { - continue; - } - for (auto& sem : rmw_semantics) - { - if (rmw.first == "compare_exchange") - { - out << "template "; - } - else - { - out << "template "; - } - out << "static inline _CCCL_DEVICE void __cuda_" << rmw.first << "_" << sem.first << "_" << type.first - << sz << "_" << s.first << "("; - if (rmw.first == "compare_exchange") - { - out << "_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op"; - } - else - { - out << "_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op"; - } - out << ") { "; - if (rmw.first == "fetch_sub") - { - out << "__op = -__op;" << std::endl; - } - if (rmw.first == "compare_exchange") - { - out << "asm volatile(\"atom" << rmw.second << sem.second << s.second << ".b" << sz << " "; - out << "%0,[%1],%2,%3"; - } - else if (rmw.first == "exchange") - { - out << "asm volatile(\"atom" << rmw.second << sem.second << s.second << ".b" << sz << " "; - out << "%0,[%1],%2"; - } - else - { - out << "asm volatile(\"atom" << rmw.second << sem.second << s.second << type.second << sz << " "; - out << "%0,[%1],%2"; - } - out << ";\" : "; - if (rmw.first == "compare_exchange") - { - out << "\"=" << registers(type.first, sz) << "\"(__dst) : \"l\"(__ptr),\"" << registers(type.first, sz) - << "\"(__cmp),\"" << registers(type.first, sz) << "\"(__op)"; - } - else - { - out << "\"=" << registers(type.first, sz) << "\"(__dst) : \"l\"(__ptr),\"" << registers(type.first, sz) - << "\"(__op)"; - } - out << " : \"memory\"); }\n"; - } - for (auto& cv : cv_qualifier) - { - out << "template = 0>\n"; - } - else if (rmw.first == "fetch_max" || rmw.first == "fetch_min") - { - if (type.first == "u") - { - out << " && _CCCL_TRAIT(is_integral, 
_Type) && _CCCL_TRAIT(is_unsigned, _Type), int> " - "= 0>\n"; - } - else if (type.first == "s") - { - out << " && _CCCL_TRAIT(is_integral, _Type) && _CCCL_TRAIT(is_signed, _Type), int> = " - "0>\n"; - } - } - else if (type.first == "u") - { - out << " && (_CCCL_TRAIT(is_integral, _Type) || _CCCL_TRAIT(is_pointer, _Type)), int> = 0>\n"; - } - else - { - out << ", int> = 0>\n"; - } - if (rmw.first == "compare_exchange") - { - out << "_CCCL_DEVICE bool __atomic_compare_exchange_cuda(" << cv - << "void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int " - "__failure_memorder, " - << scopenametag(s.first) << ") {\n"; - out << " auto __old = *__expected;\n"; - out << " NV_DISPATCH_TARGET(\n"; - out << " NV_PROVIDES_SM_70, (\n"; - out << " switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) {\n"; - out << " case __ATOMIC_SEQ_CST: " << fencename("sc"s, s.first) << "(); _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_" << type.first << sz << "_" - << s.first << "(__ptr, *__expected, __old, __desired); break;\n"; - out << " case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_" << type.first << sz << "_" - << s.first << "(__ptr, *__expected, __old, __desired); break;\n"; - out << " case __ATOMIC_RELEASE: __cuda_compare_exchange_release_" << type.first << sz << "_" - << s.first << "(__ptr, *__expected, __old, __desired); break;\n"; - out << " case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_" << type.first << sz << "_" - << s.first << "(__ptr, *__expected, __old, __desired); break;\n"; - out << " default: assert(0);\n"; - out << " }\n"; - out << " ),\n"; - out << " NV_IS_DEVICE, (\n"; - out << " switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) {\n"; - out << " case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQ_REL: __cuda_membar_" << s.first << "(); _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_" << type.first << sz << "_" - << s.first << "(__ptr, *__expected, __old, __desired); __cuda_membar_" << s.first << "(); break;\n"; - out << " case __ATOMIC_RELEASE: __cuda_membar_" << s.first - << "(); __cuda_compare_exchange_volatile_" << type.first << sz << "_" << s.first - << "(__ptr, *__expected, __old, __desired); break;\n"; - out << " case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_" << type.first << sz << "_" - << s.first << "(__ptr, *__expected, __old, __desired); break;\n"; - out << " default: assert(0);\n"; - out << " }\n"; - out << " )\n"; - out << " )\n"; - out << " return (__old == *__expected);\n"; - out << "}\n"; - } - else - { - if (rmw.first == "exchange") - { - out - << "_CCCL_DEVICE void __atomic_exchange_cuda(" << cv - << "void *__ptr, _Type *__val, _Type *__ret, int __memorder, " << scopenametag(s.first) << ") {\n"; - out << " _Type __tmp = *__val;\n"; - } - else - { - out << "_CCCL_DEVICE _Type __atomic_" << rmw.first << "_cuda(" << cv - << "_Type *__ptr, _Type __val, int __memorder, " << scopenametag(s.first) << ") {\n"; - out << " _Type __tmp = __val;\n"; - } - out << " NV_DISPATCH_TARGET(\n"; - out << " NV_PROVIDES_SM_70, (\n"; - out << " switch (__memorder) {\n"; - out << " case __ATOMIC_SEQ_CST: " << fencename("sc"s, s.first) << "(); _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQUIRE: __cuda_" << 
rmw.first << "_acquire_" << type.first << sz << "_" - << s.first << "(__ptr, __tmp, __tmp); break;\n"; - out << " case __ATOMIC_ACQ_REL: __cuda_" << rmw.first << "_acq_rel_" << type.first << sz << "_" - << s.first << "(__ptr, __tmp, __tmp); break;\n"; - out << " case __ATOMIC_RELEASE: __cuda_" << rmw.first << "_release_" << type.first << sz << "_" - << s.first << "(__ptr, __tmp, __tmp); break;\n"; - out << " case __ATOMIC_RELAXED: __cuda_" << rmw.first << "_relaxed_" << type.first << sz << "_" - << s.first << "(__ptr, __tmp, __tmp); break;\n"; - out << " default: assert(0);\n"; - out << " }\n"; - out << " ),\n"; - out << " NV_IS_DEVICE, (\n"; - out << " switch (__memorder) {\n"; - out << " case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQ_REL: __cuda_membar_" << s.first << "(); _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQUIRE: __cuda_" << rmw.first << "_volatile_" << type.first << sz - << "_" << s.first << "(__ptr, __tmp, __tmp); __cuda_membar_" << s.first << "(); break;\n"; - out << " case __ATOMIC_RELEASE: __cuda_membar_" << s.first << "(); __cuda_" << rmw.first - << "_volatile_" << type.first << sz << "_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; - out << " case __ATOMIC_RELAXED: __cuda_" << rmw.first << "_volatile_" << type.first << sz - << "_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; - out << " default: assert(0);\n"; - out << " }\n"; - out << " )\n"; - out << " )\n"; - if (rmw.first == "exchange") - { - out << " memcpy(__ret, &__tmp, " << sz / 8 << ");\n"; - } - else - { - out << " return __tmp;\n"; - } - out << "}\n"; - } - } - } - } - } - } - for (auto& cv : cv_qualifier) - { - std::vector addsub{"add", "sub"}; - for (auto& op : addsub) - { - out << "template\n"; - out << "_CCCL_DEVICE _Type* __atomic_fetch_" << op << "_cuda(_Type *" << cv - << "*__ptr, ptrdiff_t __val, int __memorder, " << scopenametag(s.first) << ") {\n"; - out << " _Type* __ret;\n"; - out << " uint64_t __tmp = 0;\n"; - out << " memcpy(&__tmp, &__val, 8);\n"; - if (op == "sub") - { - out << " __tmp = -__tmp;\n"; - } - out << " __tmp *= sizeof(_Type);\n"; - out << " NV_DISPATCH_TARGET(\n"; - out << " NV_PROVIDES_SM_70, (\n"; - out << " switch (__memorder) {\n"; - out << " case __ATOMIC_SEQ_CST: " << fencename("sc"s, s.first) << "(); _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_" << s.first - << "(__ptr, __tmp, __tmp); break;\n"; - out << " case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_" << s.first - << "(__ptr, __tmp, __tmp); break;\n"; - out << " case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_" << s.first - << "(__ptr, __tmp, __tmp); break;\n"; - out << " case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_" << s.first - << "(__ptr, __tmp, __tmp); break;\n"; - out << " }\n"; - out << " ),\n"; - out << " NV_IS_DEVICE, (\n"; - out << " switch (__memorder) {\n"; - out << " case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQ_REL: __cuda_membar_" << s.first << "(); _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_" << s.first - << "(__ptr, __tmp, __tmp); __cuda_membar_" << s.first << "(); break;\n"; - out << " case __ATOMIC_RELEASE: __cuda_membar_" << s.first << "(); __cuda_fetch_add_volatile_u64_" - << s.first << "(__ptr, __tmp, __tmp); break;\n"; - out << " case 
__ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_" << s.first - << "(__ptr, __tmp, __tmp); break;\n"; - out << " default: assert(0);\n"; - out << " }\n"; - out << " )\n"; - out << " )\n"; - out << " memcpy(&__ret, &__tmp, 8);\n"; - out << " return __ret;\n"; - out << "}\n"; - } - } + filestream.open(argv[1], filestream.out); } - out << "\n#endif // defined(_CCCL_CUDA_COMPILER)\n"; - out << "\n_LIBCUDACXX_END_NAMESPACE_STD\n"; - out << "\n#endif // _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H\n"; - out << "\n// clang-format on\n"; + std::ostream& stream = filestream.is_open() ? filestream : std::cout; + + FormatHeader(stream); + FormatFence(stream); + FormatLoad(stream); + FormatStore(stream); + FormatCompareAndSwap(stream); + FormatExchange(stream); + FormatFetchOps(stream); + FormatTail(stream); return 0; } diff --git a/libcudacxx/codegen/generators/compare_and_swap.h b/libcudacxx/codegen/generators/compare_and_swap.h new file mode 100644 index 0000000000..5a970735c0 --- /dev/null +++ b/libcudacxx/codegen/generators/compare_and_swap.h @@ -0,0 +1,175 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef COMPARED_AND_SWAP_H +#define COMPARED_AND_SWAP_H + +#include + +#include "definitions.h" +#include + +inline void FormatCompareAndSwap(std::ostream& out) +{ + out << R"XXX( +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_swap_memory_order_dispatch(_Fn& __cuda_cas, int __success_memorder, int __failure_memorder, _Sco) { + bool __res = false; + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { + case __ATOMIC_SEQ_CST: __cuda_atomic_fence(_Sco{}, __atomic_cuda_seq_cst{}); _CCCL_FALLTHROUGH(); + case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQUIRE: __res = __cuda_cas(__atomic_cuda_acquire{}); break; + case __ATOMIC_ACQ_REL: __res = __cuda_cas(__atomic_cuda_acq_rel{}); break; + case __ATOMIC_RELEASE: __res = __cuda_cas(__atomic_cuda_release{}); break; + case __ATOMIC_RELAXED: __res = __cuda_cas(__atomic_cuda_relaxed{}); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { + case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQ_REL: __cuda_atomic_membar(_Sco{}); _CCCL_FALLTHROUGH(); + case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQUIRE: __res = __cuda_cas(__atomic_cuda_volatile{}); __cuda_atomic_membar(_Sco{}); break; + case __ATOMIC_RELEASE: __cuda_atomic_membar(_Sco{}); __res = __cuda_cas(__atomic_cuda_volatile{}); break; + case __ATOMIC_RELAXED: __res = __cuda_cas(__atomic_cuda_volatile{}); break; + default: assert(0); + } + ) + ) + return __res; +} +)XXX"; + + // Argument ID Reference + // 0 - Operand Type + // 1 - Operand Size + // 2 - Type Constraint + // 3 - Memory Order + // 4 - Memory Order function tag + // 5 - Scope Constraint + // 6 - Scope function tag + const std::string asm_intrinsic_format_128 = R"XXX( +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type 
__op, {4}, __atomic_cuda_operand_{0}{1}, {6})
+{{
+  asm volatile(R"YYY(
+.reg .b128 _d;
+.reg .b128 _v;
+mov.b128 _d, {{%3, %4}};
+mov.b128 _v, {{%5, %6}};
+atom.cas{3}{5}.b128 _d,[%2],_d,_v;
+mov.b128 {{%0, %1}}, _d;
+)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }})XXX";
+
+  const std::string asm_intrinsic_format = R"XXX(
+template <class _Type>
+static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
+  _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, {4}, __atomic_cuda_operand_{0}{1}, {6})
+{{ asm volatile("atom.cas{3}{5}.{0}{1} %0,[%1],%2,%3;" : "={2}"(__dst) : "l"(__ptr), "{2}"(__cmp), "{2}"(__op) : "memory"); return __dst == __cmp; }})XXX";
+
+  constexpr Operand supported_types[] = {
+    Operand::Bit,
+  };
+
+  constexpr size_t supported_sizes[] = {
+    16,
+    32,
+    64,
+    128,
+  };
+
+  constexpr Semantic supported_semantics[] = {
+    Semantic::Acquire,
+    Semantic::Relaxed,
+    Semantic::Release,
+    Semantic::Acq_Rel,
+    Semantic::Volatile,
+  };
+
+  constexpr Scope supported_scopes[] = {
+    Scope::CTA,
+    Scope::Cluster,
+    Scope::GPU,
+    Scope::System,
+  };
+
+  for (auto size : supported_sizes)
+  {
+    for (auto type : supported_types)
+    {
+      for (auto sem : supported_semantics)
+      {
+        for (auto sco : supported_scopes)
+        {
+          if (size == 2 && type != Operand::Bit)
+          {
+            continue;
+          }
+          if (size == 128 && type != Operand::Bit)
+          {
+            continue;
+          }
+          out << fmt::format(
+            (size == 128) ? asm_intrinsic_format_128 : asm_intrinsic_format,
+            operand(type),
+            size,
+            constraints(type, size),
+            semantic(sem),
+            semantic_tag(sem),
+            scope(sco),
+            scope_tag(sco));
+        }
+      }
+    }
+  }
+
+  out << "\n"
+      << R"XXX(
+template <class _Type, class _Tag, class _Sco>
+struct __cuda_atomic_bind_compare_exchange {
+  _Type* __ptr;
+  _Type* __exp;
+  _Type* __des;
+
+  template <class _Atomic_Memorder>
+  inline _CCCL_DEVICE bool operator()(_Atomic_Memorder) {
+    return __cuda_atomic_compare_exchange(__ptr, *__exp, *__exp, *__des, _Atomic_Memorder{}, _Tag{}, _Sco{});
+  }
+};
+template <class _Type, class _Sco>
+static inline _CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type* __ptr, _Type* __exp, _Type __des, bool, int __success_memorder, int __failure_memorder, _Sco)
+{
+  using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type;
+  using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag;
+  __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr);
+  __proxy_t* __exp_proxy = reinterpret_cast<__proxy_t*>(__exp);
+  __proxy_t* __des_proxy = reinterpret_cast<__proxy_t*>(&__des);
+  __cuda_atomic_bind_compare_exchange<__proxy_t, __proxy_tag, _Sco> __bound_compare_swap{__ptr_proxy, __exp_proxy, __des_proxy};
+  return __cuda_atomic_compare_swap_memory_order_dispatch(__bound_compare_swap, __success_memorder, __failure_memorder, _Sco{});
+}
+template <class _Type, class _Sco>
+static inline _CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type volatile* __ptr, _Type* __exp, _Type __des, bool, int __success_memorder, int __failure_memorder, _Sco)
+{
+  using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type;
+  using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag;
+  __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr));
+  __proxy_t* __exp_proxy = reinterpret_cast<__proxy_t*>(__exp);
+  __proxy_t* __des_proxy = reinterpret_cast<__proxy_t*>(&__des);
+  __cuda_atomic_bind_compare_exchange<__proxy_t, __proxy_tag, _Sco> __bound_compare_swap{__ptr_proxy, __exp_proxy, __des_proxy};
+  return
__cuda_atomic_compare_swap_memory_order_dispatch(__bound_compare_swap, __success_memorder, __failure_memorder, _Sco{}); +} +)XXX"; +} + +#endif // COMPARED_AND_SWAP_H diff --git a/libcudacxx/codegen/generators/definitions.h b/libcudacxx/codegen/generators/definitions.h new file mode 100644 index 0000000000..0944a7e78a --- /dev/null +++ b/libcudacxx/codegen/generators/definitions.h @@ -0,0 +1,193 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef DEFINITIONS_H +#define DEFINITIONS_H + +#include +#include +#include +#include + +#include + +enum class Mmio +{ + Disabled, + Enabled, +}; + +inline std::string mmio(Mmio m) +{ + static const char* mmio_map[]{ + "", + ".mmio", + }; + return mmio_map[std::underlying_type_t(m)]; +} + +inline std::string mmio_tag(Mmio m) +{ + static const char* mmio_map[]{ + "__atomic_cuda_mmio_disable", + "__atomic_cuda_mmio_enable", + }; + return mmio_map[std::underlying_type_t(m)]; +} + +enum class Operand +{ + Floating, + Unsigned, + Signed, + Bit, +}; + +inline std::string operand(Operand op) +{ + static std::map op_map = { + std::pair{Operand::Floating, "f"}, + std::pair{Operand::Unsigned, "u"}, + std::pair{Operand::Signed, "s"}, + std::pair{Operand::Bit, "b"}, + }; + return op_map[op]; +} + +inline std::string operand_proxy_type(Operand op, size_t sz) +{ + if (op == Operand::Floating) + { + if (sz == 32) + { + return {"float"}; + } + else + { + return {"double"}; + } + } + else if (op == Operand::Signed) + { + return fmt::format("int{}_t", sz); + } + // Binary and unsigned can be the same proxy_type + return fmt::format("uint{}_t", sz); +} + +inline std::string constraints(Operand op, size_t sz) +{ + static std::map constraint_map = { + std::pair{32, + std::map{ + std::pair{Operand::Bit, "r"}, + std::pair{Operand::Unsigned, "r"}, + std::pair{Operand::Signed, "r"}, + std::pair{Operand::Floating, "f"}, + }}, + std::pair{64, + std::map{ + std::pair{Operand::Bit, "l"}, + std::pair{Operand::Unsigned, "l"}, + std::pair{Operand::Signed, "l"}, + std::pair{Operand::Floating, "d"}, + }}, + std::pair{128, + std::map{ + std::pair{Operand::Bit, "l"}, + std::pair{Operand::Unsigned, "l"}, + std::pair{Operand::Signed, "l"}, + std::pair{Operand::Floating, "d"}, + }}, + }; + + if (sz == 16) + { + return {"h"}; + } + else + { + return constraint_map[sz][op]; + } +} + +enum class Semantic +{ + Relaxed, + Release, + Acquire, + Acq_Rel, + Seq_Cst, + Volatile, +}; + +inline std::string semantic(Semantic sem) +{ + static std::map sem_map = { + std::pair{Semantic::Relaxed, ".relaxed"}, + std::pair{Semantic::Release, ".release"}, + std::pair{Semantic::Acquire, ".acquire"}, + std::pair{Semantic::Acq_Rel, ".acq_rel"}, + std::pair{Semantic::Seq_Cst, ".sc"}, + std::pair{Semantic::Volatile, ""}, + }; + return sem_map[sem]; +} + +inline std::string semantic_tag(Semantic sem) +{ + static std::map sem_map = { + std::pair{Semantic::Relaxed, "__atomic_cuda_relaxed"}, + std::pair{Semantic::Release, "__atomic_cuda_release"}, + std::pair{Semantic::Acquire, "__atomic_cuda_acquire"}, + std::pair{Semantic::Acq_Rel, "__atomic_cuda_acq_rel"}, + 
std::pair{Semantic::Seq_Cst, "__atomic_cuda_seq_cst"}, + std::pair{Semantic::Volatile, "__atomic_cuda_volatile"}, + }; + return sem_map[sem]; +} + +enum class Scope +{ + Thread, + Warp, + CTA, + Cluster, + GPU, + System, +}; + +inline std::string scope(Scope sco) +{ + static std::map sco_map = { + std::pair{Scope::Thread, ""}, + std::pair{Scope::Warp, ""}, + std::pair{Scope::CTA, ".cta"}, + std::pair{Scope::Cluster, ".cluster"}, + std::pair{Scope::GPU, ".gpu"}, + std::pair{Scope::System, ".sys"}, + }; + return sco_map[sco]; +} + +inline std::string scope_tag(Scope sco) +{ + static std::map sco_map = { + std::pair{Scope::Thread, "__thread_scope_thread_tag"}, + std::pair{Scope::Warp, ""}, + std::pair{Scope::CTA, "__thread_scope_block_tag"}, + std::pair{Scope::Cluster, "__thread_scope_cluster_tag"}, + std::pair{Scope::GPU, "__thread_scope_device_tag"}, + std::pair{Scope::System, "__thread_scope_system_tag"}, + }; + return sco_map[sco]; +} + +#endif // DEFINITIONS_H diff --git a/libcudacxx/codegen/generators/exchange.h b/libcudacxx/codegen/generators/exchange.h new file mode 100644 index 0000000000..dcfe66f147 --- /dev/null +++ b/libcudacxx/codegen/generators/exchange.h @@ -0,0 +1,173 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef EXCHANGE_H +#define EXCHANGE_H + +#include + +#include "definitions.h" +#include + +inline void FormatExchange(std::ostream& out) +{ + out << R"XXX( +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange_memory_order_dispatch(_Fn& __cuda_exch, int __memorder, _Sco) { + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_atomic_fence(_Sco{}, __atomic_cuda_seq_cst{}); _CCCL_FALLTHROUGH(); + case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQUIRE: __cuda_exch(__atomic_cuda_acquire{}); break; + case __ATOMIC_ACQ_REL: __cuda_exch(__atomic_cuda_acq_rel{}); break; + case __ATOMIC_RELEASE: __cuda_exch(__atomic_cuda_release{}); break; + case __ATOMIC_RELAXED: __cuda_exch(__atomic_cuda_relaxed{}); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQ_REL: __cuda_atomic_membar(_Sco{}); _CCCL_FALLTHROUGH(); + case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQUIRE: __cuda_exch(__atomic_cuda_volatile{}); __cuda_atomic_membar(_Sco{}); break; + case __ATOMIC_RELEASE: __cuda_atomic_membar(_Sco{}); __cuda_exch(__atomic_cuda_volatile{}); break; + case __ATOMIC_RELAXED: __cuda_exch(__atomic_cuda_volatile{}); break; + default: assert(0); + } + ) + ) +} +)XXX"; + + // Argument ID Reference + // 0 - Operand Type + // 1 - Operand Size + // 2 - Type Constraint + // 3 - Memory Order + // 4 - Memory Order function tag + // 5 - Scope Constraint + // 6 - Scope function tag + const std::string asm_intrinsic_format_128 = R"XXX( +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, {4}, __atomic_cuda_operand_{0}{1}, {6}) +{{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {{%3, %4}}, _v; + 
atom.exch{3}{5}.b128 _d,[%2],_v; + mov.b128 _d, {{%0, %1}}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +}})XXX"; + + const std::string asm_intrinsic_format = R"XXX( +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, {4}, __atomic_cuda_operand_{0}{1}, {6}) +{{ asm volatile("atom.exch{3}{5}.{0}{1} %0,[%1],%2;" : "={2}"(__old) : "l"(__ptr), "{2}"(__new) : "memory"); }})XXX"; + + constexpr Operand supported_types[] = { + Operand::Bit, + }; + + constexpr size_t supported_sizes[] = { + 16, + 32, + 64, + 128, + }; + + constexpr Semantic supported_semantics[] = { + Semantic::Acquire, + Semantic::Relaxed, + Semantic::Release, + Semantic::Acq_Rel, + Semantic::Volatile, + }; + + constexpr Scope supported_scopes[] = { + Scope::CTA, + Scope::Cluster, + Scope::GPU, + Scope::System, + }; + + for (auto size : supported_sizes) + { + for (auto type : supported_types) + { + for (auto sem : supported_semantics) + { + for (auto sco : supported_scopes) + { + if (size == 2 && type != Operand::Bit) + { + continue; + } + if (size == 128 && type != Operand::Bit) + { + continue; + } + out << fmt::format( + (size == 128) ? asm_intrinsic_format_128 : asm_intrinsic_format, + operand(type), + size, + constraints(type, size), + semantic(sem), + semantic_tag(sem), + scope(sco), + scope_tag(sco)); + } + } + } + } + + out << "\n" + << R"XXX( +template +struct __cuda_atomic_bind_exchange { + _Type* __ptr; + _Type* __old; + _Type* __new; + + template + inline _CCCL_DEVICE void operator()(_Atomic_Memorder) { + __cuda_atomic_exchange(__ptr, *__old, *__new, _Atomic_Memorder{}, _Tag{}, _Sco{}); + } +}; +template +static inline _CCCL_DEVICE void __atomic_exchange_cuda(_Type* __ptr, _Type& __old, _Type __new, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __old_proxy = reinterpret_cast<__proxy_t*>(&__old); + __proxy_t* __new_proxy = reinterpret_cast<__proxy_t*>(&__new); + __cuda_atomic_bind_exchange<__proxy_t, __proxy_tag, _Sco> __bound_swap{__ptr_proxy, __old_proxy, __new_proxy}; + __cuda_atomic_exchange_memory_order_dispatch(__bound_swap, __memorder, _Sco{}); +} +template +static inline _CCCL_DEVICE void __atomic_exchange_cuda(_Type volatile* __ptr, _Type& __old, _Type __new, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __old_proxy = reinterpret_cast<__proxy_t*>(&__old); + __proxy_t* __new_proxy = reinterpret_cast<__proxy_t*>(&__new); + __cuda_atomic_bind_exchange<__proxy_t, __proxy_tag, _Sco> __bound_swap{__ptr_proxy, __old_proxy, __new_proxy}; + __cuda_atomic_exchange_memory_order_dispatch(__bound_swap, __memorder, _Sco{}); +} +)XXX"; +} + +#endif // EXCHANGE_H diff --git a/libcudacxx/codegen/generators/fence.h b/libcudacxx/codegen/generators/fence.h new file mode 100644 index 0000000000..073264b7e3 --- /dev/null +++ b/libcudacxx/codegen/generators/fence.h @@ -0,0 +1,110 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef FENCE_H +#define FENCE_H + +#include + +#include "definitions.h" +#include + +inline std::string membar_scope(Scope sco) +{ + static std::map scope_map{ + std::pair{Scope::GPU, ".gl"}, + std::pair{Scope::System, ".sys"}, + std::pair{Scope::CTA, ".cta"}, + }; + + return scope_map[sco]; +} + +inline void FormatFence(std::ostream& out) +{ + // Argument ID Reference + // 0 - Membar scope tag + // 1 - Membar scope + const std::string intrinsic_membar = R"XXX( +static inline _CCCL_DEVICE void __cuda_atomic_membar({0}) +{{ asm volatile("membar{1};" ::: "memory"); }})XXX"; + + const std::map membar_scopes{ + std::pair{Scope::GPU, ".gl"}, + std::pair{Scope::System, ".sys"}, + std::pair{Scope::CTA, ".cta"}, + }; + + for (const auto& sco : membar_scopes) + { + out << fmt::format(intrinsic_membar, scope_tag(sco.first), sco.second); + } + + // Argument ID Reference + // 0 - Fence scope tag + // 1 - Fence scope + // 2 - Fence order tag + // 3 - Fence order + const std::string intrinsic_fence = R"XXX( +static inline _CCCL_DEVICE void __cuda_atomic_fence({0}, {2}) +{{ asm volatile("fence{1}{3};" ::: "memory"); }})XXX"; + + const Scope fence_scopes[] = { + Scope::CTA, + Scope::Cluster, + Scope::GPU, + Scope::System, + }; + + const Semantic fence_semantics[] = { + Semantic::Acq_Rel, + Semantic::Seq_Cst, + }; + + for (const auto& sco : fence_scopes) + { + for (const auto& sem : fence_semantics) + { + out << fmt::format(intrinsic_fence, scope_tag(sco), scope(sco), semantic_tag(sem), semantic(sem)); + } + } + out << "\n" + << R"XXX( +template +static inline _CCCL_DEVICE void __atomic_thread_fence_cuda(int __memorder, _Sco) { + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_atomic_fence(_Sco{}, __atomic_cuda_seq_cst{}); break; + case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQUIRE: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQ_REL: _CCCL_FALLTHROUGH(); + case __ATOMIC_RELEASE: __cuda_atomic_fence(_Sco{}, __atomic_cuda_acq_rel{}); break; + case __ATOMIC_RELAXED: break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); + case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQUIRE: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQ_REL: _CCCL_FALLTHROUGH(); + case __ATOMIC_RELEASE: __cuda_atomic_membar(_Sco{}); break; + case __ATOMIC_RELAXED: break; + default: assert(0); + } + ) + ) +} +)XXX"; +} + +#endif // FENCE_H diff --git a/libcudacxx/codegen/generators/fetch_ops.h b/libcudacxx/codegen/generators/fetch_ops.h new file mode 100644 index 0000000000..8ce48b5e78 --- /dev/null +++ b/libcudacxx/codegen/generators/fetch_ops.h @@ -0,0 +1,217 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef FETCH_OPS_H +#define FETCH_OPS_H + +#include +#include + +#include "definitions.h" +#include + +inline std::string fetch_op_skip_v(std::string fetch_op) +{ + if (fetch_op == "add") + { + return "constexpr auto __skip_v = __atomic_ptr_skip_t<_Type>::__skip;"; + } + return "constexpr auto __skip_v = 1;"; +} + +inline void FormatFetchOps(std::ostream& out) +{ + const std::vector arithmetic_types = { + Operand::Floating, + Operand::Unsigned, + Operand::Signed, + }; + + const std::vector minmax_types = { + Operand::Unsigned, + Operand::Signed, + }; + + const std::vector bitwise_types = {Operand::Bit}; + + const std::map op_support_map{ + std::pair{std::string{"add"}, std::pair{arithmetic_types, std::string{"arithmetic"}}}, + std::pair{std::string{"min"}, std::pair{minmax_types, std::string{"minmax"}}}, + std::pair{std::string{"max"}, std::pair{minmax_types, std::string{"minmax"}}}, + std::pair{std::string{"or"}, std::pair{bitwise_types, std::string{"bitwise"}}}, + std::pair{std::string{"xor"}, std::pair{bitwise_types, std::string{"bitwise"}}}, + std::pair{std::string{"and"}, std::pair{bitwise_types, std::string{"bitwise"}}}, + }; + + // Memory order dispatcher + out << R"XXX( +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_memory_order_dispatch(_Fn& __cuda_fetch, int __memorder, _Sco) { + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_atomic_fence(_Sco{}, __atomic_cuda_seq_cst{}); _CCCL_FALLTHROUGH(); + case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQUIRE: __cuda_fetch(__atomic_cuda_acquire{}); break; + case __ATOMIC_ACQ_REL: __cuda_fetch(__atomic_cuda_acq_rel{}); break; + case __ATOMIC_RELEASE: __cuda_fetch(__atomic_cuda_release{}); break; + case __ATOMIC_RELAXED: __cuda_fetch(__atomic_cuda_relaxed{}); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQ_REL: __cuda_atomic_membar(_Sco{}); _CCCL_FALLTHROUGH(); + case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQUIRE: __cuda_fetch(__atomic_cuda_volatile{}); __cuda_atomic_membar(_Sco{}); break; + case __ATOMIC_RELEASE: __cuda_atomic_membar(_Sco{}); __cuda_fetch(__atomic_cuda_volatile{}); break; + case __ATOMIC_RELAXED: __cuda_fetch(__atomic_cuda_volatile{}); break; + default: assert(0); + } + ) + ) +} +)XXX"; + + // Argument ID Reference + // 0 - Atomic Operation + // 1 - Operand Type + // 2 - Operand Size + // 3 - Type Constraint + // 4 - Memory Order + // 5 - Memory Order function tag + // 6 - Scope Constraint + // 7 - Scope function tag + const std::string asm_intrinsic_format = R"XXX( +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_{0}( + _Type* __ptr, _Type& __dst, _Type __op, {5}, __atomic_cuda_operand_{1}{2}, {7}) +{{ asm volatile("atom.{0}{4}{6}.{1}{2} %0,[%1],%2;" : "={3}"(__dst) : "l"(__ptr), "{3}"(__op) : "memory"); }})XXX"; + + // 0 - Atomic Operation + // 1 - Operand type constraint + // 2 - Pointer op skip_v + const std::string fetch_bind_invoke = R"XXX( +template +struct __cuda_atomic_bind_fetch_{0} {{ + _Type* __ptr; + _Type* __dst; + _Type* __op; + + template + inline _CCCL_DEVICE void operator()(_Atomic_Memorder) {{ + __cuda_atomic_fetch_{0}(__ptr, *__dst, *__op, _Atomic_Memorder{{}}, _Tag{{}}, _Sco{{}}); + }} +}}; +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_{0}_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) 
+{{ + {2} + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_{1}<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_{1}<_Type>::__tag; + _Type __dst{{}}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_{0}<__proxy_t, __proxy_tag, _Sco> __bound_{0}{{__ptr_proxy, __dst_proxy, __op_proxy}}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_{0}, __memorder, _Sco{{}}); + return __dst; +}} +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_{0}_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{{ + {2} + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_{1}<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_{1}<_Type>::__tag; + _Type __dst{{}}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_{0}<__proxy_t, __proxy_tag, _Sco> __bound_{0}{{__ptr_proxy, __dst_proxy, __op_proxy}}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_{0}, __memorder, _Sco{{}}); + return __dst; +}} +)XXX"; + + constexpr size_t supported_sizes[] = { + 32, + 64, + }; + + constexpr Semantic supported_semantics[] = { + Semantic::Acquire, + Semantic::Relaxed, + Semantic::Release, + Semantic::Acq_Rel, + Semantic::Volatile, + }; + + constexpr Scope supported_scopes[] = { + Scope::CTA, + Scope::Cluster, + Scope::GPU, + Scope::System, + }; + + for (auto& op_kp : op_support_map) + { + const auto& op_name = op_kp.first; + const auto& op_type_kp = op_kp.second; + const auto& type_list = op_type_kp.first; + const auto& deduction = op_type_kp.second; + for (auto type : type_list) + { + for (auto size : supported_sizes) + { + const std::string proxy_type = operand_proxy_type(type, size); + for (auto sco : supported_scopes) + { + for (auto sem : supported_semantics) + { + // There is no atom.add.s64 + if (op_name == "add" && type == Operand::Signed && size == 64) + { + continue; + } + out << fmt::format( + asm_intrinsic_format, + /* 0 */ op_name, + /* 1 */ operand(type), + /* 2 */ size, + /* 3 */ constraints(type, size), + /* 4 */ semantic(sem), + /* 5 */ semantic_tag(sem), + /* 6 */ scope(sco), + /* 7 */ scope_tag(sco)); + } + } + } + } + out << "\n" << fmt::format(fetch_bind_invoke, op_name, deduction, fetch_op_skip_v(op_name)); + } + + out << R"XXX( +template +static inline _CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + return __atomic_fetch_add_cuda(__ptr, -__op, __memorder, _Sco{}); +} +template +static inline _CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{ + return __atomic_fetch_add_cuda(__ptr, -__op, __memorder, _Sco{}); +} +)XXX"; +} + +#endif // FETCH_OPS_H diff --git a/libcudacxx/codegen/generators/header.h b/libcudacxx/codegen/generators/header.h new file mode 100644 index 0000000000..39a848314b --- /dev/null +++ b/libcudacxx/codegen/generators/header.h @@ -0,0 +1,80 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef HEADER_H +#define HEADER_H + +#include + +inline void FormatHeader(std::ostream& out) +{ + std::string header = R"XXX( +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// This is an autogenerated file, we want to ensure that it contains exactly the contents we want to generate +// clang-format off + +#ifndef _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H +#define _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +#if defined(_CCCL_CUDA_COMPILER) +)XXX"; + + out << header; +} + +inline void FormatTail(std::ostream& out) +{ + std::string tail = R"XXX( +#endif // defined(_CCCL_CUDA_COMPILER) + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H + +// clang-format on +)XXX"; + + out << tail; +} + +#endif // HEADER_H diff --git a/libcudacxx/codegen/generators/ld_st.h b/libcudacxx/codegen/generators/ld_st.h new file mode 100644 index 0000000000..d4aec3da54 --- /dev/null +++ b/libcudacxx/codegen/generators/ld_st.h @@ -0,0 +1,353 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LD_ST_H
+#define LD_ST_H
+
+#include
+
+#include "definitions.h"
+#include
+
+inline std::string semantic_ld_st(Semantic sem)
+{
+  static std::map<Semantic, std::string> sem_map = {
+    std::pair{Semantic::Relaxed, ".relaxed"},
+    std::pair{Semantic::Release, ".release"},
+    std::pair{Semantic::Acquire, ".acquire"},
+    std::pair{Semantic::Volatile, ".volatile"},
+  };
+  return sem_map[sem];
+}
+
+inline std::string scope_ld_st(Semantic sem, Scope sco)
+{
+  if (sem == Semantic::Volatile)
+  {
+    return "";
+  }
+  return scope(sco);
+}
+
+inline void FormatLoad(std::ostream& out)
+{
+  out << R"XXX(
+template <class _Fn, class _Sco>
+static inline _CCCL_DEVICE void __cuda_atomic_load_memory_order_dispatch(_Fn &__cuda_load, int __memorder, _Sco) {
+  NV_DISPATCH_TARGET(
+    NV_PROVIDES_SM_70, (
+      switch (__memorder) {
+        case __ATOMIC_SEQ_CST: __cuda_atomic_fence(_Sco{}, __atomic_cuda_seq_cst{}); _CCCL_FALLTHROUGH();
+        case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
+        case __ATOMIC_ACQUIRE: __cuda_load(__atomic_cuda_acquire{}); break;
+        case __ATOMIC_RELAXED: __cuda_load(__atomic_cuda_relaxed{}); break;
+        default: assert(0);
+      }
+    ),
+    NV_IS_DEVICE, (
+      switch (__memorder) {
+        case __ATOMIC_SEQ_CST: __cuda_atomic_membar(_Sco{}); _CCCL_FALLTHROUGH();
+        case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
+        case __ATOMIC_ACQUIRE: __cuda_load(__atomic_cuda_volatile{}); __cuda_atomic_membar(_Sco{}); break;
+        case __ATOMIC_RELAXED: __cuda_load(__atomic_cuda_volatile{}); break;
+        default: assert(0);
+      }
+    )
+  )
+}
+)XXX";
+
+  // Argument ID Reference
+  // 0 - Operand Type
+  // 1 - Operand Size
+  // 2 - Constraint
+  // 3 - Memory order
+  // 4 - Memory order semantic
+  // 5 - Scope tag
+  // 6 - Scope semantic
+  // 7 - Mmio tag
+  // 8 - Mmio semantic
+  const std::string asm_intrinsic_format_128 = R"XXX(
+template <class _Type>
+static inline _CCCL_DEVICE void __cuda_atomic_load(
+  const _Type* __ptr, _Type& __dst, {3}, __atomic_cuda_operand_{0}{1}, {5}, {7})
+{{
+  asm volatile(R"YYY(
+  .reg .b128 _d;
+  ld{8}{4}{6}.b128 _d,[%2];
+  mov.b128 {{%0, %1}}, _d;
+)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
+}})XXX";
+  const std::string asm_intrinsic_format = R"XXX(
+template <class _Type>
+static inline _CCCL_DEVICE void __cuda_atomic_load(
+  const _Type* __ptr, _Type& __dst, {3}, __atomic_cuda_operand_{0}{1}, {5}, {7})
+{{ asm volatile("ld{8}{4}{6}.{0}{1} %0,[%1];" : "={2}"(__dst) : "l"(__ptr) : "memory"); }})XXX";
+
+  constexpr size_t supported_sizes[] = {
+    16,
+    32,
+    64,
+    128,
+  };
+
+  constexpr Operand supported_types[] = {
+    Operand::Bit,
+    Operand::Floating,
+    Operand::Unsigned,
+    Operand::Signed,
+  };
+
+  constexpr Semantic supported_semantics[] = {
+    Semantic::Acquire,
+    Semantic::Relaxed,
+    Semantic::Volatile,
+  };
+
+  constexpr Scope supported_scopes[] = {
+    Scope::CTA,
+    Scope::Cluster,
+    Scope::GPU,
+    Scope::System,
+  };
+
+  constexpr Mmio mmio_states[] = {
+    Mmio::Disabled,
+    Mmio::Enabled,
+  };
+
+  for (auto size : supported_sizes)
+  {
+    for (auto type : supported_types)
+    {
+      for (auto sem : supported_semantics)
+      {
+        for (auto sco : supported_scopes)
+        {
+          for (auto mm : mmio_states)
+          {
+            if (size == 16 && type == Operand::Floating)
+            {
+              continue;
+            }
+            if (size == 128 && type != Operand::Bit)
+            {
+              continue;
+            }
+            if ((mm == Mmio::Enabled) && ((sco != Scope::System) || (sem != Semantic::Relaxed)))
+            {
+              continue;
+            }
+            out << fmt::format(
+              (size == 128) ?
asm_intrinsic_format_128 : asm_intrinsic_format, + /* 0 */ operand(type), + /* 1 */ size, + /* 2 */ constraints(type, size), + /* 3 */ semantic_tag(sem), + /* 4 */ semantic_ld_st(sem), + /* 5 */ scope_tag(sco), + /* 6 */ scope_ld_st(sem, sco), + /* 7 */ mmio_tag(mm), + /* 8 */ mmio(mm)); + } + } + } + } + } + out << "\n" + << R"XXX( +template +struct __cuda_atomic_bind_load { + const _Type* __ptr; + _Type* __dst; + + template + inline _CCCL_DEVICE void operator()(_Atomic_Memorder) { + __cuda_atomic_load(__ptr, *__dst, _Atomic_Memorder{}, _Tag{}, _Sco{}, _Mmio{}); + } +}; +template +static inline _CCCL_DEVICE void __atomic_load_cuda(const _Type* __ptr, _Type& __dst, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + const __proxy_t* __ptr_proxy = reinterpret_cast(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __cuda_atomic_bind_load<__proxy_t, __proxy_tag, _Sco, __atomic_cuda_mmio_disable> __bound_load{__ptr_proxy, __dst_proxy}; + __cuda_atomic_load_memory_order_dispatch(__bound_load, __memorder, _Sco{}); +} +template +static inline _CCCL_DEVICE void __atomic_load_cuda(const _Type volatile* __ptr, _Type& __dst, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + const __proxy_t* __ptr_proxy = reinterpret_cast(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __cuda_atomic_bind_load<__proxy_t, __proxy_tag, _Sco, __atomic_cuda_mmio_disable> __bound_load{__ptr_proxy, __dst_proxy}; + __cuda_atomic_load_memory_order_dispatch(__bound_load, __memorder, _Sco{}); +} +)XXX"; +} + +inline void FormatStore(std::ostream& out) +{ + out << R"XXX( +template +static inline _CCCL_DEVICE void __cuda_atomic_store_memory_order_dispatch(_Fn &__cuda_store, int __memorder, _Sco) { + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_RELEASE: __cuda_store(__atomic_cuda_release{}); break; + case __ATOMIC_SEQ_CST: __cuda_atomic_fence(_Sco{}, __atomic_cuda_seq_cst{}); _CCCL_FALLTHROUGH(); + case __ATOMIC_RELAXED: __cuda_store(__atomic_cuda_relaxed{}); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); + case __ATOMIC_SEQ_CST: __cuda_atomic_membar(_Sco{}); _CCCL_FALLTHROUGH(); + case __ATOMIC_RELAXED: __cuda_store(__atomic_cuda_volatile{}); break; + default: assert(0); + } + ) + ) +} +)XXX"; + // Argument ID Reference + // 0 - Operand Type + // 1 - Operand Size + // 2 - Constraint + // 3 - Memory order + // 4 - Memory order semantic + // 5 - Scope tag + // 6 - Scope semantic + // 7 - Mmio tag + // 8 - Mmio semantic + const std::string asm_intrinsic_format_128 = R"XXX( +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, {3}, __atomic_cuda_operand_{0}{1}, {5}, {7}) +{{ + asm volatile(R"YYY( + .reg .b128 _v; + mov.b128 {{%1, %2}}, _v; + st{8}{4}{6}.b128 [%0],_v; +)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory"); +}})XXX"; + const std::string asm_intrinsic_format = R"XXX( +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, {3}, __atomic_cuda_operand_{0}{1}, {5}, {7}) +{{ asm volatile("st{8}{4}{6}.{0}{1} [%0],%1;" :: "l"(__ptr), "{2}"(__val) : "memory"); }})XXX"; + + constexpr size_t supported_sizes[] = { + 
16, + 32, + 64, + 128, + }; + + constexpr Operand supported_types[] = { + Operand::Bit, + }; + + constexpr Semantic supported_semantics[] = { + Semantic::Release, + Semantic::Relaxed, + Semantic::Volatile, + }; + + constexpr Scope supported_scopes[] = { + Scope::CTA, + Scope::Cluster, + Scope::GPU, + Scope::System, + }; + + constexpr Mmio mmio_states[] = { + Mmio::Disabled, + Mmio::Enabled, + }; + + for (auto size : supported_sizes) + { + for (auto type : supported_types) + { + for (auto sem : supported_semantics) + { + for (auto sco : supported_scopes) + { + for (auto mm : mmio_states) + { + if (size == 16 && type == Operand::Floating) + { + continue; + } + if (size == 128 && type != Operand::Bit) + { + continue; + } + if ((mm == Mmio::Enabled) && ((sco != Scope::System) || (sem != Semantic::Relaxed))) + { + continue; + } + out << fmt::format( + (size == 128) ? asm_intrinsic_format_128 : asm_intrinsic_format, + /* 0 */ operand(type), + /* 1 */ size, + /* 2 */ constraints(type, size), + /* 3 */ semantic_tag(sem), + /* 4 */ semantic_ld_st(sem), + /* 5 */ scope_tag(sco), + /* 6 */ scope_ld_st(sem, sco), + /* 7 */ mmio_tag(mm), + /* 8 */ mmio(mm)); + } + } + } + } + } + out << "\n" + << R"XXX( +template +struct __cuda_atomic_bind_store { + _Type* __ptr; + _Type* __val; + + template + inline _CCCL_DEVICE void operator()(_Atomic_Memorder) { + __cuda_atomic_store(__ptr, *__val, _Atomic_Memorder{}, _Tag{}, _Sco{}, _Mmio{}); + } +}; +template +static inline _CCCL_DEVICE void __atomic_store_cuda(_Type* __ptr, _Type& __val, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __val_proxy = reinterpret_cast<__proxy_t*>(&__val); + __cuda_atomic_bind_store<__proxy_t, __proxy_tag, _Sco, __atomic_cuda_mmio_disable> __bound_store{__ptr_proxy, __val_proxy}; + __cuda_atomic_store_memory_order_dispatch(__bound_store, __memorder, _Sco{}); +} +template +static inline _CCCL_DEVICE void __atomic_store_cuda(volatile _Type* __ptr, _Type& __val, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __val_proxy = reinterpret_cast<__proxy_t*>(&__val); + __cuda_atomic_bind_store<__proxy_t, __proxy_tag, _Sco, __atomic_cuda_mmio_disable> __bound_store{__ptr_proxy, __val_proxy}; + __cuda_atomic_store_memory_order_dispatch(__bound_store, __memorder, _Sco{}); +} +)XXX"; +} + +#endif // LD_ST_H diff --git a/libcudacxx/include/cuda/std/__atomic/functions/common.h b/libcudacxx/include/cuda/std/__atomic/functions/common.h new file mode 100644 index 0000000000..415c59a9be --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/functions/common.h @@ -0,0 +1,54 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMIC_FUNCTIONS_COMMON_H +#define _LIBCUDACXX___ATOMIC_FUNCTIONS_COMMON_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +template +struct __atomic_ptr_skip +{ + static constexpr auto __skip = 1; +}; + +template +struct __atomic_ptr_skip<_Tp*> +{ + static constexpr auto __skip = sizeof(_Tp); +}; + +// FIXME: Haven't figured out what the spec says about using arrays with +// atomic_fetch_add. Force a failure rather than creating bad behavior. +template +struct __atomic_ptr_skip<_Tp[]> +{}; +template +struct __atomic_ptr_skip<_Tp[n]> +{}; + +template +using __atomic_ptr_skip_t = __atomic_ptr_skip<__remove_cvref_t<_Tp>>; + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMIC_FUNCTIONS_COMMON_H diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index 1d791ca42e..0e525bf296 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -13,6 +13,8 @@ #include +#include "cuda_ptx_generated.h" + #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -31,232 +33,372 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD #if defined(_CCCL_CUDA_COMPILER) -template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda( - void volatile* __ptr, - _Tp* __expected, - const _Tp __desired, - bool __weak, - int __success_memorder, - int __failure_memorder, - _Sco) -{ - using __proxy_t = _If; - __proxy_t __old = 0; - __proxy_t __new = 0; - memcpy(&__old, __expected, sizeof(__proxy_t)); - memcpy(&__new, &__desired, sizeof(__proxy_t)); - bool __result = - __atomic_compare_exchange_cuda(__ptr, &__old, __new, __weak, __success_memorder, __failure_memorder, _Sco{}); - memcpy(__expected, &__old, sizeof(__proxy_t)); - return __result; -} -template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda( - void* __ptr, _Tp* __expected, const _Tp __desired, bool __weak, int __success_memorder, int __failure_memorder, _Sco) -{ - using __proxy_t = _If; - __proxy_t __old = 0; - __proxy_t __new = 0; - memcpy(&__old, __expected, sizeof(__proxy_t)); - memcpy(&__new, &__desired, sizeof(__proxy_t)); - bool __result = - __atomic_compare_exchange_cuda(__ptr, &__old, __new, __weak, __success_memorder, __failure_memorder, _Sco{}); - memcpy(__expected, &__old, sizeof(__proxy_t)); - return __result; -} -template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void volatile* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco) -{ - using __proxy_t = _If; - __proxy_t __old = 0; - __proxy_t __new = 0; - memcpy(&__new, __val, sizeof(__proxy_t)); - __atomic_exchange_cuda(__ptr, &__new, &__old, __memorder, _Sco{}); - memcpy(__ret, &__old, sizeof(__proxy_t)); -} -template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco) -{ - using 
__proxy_t = _If; - __proxy_t __old = 0; - __proxy_t __new = 0; - memcpy(&__new, __val, sizeof(__proxy_t)); - __atomic_exchange_cuda(__ptr, &__new, &__old, __memorder, _Sco{}); - memcpy(__ret, &__old, sizeof(__proxy_t)); -} - -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda( - _Tp volatile* __ptr, _Tp* __expected, const _Tp __desired, bool, int __success_memorder, int __failure_memorder, _Sco) -{ - auto const __aligned = (uint32_t*) ((intptr_t) __ptr & ~(sizeof(uint32_t) - 1)); - auto const __offset = uint32_t((intptr_t) __ptr & (sizeof(uint32_t) - 1)) * 8; - auto const __mask = ((1 << sizeof(_Tp) * 8) - 1) << __offset; - - uint32_t __old = *__expected << __offset; - uint32_t __old_value; - while (1) - { - __old_value = (__old & __mask) >> __offset; - if (__old_value != *__expected) - { - break; - } - uint32_t const __attempt = (__old & ~__mask) | (*__desired << __offset); - if (__atomic_compare_exchange_cuda( - __aligned, &__old, &__attempt, true, __success_memorder, __failure_memorder, _Sco{})) - { - return true; - } - } - *__expected = __old_value; - return false; -} - -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(_Tp volatile* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco) -{ - _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); - while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __val, true, __memorder, __memorder, _Sco{})) - ; - *__ret = __expected; -} - -template = 0> -_CCCL_DEVICE _Tp __atomic_fetch_add_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +template +_CCCL_DEVICE _Tp __atomic_fetch_update_cuda(_Tp* __ptr, const _Fn& __op, int __memorder, _Sco) { _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); - _Tp __desired = __expected + __val; + _Tp __desired = __op(__expected); while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) { - __desired = __expected + __val; + __desired = __op(__expected); } return __expected; } - -template ::value, int> = 0> -_CCCL_DEVICE _Tp __atomic_fetch_max_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +template +_CCCL_DEVICE _Tp __atomic_fetch_update_cuda(_Tp volatile* __ptr, const _Fn& __op, int __memorder, _Sco) { _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); - _Tp __desired = __expected > __val ? __expected : __val; - - while (__desired == __val - && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) + _Tp __desired = __op(__expected); + while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) { - __desired = __expected > __val ? __expected : __val; + __desired = __op(__expected); } - return __expected; } -template ::value, int> = 0> -_CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +template +_CCCL_DEVICE _Tp __atomic_load_n_cuda(const _Tp* __ptr, int __memorder, _Sco) { - _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); - _Tp __desired = __expected < __val ? __expected : __val; - - while (__desired == __val - && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) - { - __desired = __expected < __val ? 
__expected : __val; - } - - return __expected; + _Tp __ret; + __atomic_load_cuda(__ptr, __ret, __memorder, _Sco{}); + return __ret; } - -template = 0> -_CCCL_DEVICE _Tp __atomic_fetch_sub_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +template +_CCCL_DEVICE _Tp __atomic_load_n_cuda(const _Tp volatile* __ptr, int __memorder, _Sco) { - _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); - _Tp __desired = __expected - __val; - while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) - { - __desired = __expected - __val; - } - return __expected; + _Tp __ret; + __atomic_load_cuda(__ptr, __ret, __memorder, _Sco{}); + return __ret; } -template = 0> -_CCCL_DEVICE _Tp __atomic_fetch_and_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +template +_CCCL_DEVICE void __atomic_store_n_cuda(_Tp* __ptr, _Tp __val, int __memorder, _Sco) { - _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); - _Tp __desired = __expected & __val; - while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) - { - __desired = __expected & __val; - } - return __expected; + __atomic_store_cuda(__ptr, __val, __memorder, _Sco{}); } - -template = 0> -_CCCL_DEVICE _Tp __atomic_fetch_xor_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +template +_CCCL_DEVICE void __atomic_store_n_cuda(_Tp volatile* __ptr, _Tp __val, int __memorder, _Sco) { - _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); - _Tp __desired = __expected ^ __val; - while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) - { - __desired = __expected ^ __val; - } - return __expected; + __atomic_store_cuda(__ptr, __val, __memorder, _Sco{}); } -template = 0> -_CCCL_DEVICE _Tp __atomic_fetch_or_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +template +_CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp* __ptr, _Tp __val, int __memorder, _Sco) { - _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); - _Tp __desired = __expected | __val; - while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) - { - __desired = __expected | __val; - } - return __expected; + _Tp __ret; + __atomic_exchange_cuda(__ptr, __ret, __val, __memorder, _Sco{}); + return __ret; } - template -_CCCL_DEVICE _Tp __atomic_load_n_cuda(const _Tp volatile* __ptr, int __memorder, _Sco) +_CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp volatile* __ptr, _Tp __val, int __memorder, _Sco) { _Tp __ret; - __atomic_load_cuda(__ptr, &__ret, __memorder, _Sco{}); + __atomic_exchange_cuda(__ptr, __ret, __val, __memorder, _Sco{}); return __ret; } -template -_CCCL_DEVICE void __atomic_store_n_cuda(_Tp volatile* __ptr, _Tp __val, int __memorder, _Sco) +template = 0> +_CCCL_DEVICE float __atomic_fetch_min_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco) { - __atomic_store_cuda(__ptr, &__val, __memorder, _Sco{}); + return __atomic_fetch_update_cuda( + __ptr, + [__val](_Tp __old) { + return __old < __val ? 
__old : __val; + }, + __memorder, + _Sco{}); } - -template -_CCCL_DEVICE bool __atomic_compare_exchange_n_cuda( - _Tp volatile* __ptr, _Tp* __expected, _Tp __desired, bool __weak, int __success_memorder, int __failure_memorder, _Sco) +template = 0> +_CCCL_DEVICE float __atomic_fetch_min_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco) { - return __atomic_compare_exchange_cuda( - __ptr, __expected, __desired, __weak, __success_memorder, __failure_memorder, _Sco{}); + return __atomic_fetch_update_cuda( + __ptr, + [__val](_Tp __old) { + return __old < __val ? __old : __val; + }, + __memorder, + _Sco{}); } -template -_CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp volatile* __ptr, _Tp __val, int __memorder, _Sco) +template = 0> +_CCCL_DEVICE double __atomic_fetch_max_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco) { - _Tp __ret; - __atomic_exchange_cuda(__ptr, &__val, &__ret, __memorder, _Sco{}); - return __ret; + return __atomic_fetch_update_cuda( + __ptr, + [__val](_Tp __old) { + return __old > __val ? __old : __val; + }, + __memorder, + _Sco{}); +} +template = 0> +_CCCL_DEVICE double __atomic_fetch_max_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco) +{ + return __atomic_fetch_update_cuda( + __ptr, + [__val](_Tp __old) { + return __old > __val ? __old : __val; + }, + __memorder, + _Sco{}); } +// template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> +// _CCCL_DEVICE bool __atomic_compare_exchange_cuda( +// void volatile* __ptr, +// _Tp* __expected, +// const _Tp __desired, +// bool __weak, +// int __success_memorder, +// int __failure_memorder, +// _Sco) +// { +// using __proxy_t = _If; +// __proxy_t __old = 0; +// __proxy_t __new = 0; +// memcpy(&__old, __expected, sizeof(__proxy_t)); +// memcpy(&__new, &__desired, sizeof(__proxy_t)); +// bool __result = +// __atomic_compare_exchange_cuda(__ptr, &__old, __new, __weak, __success_memorder, __failure_memorder, _Sco{}); +// memcpy(__expected, &__old, sizeof(__proxy_t)); +// return __result; +// } +// template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> +// _CCCL_DEVICE bool __atomic_compare_exchange_cuda( +// void* __ptr, _Tp* __expected, const _Tp __desired, bool __weak, int __success_memorder, int __failure_memorder, +// _Sco) +// { +// using __proxy_t = _If; +// __proxy_t __old = 0; +// __proxy_t __new = 0; +// memcpy(&__old, __expected, sizeof(__proxy_t)); +// memcpy(&__new, &__desired, sizeof(__proxy_t)); +// bool __result = +// __atomic_compare_exchange_cuda(__ptr, &__old, __new, __weak, __success_memorder, __failure_memorder, _Sco{}); +// memcpy(__expected, &__old, sizeof(__proxy_t)); +// return __result; +// } +// template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> +// _CCCL_DEVICE void __atomic_exchange_cuda(void volatile* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco) +// { +// using __proxy_t = _If; +// __proxy_t __old = 0; +// __proxy_t __new = 0; +// memcpy(&__new, __val, sizeof(__proxy_t)); +// __atomic_exchange_cuda(__ptr, &__new, &__old, __memorder, _Sco{}); +// memcpy(__ret, &__old, sizeof(__proxy_t)); +// } +// template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> +// _CCCL_DEVICE void __atomic_exchange_cuda(void* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco) +// { +// using __proxy_t = _If; +// __proxy_t __old = 0; +// __proxy_t __new = 0; +// memcpy(&__new, __val, sizeof(__proxy_t)); +// __atomic_exchange_cuda(__ptr, &__new, &__old, __memorder, _Sco{}); +// memcpy(__ret, &__old, sizeof(__proxy_t)); +// } + +// template = 0> 
+// _CCCL_DEVICE bool __atomic_compare_exchange_cuda( +// _Tp volatile* __ptr, _Tp* __expected, const _Tp __desired, bool, int __success_memorder, int __failure_memorder, +// _Sco) +// { +// auto const __aligned = (uint32_t*) ((intptr_t) __ptr & ~(sizeof(uint32_t) - 1)); +// auto const __offset = uint32_t((intptr_t) __ptr & (sizeof(uint32_t) - 1)) * 8; +// auto const __mask = ((1 << sizeof(_Tp) * 8) - 1) << __offset; + +// uint32_t __old = *__expected << __offset; +// uint32_t __old_value; +// while (1) +// { +// __old_value = (__old & __mask) >> __offset; +// if (__old_value != *__expected) +// { +// break; +// } +// uint32_t const __attempt = (__old & ~__mask) | (*__desired << __offset); +// if (__atomic_compare_exchange_cuda( +// __aligned, &__old, &__attempt, true, __success_memorder, __failure_memorder, _Sco{})) +// { +// return true; +// } +// } +// *__expected = __old_value; +// return false; +// } + +// template = 0> +// _CCCL_DEVICE void __atomic_exchange_cuda(_Tp volatile* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco) +// { +// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); +// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __val, true, __memorder, __memorder, _Sco{})) +// ; +// *__ret = __expected; +// } + +// template = 0> +// _CCCL_DEVICE _Tp __atomic_fetch_add_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +// { +// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); +// _Tp __desired = __expected + __val; +// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) +// { +// __desired = __expected + __val; +// } +// return __expected; +// } + +// template ::value, int> = 0> +// _CCCL_DEVICE _Tp __atomic_fetch_max_cuda(_Tp * __ptr, _Up __val, int __memorder, _Sco) +// { +// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); +// _Tp __desired = __expected > __val ? __expected : __val; + +// while (__desired == __val +// && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) +// { +// __desired = __expected > __val ? __expected : __val; +// } + +// return __expected; +// } +// template ::value, int> = 0> +// _CCCL_DEVICE _Tp __atomic_fetch_max_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +// { +// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); +// _Tp __desired = __expected > __val ? __expected : __val; + +// while (__desired == __val +// && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) +// { +// __desired = __expected > __val ? __expected : __val; +// } + +// return __expected; +// } + +// template ::value, int> = 0> +// _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp * __ptr, _Up __val, int __memorder, _Sco) +// { +// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); +// _Tp __desired = __expected < __val ? __expected : __val; + +// while (__desired == __val +// && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) +// { +// __desired = __expected < __val ? __expected : __val; +// } + +// return __expected; +// } +// template ::value, int> = 0> +// _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +// { +// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); +// _Tp __desired = __expected < __val ? 
__expected : __val; + +// while (__desired == __val +// && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) +// { +// __desired = __expected < __val ? __expected : __val; +// } + +// return __expected; +// } + +// template = 0> +// _CCCL_DEVICE _Tp __atomic_fetch_sub_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +// { +// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); +// _Tp __desired = __expected - __val; +// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) +// { +// __desired = __expected - __val; +// } +// return __expected; +// } + +// template = 0> +// _CCCL_DEVICE _Tp __atomic_fetch_and_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +// { +// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); +// _Tp __desired = __expected & __val; +// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) +// { +// __desired = __expected & __val; +// } +// return __expected; +// } + +// template = 0> +// _CCCL_DEVICE _Tp __atomic_fetch_xor_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +// { +// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); +// _Tp __desired = __expected ^ __val; +// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) +// { +// __desired = __expected ^ __val; +// } +// return __expected; +// } + +// template = 0> +// _CCCL_DEVICE _Tp __atomic_fetch_or_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +// { +// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); +// _Tp __desired = __expected | __val; +// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) +// { +// __desired = __expected | __val; +// } +// return __expected; +// } + +// template +// _CCCL_DEVICE bool __atomic_compare_exchange_n_cuda( +// _Tp volatile* __ptr, _Tp* __expected, _Tp __desired, bool __weak, int __success_memorder, int __failure_memorder, +// _Sco) +// { +// return __atomic_compare_exchange_cuda( +// __ptr, __expected, __desired, __weak, __success_memorder, __failure_memorder, _Sco{}); +// } + +// template +// _CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp volatile* __ptr, _Tp __val, int __memorder, _Sco) +// { +// _Tp __ret; +// __atomic_exchange_cuda(__ptr, __ret, __val, __memorder, _Sco{}); +// return __ret; +// } +// template +// _CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp* __ptr, _Tp __val, int __memorder, _Sco) +// { +// _Tp __ret; +// __atomic_exchange_cuda(__ptr, __ret, __val, __memorder, _Sco{}); +// return __ret; +// } + _CCCL_DEVICE static inline void __atomic_signal_fence_cuda(int) { asm volatile("" ::: "memory"); diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h index 2ebfa4ea3a..e72144b68c 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h @@ -1,10 +1,11 @@ + //===----------------------------------------------------------------------===// // // Part of libcu++, the C++ Standard Library for your entire system, // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// @@ -33,23 +34,46 @@ #include #include +#include +#include _LIBCUDACXX_BEGIN_NAMESPACE_STD #if defined(_CCCL_CUDA_COMPILER) -static inline _CCCL_DEVICE void __cuda_membar_block() { asm volatile("membar.cta;":::"memory"); } -static inline _CCCL_DEVICE void __cuda_fence_acq_rel_block() { asm volatile("fence.acq_rel.cta;":::"memory"); } -static inline _CCCL_DEVICE void __cuda_fence_sc_block() { asm volatile("fence.sc.cta;":::"memory"); } -static inline _CCCL_DEVICE void __atomic_thread_fence_cuda(int __memorder, __thread_scope_block_tag) { +static inline _CCCL_DEVICE void __cuda_atomic_membar(__thread_scope_block_tag) +{ asm volatile("membar.cta;" ::: "memory"); } +static inline _CCCL_DEVICE void __cuda_atomic_membar(__thread_scope_device_tag) +{ asm volatile("membar.gl;" ::: "memory"); } +static inline _CCCL_DEVICE void __cuda_atomic_membar(__thread_scope_system_tag) +{ asm volatile("membar.sys;" ::: "memory"); } +static inline _CCCL_DEVICE void __cuda_atomic_fence(__thread_scope_block_tag, __atomic_cuda_acq_rel) +{ asm volatile("fence.acq_rel.cta;" ::: "memory"); } +static inline _CCCL_DEVICE void __cuda_atomic_fence(__thread_scope_block_tag, __atomic_cuda_seq_cst) +{ asm volatile("fence.sc.cta;" ::: "memory"); } +static inline _CCCL_DEVICE void __cuda_atomic_fence(__thread_scope_cluster_tag, __atomic_cuda_acq_rel) +{ asm volatile("fence.acq_rel.cluster;" ::: "memory"); } +static inline _CCCL_DEVICE void __cuda_atomic_fence(__thread_scope_cluster_tag, __atomic_cuda_seq_cst) +{ asm volatile("fence.sc.cluster;" ::: "memory"); } +static inline _CCCL_DEVICE void __cuda_atomic_fence(__thread_scope_device_tag, __atomic_cuda_acq_rel) +{ asm volatile("fence.acq_rel.gpu;" ::: "memory"); } +static inline _CCCL_DEVICE void __cuda_atomic_fence(__thread_scope_device_tag, __atomic_cuda_seq_cst) +{ asm volatile("fence.sc.gpu;" ::: "memory"); } +static inline _CCCL_DEVICE void __cuda_atomic_fence(__thread_scope_system_tag, __atomic_cuda_acq_rel) +{ asm volatile("fence.acq_rel.sys;" ::: "memory"); } +static inline _CCCL_DEVICE void __cuda_atomic_fence(__thread_scope_system_tag, __atomic_cuda_seq_cst) +{ asm volatile("fence.sc.sys;" ::: "memory"); } + +template +static inline _CCCL_DEVICE void __atomic_thread_fence_cuda(int __memorder, _Sco) { NV_DISPATCH_TARGET( NV_PROVIDES_SM_70, ( switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); break; + case __ATOMIC_SEQ_CST: __cuda_atomic_fence(_Sco{}, __atomic_cuda_seq_cst{}); break; case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); case __ATOMIC_ACQUIRE: _CCCL_FALLTHROUGH(); case __ATOMIC_ACQ_REL: _CCCL_FALLTHROUGH(); - case __ATOMIC_RELEASE: __cuda_fence_acq_rel_block(); break; + case __ATOMIC_RELEASE: __cuda_atomic_fence(_Sco{}, __atomic_cuda_acq_rel{}); break; case __ATOMIC_RELAXED: break; default: assert(0); } @@ -60,6802 +84,4015 @@ static inline _CCCL_DEVICE void __atomic_thread_fence_cuda(int __memorder, __thr case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); case __ATOMIC_ACQUIRE: _CCCL_FALLTHROUGH(); case __ATOMIC_ACQ_REL: _CCCL_FALLTHROUGH(); - case __ATOMIC_RELEASE: __cuda_membar_block(); break; + case __ATOMIC_RELEASE: __cuda_atomic_membar(_Sco{}); break; case __ATOMIC_RELAXED: break; default: assert(0); } ) ) } -template static inline _CCCL_DEVICE 
void __cuda_load_acquire_32_block(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.acquire.cta.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_load_relaxed_32_block(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.relaxed.cta.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_load_volatile_32_block(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.volatile.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_load_cuda(const volatile _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_block_tag) { - uint32_t __tmp = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_acquire_32_block(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_32_block(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_volatile_32_block(__ptr, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_32_block(__ptr, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template = 0> -_CCCL_DEVICE void __atomic_load_cuda(const _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_block_tag) { - uint32_t __tmp = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_acquire_32_block(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_32_block(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_volatile_32_block(__ptr, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_32_block(__ptr, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template static inline _CCCL_DEVICE void __cuda_load_acquire_64_block(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.acquire.cta.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_load_relaxed_64_block(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.relaxed.cta.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_load_volatile_64_block(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.volatile.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_load_cuda(const volatile _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_block_tag) { - uint64_t __tmp = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_acquire_64_block(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_64_block(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: 
__cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_volatile_64_block(__ptr, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_64_block(__ptr, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template = 0> -_CCCL_DEVICE void __atomic_load_cuda(const _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_block_tag) { - uint64_t __tmp = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_acquire_64_block(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_64_block(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_volatile_64_block(__ptr, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_64_block(__ptr, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template static inline _CCCL_DEVICE void __cuda_store_relaxed_32_block(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.relaxed.cta.b32 [%0], %1;" :: "l"(__ptr),"r"(__src) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_store_release_32_block(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.release.cta.b32 [%0], %1;" :: "l"(__ptr),"r"(__src) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_store_volatile_32_block(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.volatile.b32 [%0], %1;" :: "l"(__ptr),"r"(__src) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_store_cuda(volatile _Type *__ptr, _Type *__val, int __memorder, __thread_scope_block_tag) { - uint32_t __tmp = 0; - memcpy(&__tmp, __val, 4); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_RELEASE: __cuda_store_release_32_block(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_32_block(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); - case __ATOMIC_SEQ_CST: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_volatile_32_block(__ptr, __tmp); break; - default: assert(0); - } - ) - ) -} -template = 0> -_CCCL_DEVICE void __atomic_store_cuda(_Type *__ptr, _Type *__val, int __memorder, __thread_scope_block_tag) { - uint32_t __tmp = 0; - memcpy(&__tmp, __val, 4); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_RELEASE: __cuda_store_release_32_block(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_32_block(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); - case __ATOMIC_SEQ_CST: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_volatile_32_block(__ptr, __tmp); break; - default: assert(0); - } - ) - ) -} -template static inline _CCCL_DEVICE void __cuda_store_relaxed_64_block(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.relaxed.cta.b64 [%0], %1;" :: "l"(__ptr),"l"(__src) : 
"memory"); } -template static inline _CCCL_DEVICE void __cuda_store_release_64_block(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.release.cta.b64 [%0], %1;" :: "l"(__ptr),"l"(__src) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_store_volatile_64_block(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.volatile.b64 [%0], %1;" :: "l"(__ptr),"l"(__src) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_store_cuda(volatile _Type *__ptr, _Type *__val, int __memorder, __thread_scope_block_tag) { - uint64_t __tmp = 0; - memcpy(&__tmp, __val, 8); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_RELEASE: __cuda_store_release_64_block(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_64_block(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); - case __ATOMIC_SEQ_CST: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_volatile_64_block(__ptr, __tmp); break; - default: assert(0); - } - ) - ) -} -template = 0> -_CCCL_DEVICE void __atomic_store_cuda(_Type *__ptr, _Type *__val, int __memorder, __thread_scope_block_tag) { - uint64_t __tmp = 0; - memcpy(&__tmp, __val, 8); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_RELEASE: __cuda_store_release_64_block(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_64_block(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); - case __ATOMIC_SEQ_CST: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_volatile_64_block(__ptr, __tmp); break; - default: assert(0); - } - ) - ) -} -template static inline _CCCL_DEVICE void __cuda_fetch_and_acq_rel_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.acq_rel.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_acquire_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.acquire.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_relaxed_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.relaxed.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_release_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.release.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_volatile_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_and_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: 
__cuda_fetch_and_acq_rel_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_and_volatile_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_and_volatile_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_or_acq_rel_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.acq_rel.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_acquire_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.acquire.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_relaxed_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.relaxed.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_release_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.release.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_volatile_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_or_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - 
case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_or_volatile_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_or_volatile_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_xor_acq_rel_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.acq_rel.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_acquire_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.acquire.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_relaxed_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.relaxed.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_release_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.release.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_volatile_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_xor_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = 
__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_xor_volatile_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_xor_volatile_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acq_rel_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.cta.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acquire_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.cta.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_relaxed_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.cta.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.cta.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void 
__cuda_compare_exchange_volatile_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.cta.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_f32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_f32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_f32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_f32_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_f32_block(__ptr, *__expected, __old, __desired); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_compare_exchange_volatile_f32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_f32_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_f32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_f32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_f32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_f32_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_f32_block(__ptr, *__expected, __old, __desired); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_compare_exchange_volatile_f32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_f32_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} 
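The derived-atomics hunks earlier in this patch collapse the per-operation retry loops into a single __atomic_fetch_update_cuda helper that applies a caller-supplied functor inside a compare-exchange loop; the block-scope compare-exchange wrappers being deleted here are the kind of primitive such a loop sits on top of. Below is a rough, standalone sketch of that pattern in plain CUDA, not part of libcu++ or of this patch: the names fetch_update and fetch_min_float are invented for illustration, and thread scope and memory order are left at atomicCAS's default device-scope relaxed semantics, unlike the real helper, which forwards __memorder and _Sco.

#include <cstdio>
#include <cuda_runtime.h>

// Generic CAS retry loop: apply `op` to the currently stored value until the
// compare-and-swap succeeds, then return the value that was replaced.
template <class Fn>
__device__ unsigned int fetch_update(unsigned int* ptr, Fn op)
{
  unsigned int expected = *ptr; // initial (non-atomic) snapshot is fine for a sketch
  unsigned int old      = atomicCAS(ptr, expected, op(expected));
  while (old != expected)
  {
    expected = old;
    old      = atomicCAS(ptr, expected, op(expected));
  }
  return old; // value observed immediately before the successful CAS
}

// fetch_min for float, emulated on top of the 32-bit CAS loop above.
// (Simplified: ignores NaN/-0.0 corner cases and memory ordering.)
__device__ float fetch_min_float(float* ptr, float val)
{
  unsigned int* bits = reinterpret_cast<unsigned int*>(ptr);
  unsigned int old   = fetch_update(bits, [val](unsigned int cur) {
    float f = __uint_as_float(cur);
    return __float_as_uint(f < val ? f : val);
  });
  return __uint_as_float(old);
}

__global__ void kernel(float* p)
{
  fetch_min_float(p, 1.0f + static_cast<float>(threadIdx.x));
}

int main()
{
  float* p = nullptr;
  cudaMallocManaged(&p, sizeof(float));
  *p = 100.0f;
  kernel<<<1, 32>>>(p);
  cudaDeviceSynchronize();
  std::printf("min = %f\n", *p); // prints 1.000000
  cudaFree(p);
  return 0;
}

Expressing every emulated read-modify-write through one retry loop avoids duplicating that loop per operation, which matches how the new fetch_min/fetch_max overloads earlier in this patch are written in terms of __atomic_fetch_update_cuda.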
-template static inline _CCCL_DEVICE void __cuda_compare_exchange_acq_rel_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acquire_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_relaxed_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_u32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_u32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_u32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_u32_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_u32_block(__ptr, *__expected, __old, __desired); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_compare_exchange_volatile_u32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_u32_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_u32_block(__ptr, *__expected, __old, 
__desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_u32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_u32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_u32_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_u32_block(__ptr, *__expected, __old, __desired); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_compare_exchange_volatile_u32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_u32_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template static inline _CCCL_DEVICE void __cuda_exchange_acq_rel_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.cta.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_acquire_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.cta.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_relaxed_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.cta.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_release_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.release.cta.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_volatile_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.cta.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(volatile void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_block_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_f32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_f32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_exchange_volatile_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_f32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 
4); -} -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_block_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_f32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_f32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_exchange_volatile_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_f32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template static inline _CCCL_DEVICE void __cuda_exchange_acq_rel_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_acquire_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_relaxed_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_release_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.release.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_volatile_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(volatile void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_block_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_u32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_u32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: 
__cuda_membar_block(); __cuda_exchange_volatile_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_u32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_block_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_u32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_u32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_exchange_volatile_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_u32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template static inline _CCCL_DEVICE void __cuda_fetch_add_acq_rel_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_acquire_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acquire.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_relaxed_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.relaxed.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_release_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.release.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_volatile_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_f32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: 
__cuda_membar_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_f32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break;
- case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_f32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_f32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- )
- )
- return __tmp;
-}
-template = 0>
-_CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) {
- _Type __tmp = __val;
- NV_DISPATCH_TARGET(
- NV_PROVIDES_SM_70, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_f32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_f32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELEASE: __cuda_fetch_add_release_f32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_f32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- ),
- NV_IS_DEVICE, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_f32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break;
- case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_f32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_f32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- )
- )
- return __tmp;
-}
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_add_acq_rel_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_add_acquire_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acquire.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_add_relaxed_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.relaxed.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_add_release_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.release.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_add_volatile_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template = 0>
-_CCCL_DEVICE _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) {
- _Type __tmp = __val;
- NV_DISPATCH_TARGET(
- NV_PROVIDES_SM_70, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELEASE: __cuda_fetch_add_release_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- ),
- NV_IS_DEVICE, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break;
- case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- )
- )
- return __tmp;
-}
-template = 0>
-_CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) {
- _Type __tmp = __val;
- NV_DISPATCH_TARGET(
- NV_PROVIDES_SM_70, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELEASE: __cuda_fetch_add_release_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- ),
- NV_IS_DEVICE, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break;
- case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- )
- )
- return __tmp;
-}
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_max_acq_rel_s32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acq_rel.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_max_acquire_s32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acquire.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_max_relaxed_s32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.relaxed.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_max_release_s32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.release.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_max_volatile_s32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template = 0>
-_CCCL_DEVICE _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) {
- _Type __tmp = __val;
- NV_DISPATCH_TARGET(
- NV_PROVIDES_SM_70, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_s32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_s32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELEASE: __cuda_fetch_max_release_s32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_s32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- ),
- NV_IS_DEVICE, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_s32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break;
- case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_max_volatile_s32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_s32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- )
- )
- return __tmp;
-}
-template = 0>
-_CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) {
- _Type __tmp = __val;
- NV_DISPATCH_TARGET(
- NV_PROVIDES_SM_70, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_s32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_s32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELEASE: __cuda_fetch_max_release_s32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_s32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- ),
- NV_IS_DEVICE, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_s32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break;
- case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_max_volatile_s32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_s32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- )
- )
- return __tmp;
-}
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_max_acq_rel_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acq_rel.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_max_acquire_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acquire.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_max_relaxed_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.relaxed.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_max_release_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.release.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_max_volatile_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template = 0>
-_CCCL_DEVICE _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) {
- _Type __tmp = __val;
- NV_DISPATCH_TARGET(
- NV_PROVIDES_SM_70, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST:
__cuda_fence_sc_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELEASE: __cuda_fetch_max_release_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_u32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- ),
- NV_IS_DEVICE, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_u32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break;
- case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_max_volatile_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_u32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- )
- )
- return __tmp;
-}
-template = 0>
-_CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) {
- _Type __tmp = __val;
- NV_DISPATCH_TARGET(
- NV_PROVIDES_SM_70, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELEASE: __cuda_fetch_max_release_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_u32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- ),
- NV_IS_DEVICE, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_u32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break;
- case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_max_volatile_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_u32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- )
- )
- return __tmp;
-}
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_min_acq_rel_s32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acq_rel.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_min_acquire_s32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acquire.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_min_relaxed_s32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.relaxed.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_min_release_s32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.release.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_min_volatile_s32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template = 0>
-_CCCL_DEVICE _Type __atomic_fetch_min_cuda(volatile
_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_s32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_s32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_s32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_s32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_s32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_min_volatile_s32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_s32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_s32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_s32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_s32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_s32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_s32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_min_volatile_s32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_s32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_min_acq_rel_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acq_rel.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_acquire_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acquire.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_relaxed_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.relaxed.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_release_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.release.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_volatile_u32_block(_CUDA_A __ptr, _CUDA_B& 
__dst, _CUDA_C __op) { asm volatile("atom.min.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_u32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_u32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_min_volatile_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_u32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_u32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_u32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_min_volatile_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_u32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acq_rel_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acq_rel.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acquire_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acquire.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_relaxed_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.relaxed.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_release_f32_block(_CUDA_A __ptr, _CUDA_B& 
__dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.release.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_volatile_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_f32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_f32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_sub_volatile_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_f32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_f32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_f32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_sub_volatile_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_f32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acq_rel_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acq_rel.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acquire_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acquire.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_relaxed_u32_block(_CUDA_A 
__ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.relaxed.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_release_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.release.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_volatile_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_u32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_u32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_sub_volatile_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_u32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_u32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_u32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_sub_volatile_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_u32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_and_acq_rel_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.acq_rel.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_acquire_64_block(_CUDA_A 
__ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.acquire.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_relaxed_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.relaxed.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_release_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.release.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_volatile_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_and_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_and_volatile_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_and_volatile_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_or_acq_rel_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm 
volatile("atom.or.acq_rel.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_acquire_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.acquire.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_relaxed_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.relaxed.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_release_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.release.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_volatile_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_or_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_or_volatile_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_or_volatile_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - 
return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_xor_acq_rel_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.acq_rel.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_acquire_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.acquire.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_relaxed_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.relaxed.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_release_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.release.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_volatile_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_xor_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_xor_volatile_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); 
__cuda_fetch_xor_volatile_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acq_rel_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.cta.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acquire_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.cta.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_relaxed_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.cta.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.cta.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.cta.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_f64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_f64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_f64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_f64_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_f64_block(__ptr, *__expected, __old, __desired); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_compare_exchange_volatile_f64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_f64_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case 
__ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_f64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_f64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_f64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_f64_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_f64_block(__ptr, *__expected, __old, __desired); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_compare_exchange_volatile_f64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_f64_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acq_rel_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acquire_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_relaxed_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_u64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_u64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_u64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_u64_block(__ptr, *__expected, __old, __desired); break; - 
default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_u64_block(__ptr, *__expected, __old, __desired); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_compare_exchange_volatile_u64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_u64_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_u64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_u64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_u64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_u64_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_u64_block(__ptr, *__expected, __old, __desired); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_compare_exchange_volatile_u64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_u64_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template static inline _CCCL_DEVICE void __cuda_exchange_acq_rel_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.cta.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_acquire_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.cta.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_relaxed_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.cta.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_release_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.release.cta.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_volatile_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.cta.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template = 0> 
-_CCCL_DEVICE void __atomic_exchange_cuda(volatile void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_block_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_f64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_f64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_exchange_volatile_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_f64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_block_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_f64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_f64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_exchange_volatile_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_f64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template static inline _CCCL_DEVICE void __cuda_exchange_acq_rel_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_acquire_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_relaxed_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_release_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.release.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static 
inline _CCCL_DEVICE void __cuda_exchange_volatile_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(volatile void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_block_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_exchange_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_block_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_exchange_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template static inline _CCCL_DEVICE void __cuda_fetch_add_acq_rel_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_acquire_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acquire.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_relaxed_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.relaxed.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline 
_CCCL_DEVICE void __cuda_fetch_add_release_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.release.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_volatile_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_f64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_f64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_f64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_f64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_f64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_f64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_add_acq_rel_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_acquire_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acquire.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void 
__cuda_fetch_add_relaxed_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.relaxed.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_release_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.release.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_volatile_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_max_acq_rel_s64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acq_rel.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_acquire_s64_block(_CUDA_A 
__ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acquire.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_relaxed_s64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.relaxed.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_release_s64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.release.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_volatile_s64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_s64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_s64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_max_volatile_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_s64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_s64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_s64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_max_volatile_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_s64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_max_acq_rel_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm 
volatile("atom.max.acq_rel.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_acquire_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acquire.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_relaxed_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.relaxed.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_release_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.release.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_volatile_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_max_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_max_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_u64_block(__ptr, __tmp, __tmp); 
break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_min_acq_rel_s64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acq_rel.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_acquire_s64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acquire.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_relaxed_s64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.relaxed.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_release_s64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.release.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_volatile_s64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_s64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_s64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_min_volatile_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_s64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_s64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_s64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case 
__ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_min_volatile_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_s64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_min_acq_rel_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acq_rel.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_acquire_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acquire.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_relaxed_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.relaxed.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_release_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.release.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_volatile_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_min_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); 
_CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_min_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acq_rel_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acq_rel.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acquire_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acquire.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_relaxed_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.relaxed.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_release_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.release.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_volatile_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_f64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_f64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_sub_volatile_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_f64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) + +template +static inline _CCCL_DEVICE void __cuda_atomic_load_memory_order_dispatch(_Fn &__cuda_load, int __memorder, _Sco) { + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_atomic_fence(_Sco{}, __atomic_cuda_seq_cst{}); _CCCL_FALLTHROUGH(); + case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQUIRE: __cuda_load(__atomic_cuda_acquire{}); break; + case __ATOMIC_RELAXED: __cuda_load(__atomic_cuda_relaxed{}); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_atomic_membar(_Sco{}); 
_CCCL_FALLTHROUGH(); + case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQUIRE: __cuda_load(__atomic_cuda_volatile{}); __cuda_atomic_membar(_Sco{}); break; + case __ATOMIC_RELAXED: __cuda_load(__atomic_cuda_volatile{}); break; + default: assert(0); + } ) - return __tmp; + ) } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_f64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_f64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_sub_volatile_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_f64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; + +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ 
asm volatile("ld.relaxed.gpu.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.u16 %0,[%1];" : 
"=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.gpu.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template 
+static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.gpu.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void 
__cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.gpu.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_f32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.f32 %0,[%1];" : "=f"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_f32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.f32 %0,[%1];" : "=f"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_f32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.f32 %0,[%1];" : "=f"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, 
_Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_f32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.f32 %0,[%1];" : "=f"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_f32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.f32 %0,[%1];" : "=f"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_f32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.f32 %0,[%1];" : "=f"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_f32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.gpu.f32 %0,[%1];" : "=f"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_f32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.f32 %0,[%1];" : "=f"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_f32, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.f32 %0,[%1];" : "=f"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_f32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.f32 %0,[%1];" : "=f"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_f32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.f32 %0,[%1];" : "=f"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_f32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.f32 %0,[%1];" : "=f"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_f32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.f32 %0,[%1];" : "=f"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.u32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.u32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, 
__atomic_cuda_operand_u32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.u32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.u32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.u32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.u32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.gpu.u32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.u32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.u32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.s32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s32, 
__thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.s32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.s32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.s32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.s32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.s32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.gpu.s32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.s32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.s32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_block_tag, 
__atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.gpu.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm 
volatile("ld.volatile.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_f64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.f64 %0,[%1];" : "=d"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_f64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.f64 %0,[%1];" : "=d"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_f64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.f64 %0,[%1];" : "=d"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_f64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.f64 %0,[%1];" : "=d"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_f64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.f64 %0,[%1];" : "=d"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_f64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.f64 %0,[%1];" : "=d"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_f64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.gpu.f64 %0,[%1];" : "=d"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_f64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.f64 %0,[%1];" : "=d"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_f64, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.f64 %0,[%1];" : "=d"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_f64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.f64 %0,[%1];" : "=d"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_f64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.f64 %0,[%1];" : "=d"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_f64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.f64 %0,[%1];" : 
"=d"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_f64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.f64 %0,[%1];" : "=d"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.u64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.u64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.u64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.u64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.u64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.u64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.gpu.u64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.u64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.u64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } 
+template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.s64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.s64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.s64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.s64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.s64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.s64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.gpu.s64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.s64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s64, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.s64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void 
__cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } + template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _d; + ld.acquire.cta.b128 [%2],_d; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory"); +} + template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _d; + ld.acquire.cluster.b128 [%2],_d; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory"); +} + template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _d; + ld.acquire.gpu.b128 [%2],_d; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory"); +} + template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _d; + ld.acquire.sys.b128 [%2],_d; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory"); +} + template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _d; + ld.relaxed.cta.b128 [%2],_d; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory"); +} + template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _d; + ld.relaxed.cluster.b128 [%2],_d; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory"); +} + template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _d; + ld.relaxed.gpu.b128 [%2],_d; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory"); +} + template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, 
__atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _d; + ld.relaxed.sys.b128 [%2],_d; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory"); +} + template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ + asm volatile(R"YYY( + .reg .b128 _d; + ld.mmio.relaxed.sys.b128 [%2],_d; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory"); +} + template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _d; + ld.volatile.b128 [%2],_d; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory"); +} + template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _d; + ld.volatile.b128 [%2],_d; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory"); +} + template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _d; + ld.volatile.b128 [%2],_d; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory"); +} + template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _d; + ld.volatile.b128 [%2],_d; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acq_rel_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acq_rel.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acquire_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acquire.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_relaxed_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.relaxed.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_release_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.release.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_volatile_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( 
- switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_sub_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; + +template +struct __cuda_atomic_bind_load { + const _Type* __ptr; + _Type* __dst; + + template + inline _CCCL_DEVICE void operator()(_Atomic_Memorder) { + __cuda_atomic_load(__ptr, *__dst, _Atomic_Memorder{}, _Tag{}, _Sco{}, _Mmio{}); + } +}; +template +static inline _CCCL_DEVICE void __atomic_load_cuda(const _Type* __ptr, _Type& __dst, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + const __proxy_t* __ptr_proxy = reinterpret_cast(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __cuda_atomic_bind_load<__proxy_t, __proxy_tag, _Sco, __atomic_cuda_mmio_disable> __bound_load{__ptr_proxy, __dst_proxy}; + __cuda_atomic_load_memory_order_dispatch(__bound_load, __memorder, _Sco{}); +} +template +static inline _CCCL_DEVICE void __atomic_load_cuda(const _Type volatile* __ptr, _Type& __dst, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + const __proxy_t* __ptr_proxy = reinterpret_cast(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __cuda_atomic_bind_load<__proxy_t, __proxy_tag, _Sco, __atomic_cuda_mmio_disable> __bound_load{__ptr_proxy, __dst_proxy}; + __cuda_atomic_load_memory_order_dispatch(__bound_load, __memorder, _Sco{}); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: 
__cuda_fetch_sub_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_sub_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) + +template +static inline _CCCL_DEVICE void __cuda_atomic_store_memory_order_dispatch(_Fn &__cuda_store, int __memorder, _Sco) { + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_RELEASE: __cuda_store(__atomic_cuda_release{}); break; + case __ATOMIC_SEQ_CST: __cuda_atomic_fence(_Sco{}, __atomic_cuda_seq_cst{}); _CCCL_FALLTHROUGH(); + case __ATOMIC_RELAXED: __cuda_store(__atomic_cuda_relaxed{}); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); + case __ATOMIC_SEQ_CST: __cuda_atomic_membar(_Sco{}); _CCCL_FALLTHROUGH(); + case __ATOMIC_RELAXED: __cuda_store(__atomic_cuda_volatile{}); break; + default: assert(0); + } ) - return __tmp; + ) } -template -_CCCL_DEVICE _Type* __atomic_fetch_add_cuda(_Type *volatile *__ptr, ptrdiff_t __val, int __memorder, __thread_scope_block_tag) { - _Type* __ret; - uint64_t __tmp = 0; - memcpy(&__tmp, &__val, 8); - __tmp *= sizeof(_Type); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_block(__ptr, __tmp, __tmp); break; - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(&__ret, &__tmp, 8); - return __ret; + +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.cta.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.cluster.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.gpu.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm 
volatile("st.release.sys.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.cta.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.cluster.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.gpu.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.sys.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("st.mmio.relaxed.sys.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.cta.b32 [%0],%1;" :: "l"(__ptr), "r"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.cluster.b32 [%0],%1;" :: "l"(__ptr), "r"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.gpu.b32 [%0],%1;" :: "l"(__ptr), "r"(__val) : "memory"); } +template +static inline _CCCL_DEVICE 
void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.sys.b32 [%0],%1;" :: "l"(__ptr), "r"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.cta.b32 [%0],%1;" :: "l"(__ptr), "r"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.cluster.b32 [%0],%1;" :: "l"(__ptr), "r"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.gpu.b32 [%0],%1;" :: "l"(__ptr), "r"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.sys.b32 [%0],%1;" :: "l"(__ptr), "r"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("st.mmio.relaxed.sys.b32 [%0],%1;" :: "l"(__ptr), "r"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b32 [%0],%1;" :: "l"(__ptr), "r"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b32 [%0],%1;" :: "l"(__ptr), "r"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b32 [%0],%1;" :: "l"(__ptr), "r"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b32 [%0],%1;" :: "l"(__ptr), "r"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.cta.b64 [%0],%1;" :: "l"(__ptr), "l"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.cluster.b64 [%0],%1;" :: "l"(__ptr), "l"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b64, 
__thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.gpu.b64 [%0],%1;" :: "l"(__ptr), "l"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.sys.b64 [%0],%1;" :: "l"(__ptr), "l"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.cta.b64 [%0],%1;" :: "l"(__ptr), "l"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.cluster.b64 [%0],%1;" :: "l"(__ptr), "l"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.gpu.b64 [%0],%1;" :: "l"(__ptr), "l"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.sys.b64 [%0],%1;" :: "l"(__ptr), "l"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("st.mmio.relaxed.sys.b64 [%0],%1;" :: "l"(__ptr), "l"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b64 [%0],%1;" :: "l"(__ptr), "l"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b64 [%0],%1;" :: "l"(__ptr), "l"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b64 [%0],%1;" :: "l"(__ptr), "l"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b64 [%0],%1;" :: "l"(__ptr), "l"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _v; + mov.b128 {%1, %2}, _v; + st.release.cta.b128 [%0],_v; +)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_cluster_tag, 
__atomic_cuda_mmio_disable)
+{
+  asm volatile(R"YYY(
+    .reg .b128 _v;
+    mov.b128 _v, {%1, %2};
+    st.release.cluster.b128 [%0],_v;
+)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
+}
+template <class _Type>
+static inline _CCCL_DEVICE void __cuda_atomic_store(
+  _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_device_tag, __atomic_cuda_mmio_disable)
+{
+  asm volatile(R"YYY(
+    .reg .b128 _v;
+    mov.b128 _v, {%1, %2};
+    st.release.gpu.b128 [%0],_v;
+)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
+}
+template <class _Type>
+static inline _CCCL_DEVICE void __cuda_atomic_store(
+  _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_disable)
+{
+  asm volatile(R"YYY(
+    .reg .b128 _v;
+    mov.b128 _v, {%1, %2};
+    st.release.sys.b128 [%0],_v;
+)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
+}
+template <class _Type>
+static inline _CCCL_DEVICE void __cuda_atomic_store(
+  _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_block_tag, __atomic_cuda_mmio_disable)
+{
+  asm volatile(R"YYY(
+    .reg .b128 _v;
+    mov.b128 _v, {%1, %2};
+    st.relaxed.cta.b128 [%0],_v;
+)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
+}
+template <class _Type>
+static inline _CCCL_DEVICE void __cuda_atomic_store(
+  _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable)
+{
+  asm volatile(R"YYY(
+    .reg .b128 _v;
+    mov.b128 _v, {%1, %2};
+    st.relaxed.cluster.b128 [%0],_v;
+)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
+}
+template <class _Type>
+static inline _CCCL_DEVICE void __cuda_atomic_store(
+  _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_device_tag, __atomic_cuda_mmio_disable)
+{
+  asm volatile(R"YYY(
+    .reg .b128 _v;
+    mov.b128 _v, {%1, %2};
+    st.relaxed.gpu.b128 [%0],_v;
+)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
+}
+template <class _Type>
+static inline _CCCL_DEVICE void __cuda_atomic_store(
+  _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_disable)
+{
+  asm volatile(R"YYY(
+    .reg .b128 _v;
+    mov.b128 _v, {%1, %2};
+    st.relaxed.sys.b128 [%0],_v;
+)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
+}
+template <class _Type>
+static inline _CCCL_DEVICE void __cuda_atomic_store(
+  _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_enable)
+{
+  asm volatile(R"YYY(
+    .reg .b128 _v;
+    mov.b128 _v, {%1, %2};
+    st.mmio.relaxed.sys.b128 [%0],_v;
+)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
+}
+template <class _Type>
+static inline _CCCL_DEVICE void __cuda_atomic_store(
+  _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_block_tag, __atomic_cuda_mmio_disable)
+{
+  asm volatile(R"YYY(
+    .reg .b128 _v;
+    mov.b128 _v, {%1, %2};
+    st.volatile.b128 [%0],_v;
+)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
+}
+template <class _Type>
+static inline _CCCL_DEVICE void __cuda_atomic_store(
+  _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable)
+{
+  asm volatile(R"YYY(
+    .reg .b128 _v;
+    mov.b128 _v, {%1, %2};
+    st.volatile.b128 [%0],_v;
+)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
+}
+template <class _Type>
+static inline _CCCL_DEVICE void
__cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _v; + mov.b128 {%1, %2}, _v; + st.volatile.b128 [%0],_v; +)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _v; + mov.b128 {%1, %2}, _v; + st.volatile.b128 [%0],_v; +)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory"); } -template -_CCCL_DEVICE _Type* __atomic_fetch_sub_cuda(_Type *volatile *__ptr, ptrdiff_t __val, int __memorder, __thread_scope_block_tag) { - _Type* __ret; - uint64_t __tmp = 0; - memcpy(&__tmp, &__val, 8); - __tmp = -__tmp; - __tmp *= sizeof(_Type); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_block(__ptr, __tmp, __tmp); break; - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(&__ret, &__tmp, 8); - return __ret; + +template +struct __cuda_atomic_bind_store { + _Type* __ptr; + _Type* __val; + + template + inline _CCCL_DEVICE void operator()(_Atomic_Memorder) { + __cuda_atomic_store(__ptr, *__val, _Atomic_Memorder{}, _Tag{}, _Sco{}, _Mmio{}); + } +}; +template +static inline _CCCL_DEVICE void __atomic_store_cuda(_Type* __ptr, _Type& __val, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __val_proxy = reinterpret_cast<__proxy_t*>(&__val); + __cuda_atomic_bind_store<__proxy_t, __proxy_tag, _Sco, __atomic_cuda_mmio_disable> __bound_store{__ptr_proxy, __val_proxy}; + __cuda_atomic_store_memory_order_dispatch(__bound_store, __memorder, _Sco{}); +} +template +static inline _CCCL_DEVICE void __atomic_store_cuda(volatile _Type* __ptr, _Type& __val, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __val_proxy = reinterpret_cast<__proxy_t*>(&__val); + __cuda_atomic_bind_store<__proxy_t, __proxy_tag, _Sco, __atomic_cuda_mmio_disable> __bound_store{__ptr_proxy, __val_proxy}; + __cuda_atomic_store_memory_order_dispatch(__bound_store, __memorder, 
_Sco{}); } -template -_CCCL_DEVICE _Type* __atomic_fetch_add_cuda(_Type **__ptr, ptrdiff_t __val, int __memorder, __thread_scope_block_tag) { - _Type* __ret; - uint64_t __tmp = 0; - memcpy(&__tmp, &__val, 8); - __tmp *= sizeof(_Type); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_block(__ptr, __tmp, __tmp); break; - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) + +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_swap_memory_order_dispatch(_Fn& __cuda_cas, int __success_memorder, int __failure_memorder, _Sco) { + bool __res = false; + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { + case __ATOMIC_SEQ_CST: __cuda_atomic_fence(_Sco{}, __atomic_cuda_seq_cst{}); _CCCL_FALLTHROUGH(); + case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQUIRE: __res = __cuda_cas(__atomic_cuda_acquire{}); break; + case __ATOMIC_ACQ_REL: __res = __cuda_cas(__atomic_cuda_acq_rel{}); break; + case __ATOMIC_RELEASE: __res = __cuda_cas(__atomic_cuda_release{}); break; + case __ATOMIC_RELAXED: __res = __cuda_cas(__atomic_cuda_relaxed{}); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { + case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQ_REL: __cuda_atomic_membar(_Sco{}); _CCCL_FALLTHROUGH(); + case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQUIRE: __res = __cuda_cas(__atomic_cuda_volatile{}); __cuda_atomic_membar(_Sco{}); break; + case __ATOMIC_RELEASE: __cuda_atomic_membar(_Sco{}); __res = __cuda_cas(__atomic_cuda_volatile{}); break; + case __ATOMIC_RELAXED: __res = __cuda_cas(__atomic_cuda_volatile{}); break; + default: assert(0); + } ) - memcpy(&__ret, &__tmp, 8); - return __ret; + ) + return __res; } -template -_CCCL_DEVICE _Type* __atomic_fetch_sub_cuda(_Type **__ptr, ptrdiff_t __val, int __memorder, __thread_scope_block_tag) { - _Type* __ret; - uint64_t __tmp = 0; - memcpy(&__tmp, &__val, 8); - __tmp = -__tmp; - __tmp *= sizeof(_Type); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_block(__ptr, __tmp, 
__tmp); break; - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(&__ret, &__tmp, 8); - return __ret; + +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_block_tag) +{ asm volatile("atom.cas.acquire.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.acquire.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_device_tag) +{ asm volatile("atom.cas.acquire.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_system_tag) +{ asm volatile("atom.cas.acquire.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_block_tag) +{ asm volatile("atom.cas.relaxed.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.relaxed.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_device_tag) +{ asm volatile("atom.cas.relaxed.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag) +{ asm volatile("atom.cas.relaxed.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& 
__dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_block_tag) +{ asm volatile("atom.cas.release.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.release.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_device_tag) +{ asm volatile("atom.cas.release.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_system_tag) +{ asm volatile("atom.cas.release.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_block_tag) +{ asm volatile("atom.cas.acq_rel.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.acq_rel.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_device_tag) +{ asm volatile("atom.cas.acq_rel.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_system_tag) +{ asm volatile("atom.cas.acq_rel.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_block_tag) +{ asm volatile("atom.cas.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type 
__op, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_device_tag) +{ asm volatile("atom.cas.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_system_tag) +{ asm volatile("atom.cas.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.cas.acquire.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.acquire.cluster.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.cas.acquire.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.cas.acquire.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.cas.relaxed.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.relaxed.cluster.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.cas.relaxed.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.cas.relaxed.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, 
__atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.cas.release.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.release.cluster.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.cas.release.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.cas.release.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.cas.acq_rel.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.acq_rel.cluster.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.cas.acq_rel.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.cas.acq_rel.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.cas.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.cluster.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, 
__atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.cas.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.cas.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.cas.acquire.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.acquire.cluster.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.cas.acquire.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.cas.acquire.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.cas.relaxed.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.relaxed.cluster.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.cas.relaxed.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.cas.relaxed.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, 
__thread_scope_block_tag) +{ asm volatile("atom.cas.release.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.release.cluster.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.cas.release.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.cas.release.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.cas.acq_rel.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.acq_rel.cluster.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.cas.acq_rel.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.cas.acq_rel.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.cas.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.cluster.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, 
__thread_scope_device_tag) +{ asm volatile("atom.cas.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.cas.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_block_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.acquire.cta.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_cluster_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.acquire.cluster.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_device_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.acquire.gpu.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_system_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.acquire.sys.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_block_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.relaxed.cta.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_cluster_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg 
.b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.relaxed.cluster.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_device_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.relaxed.gpu.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_system_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.relaxed.sys.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_block_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.release.cta.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_cluster_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.release.cluster.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_device_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.release.gpu.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_system_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.release.sys.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), 
"l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_block_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.acq_rel.cta.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_cluster_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.acq_rel.cluster.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_device_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.acq_rel.gpu.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_system_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.acq_rel.sys.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_block_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.cta.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_cluster_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.cluster.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, 
_Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_device_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.gpu.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_system_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.sys.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } + +template +struct __cuda_atomic_bind_compare_exchange { + _Type* __ptr; + _Type* __exp; + _Type* __des; + + template + inline _CCCL_DEVICE bool operator()(_Atomic_Memorder) { + return __cuda_atomic_compare_exchange(__ptr, *__exp, *__exp, *__des, _Atomic_Memorder{}, _Tag{}, _Sco{}); + } +}; +template +static inline _CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type* __ptr, _Type* __exp, _Type __des, bool, int __success_memorder, int __failure_memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __exp_proxy = reinterpret_cast<__proxy_t*>(__exp); + __proxy_t* __des_proxy = reinterpret_cast<__proxy_t*>(&__des); + __cuda_atomic_bind_compare_exchange<__proxy_t, __proxy_tag, _Sco> __bound_compare_swap{__ptr_proxy, __exp_proxy, __des_proxy}; + return __cuda_atomic_compare_swap_memory_order_dispatch(__bound_compare_swap, __success_memorder, __failure_memorder, _Sco{}); +} +template +static inline _CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type volatile* __ptr, _Type* __exp, _Type __des, bool, int __success_memorder, int __failure_memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __exp_proxy = reinterpret_cast<__proxy_t*>(__exp); + __proxy_t* __des_proxy = reinterpret_cast<__proxy_t*>(&__des); + __cuda_atomic_bind_compare_exchange<__proxy_t, __proxy_tag, _Sco> __bound_compare_swap{__ptr_proxy, __exp_proxy, __des_proxy}; + return __cuda_atomic_compare_swap_memory_order_dispatch(__bound_compare_swap, __success_memorder, __failure_memorder, _Sco{}); } -static inline _CCCL_DEVICE void __cuda_membar_device() { asm volatile("membar.gl;":::"memory"); } -static inline _CCCL_DEVICE void __cuda_fence_acq_rel_device() { asm volatile("fence.acq_rel.gpu;":::"memory"); } -static inline _CCCL_DEVICE void __cuda_fence_sc_device() { asm volatile("fence.sc.gpu;":::"memory"); } -static inline _CCCL_DEVICE void __atomic_thread_fence_cuda(int __memorder, __thread_scope_device_tag) { + +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange_memory_order_dispatch(_Fn& __cuda_exch, int __memorder, _Sco) { NV_DISPATCH_TARGET( NV_PROVIDES_SM_70, ( switch (__memorder) { - case __ATOMIC_SEQ_CST: 
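For reference, the `.b128` compare-exchange overloads added above all reduce to the same shape: pack the expected and desired values into 128-bit registers, issue `atom.cas....b128`, unpack the value observed at the address, and report success by comparing the observed halves against the expected ones; the `__cuda_atomic_bind_compare_exchange` functor that follows merely captures `__ptr`/`__exp`/`__des` so that a single runtime switch on the memory order can invoke the right statically-tagged overload. The snippet below is an illustrative, self-contained sketch of that pattern, not the patch's literal code: `u128_t` and `cas128_relaxed_device` are made-up names, the two-halves layout is an assumption, and 128-bit `atom.cas` requires 16-byte-aligned storage plus a GPU and PTX toolchain recent enough to support `.b128` atomics.

// Illustrative sketch only; not part of the patch. Hypothetical names throughout.
struct alignas(16) u128_t { unsigned long long __x, __y; };

__device__ bool cas128_relaxed_device(u128_t* __ptr, u128_t& __expected, u128_t __desired)
{
  u128_t __old;
  asm volatile(
    "{\n\t"
    ".reg .b128 _d;\n\t"
    ".reg .b128 _v;\n\t"
    "mov.b128 _d, {%3, %4};\n\t"                    // pack the expected value
    "mov.b128 _v, {%5, %6};\n\t"                    // pack the desired value
    "atom.cas.relaxed.gpu.b128 _d, [%2], _d, _v;\n\t"
    "mov.b128 {%0, %1}, _d;\n\t"                    // unpack the value observed at *__ptr
    "}"
    : "=l"(__old.__x), "=l"(__old.__y)
    : "l"(__ptr), "l"(__expected.__x), "l"(__expected.__y),
      "l"(__desired.__x), "l"(__desired.__y)
    : "memory");
  bool __ok = __old.__x == __expected.__x && __old.__y == __expected.__y;
  __expected = __old;                               // report the observed value, as compare_exchange does
  return __ok;
}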
__cuda_fence_sc_device(); break; + case __ATOMIC_SEQ_CST: __cuda_atomic_fence(_Sco{}, __atomic_cuda_seq_cst{}); _CCCL_FALLTHROUGH(); case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: _CCCL_FALLTHROUGH(); - case __ATOMIC_RELEASE: __cuda_fence_acq_rel_device(); break; - case __ATOMIC_RELAXED: break; + case __ATOMIC_ACQUIRE: __cuda_exch(__atomic_cuda_acquire{}); break; + case __ATOMIC_ACQ_REL: __cuda_exch(__atomic_cuda_acq_rel{}); break; + case __ATOMIC_RELEASE: __cuda_exch(__atomic_cuda_release{}); break; + case __ATOMIC_RELAXED: __cuda_exch(__atomic_cuda_relaxed{}); break; default: assert(0); } ), NV_IS_DEVICE, ( switch (__memorder) { case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQ_REL: __cuda_atomic_membar(_Sco{}); _CCCL_FALLTHROUGH(); case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: _CCCL_FALLTHROUGH(); - case __ATOMIC_RELEASE: __cuda_membar_device(); break; - case __ATOMIC_RELAXED: break; + case __ATOMIC_ACQUIRE: __cuda_exch(__atomic_cuda_volatile{}); __cuda_atomic_membar(_Sco{}); break; + case __ATOMIC_RELEASE: __cuda_atomic_membar(_Sco{}); __cuda_exch(__atomic_cuda_volatile{}); break; + case __ATOMIC_RELAXED: __cuda_exch(__atomic_cuda_volatile{}); break; default: assert(0); } ) ) } -template static inline _CCCL_DEVICE void __cuda_load_acquire_32_device(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.acquire.gpu.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_load_relaxed_32_device(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.relaxed.gpu.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_load_volatile_32_device(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.volatile.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_load_cuda(const volatile _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_device_tag) { - uint32_t __tmp = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_acquire_32_device(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_32_device(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_volatile_32_device(__ptr, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_32_device(__ptr, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template = 0> -_CCCL_DEVICE void __atomic_load_cuda(const _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_device_tag) { - uint32_t __tmp = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_acquire_32_device(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_32_device(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case 
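The hunk above also replaces the old per-scope fence helpers (`__cuda_membar_device`, `__cuda_fence_acq_rel_device`, `__cuda_fence_sc_device`) with a single `__cuda_atomic_exchange_memory_order_dispatch`: the runtime `__ATOMIC_*` constant is mapped once onto a tag type, the tagged overload emits the matching `.relaxed`/`.acquire`/`.release`/`.acq_rel` instruction on sm_70+, and the pre-sm_70 branch keeps the old behaviour of a volatile access bracketed by `membar`. A reduced model of that tag-dispatch pattern, with hypothetical names and a hypothetical 0-3 order encoding (sm_70+ only), looks like this:

// Reduced model of the tag-dispatch pattern; not the library's actual types.
struct relaxed_t {}; struct acquire_t {}; struct release_t {}; struct acq_rel_t {};

struct bound_exchange {               // plays the role of the __cuda_atomic_bind_* functors
  unsigned long long* ptr;
  unsigned long long  val;
  unsigned long long  old;

  __device__ void operator()(relaxed_t) {
    asm volatile("atom.exch.relaxed.gpu.b64 %0,[%1],%2;" : "=l"(old) : "l"(ptr), "l"(val) : "memory");
  }
  __device__ void operator()(acquire_t) {
    asm volatile("atom.exch.acquire.gpu.b64 %0,[%1],%2;" : "=l"(old) : "l"(ptr), "l"(val) : "memory");
  }
  __device__ void operator()(release_t) {
    asm volatile("atom.exch.release.gpu.b64 %0,[%1],%2;" : "=l"(old) : "l"(ptr), "l"(val) : "memory");
  }
  __device__ void operator()(acq_rel_t) {
    asm volatile("atom.exch.acq_rel.gpu.b64 %0,[%1],%2;" : "=l"(old) : "l"(ptr), "l"(val) : "memory");
  }
};

// The runtime order (0=relaxed, 1=acquire, 2=release, 3=acq_rel) picks the tag once,
// instead of every generated operation carrying its own fence-laden switch.
template <class Fn>
__device__ void dispatch_order(Fn& fn, int order) {
  switch (order) {
    case 1: fn(acquire_t{}); break;
    case 2: fn(release_t{}); break;
    case 3: fn(acq_rel_t{}); break;
    default: fn(relaxed_t{}); break;
  }
}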
__ATOMIC_ACQUIRE: __cuda_load_volatile_32_device(__ptr, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_32_device(__ptr, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template static inline _CCCL_DEVICE void __cuda_load_acquire_64_device(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.acquire.gpu.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_load_relaxed_64_device(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.relaxed.gpu.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_load_volatile_64_device(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.volatile.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_load_cuda(const volatile _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_device_tag) { - uint64_t __tmp = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_acquire_64_device(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_64_device(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_volatile_64_device(__ptr, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_64_device(__ptr, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template = 0> -_CCCL_DEVICE void __atomic_load_cuda(const _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_device_tag) { - uint64_t __tmp = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_acquire_64_device(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_64_device(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_volatile_64_device(__ptr, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_64_device(__ptr, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template static inline _CCCL_DEVICE void __cuda_store_relaxed_32_device(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.relaxed.gpu.b32 [%0], %1;" :: "l"(__ptr),"r"(__src) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_store_release_32_device(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.release.gpu.b32 [%0], %1;" :: "l"(__ptr),"r"(__src) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_store_volatile_32_device(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.volatile.b32 [%0], %1;" :: "l"(__ptr),"r"(__src) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_store_cuda(volatile _Type *__ptr, _Type *__val, int __memorder, __thread_scope_device_tag) { - uint32_t __tmp = 0; - memcpy(&__tmp, __val, 4); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_RELEASE: __cuda_store_release_32_device(__ptr, 
__tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_32_device(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); - case __ATOMIC_SEQ_CST: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_volatile_32_device(__ptr, __tmp); break; - default: assert(0); - } - ) - ) -} -template = 0> -_CCCL_DEVICE void __atomic_store_cuda(_Type *__ptr, _Type *__val, int __memorder, __thread_scope_device_tag) { - uint32_t __tmp = 0; - memcpy(&__tmp, __val, 4); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_RELEASE: __cuda_store_release_32_device(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_32_device(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); - case __ATOMIC_SEQ_CST: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_volatile_32_device(__ptr, __tmp); break; - default: assert(0); - } - ) - ) -} -template static inline _CCCL_DEVICE void __cuda_store_relaxed_64_device(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.relaxed.gpu.b64 [%0], %1;" :: "l"(__ptr),"l"(__src) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_store_release_64_device(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.release.gpu.b64 [%0], %1;" :: "l"(__ptr),"l"(__src) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_store_volatile_64_device(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.volatile.b64 [%0], %1;" :: "l"(__ptr),"l"(__src) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_store_cuda(volatile _Type *__ptr, _Type *__val, int __memorder, __thread_scope_device_tag) { - uint64_t __tmp = 0; - memcpy(&__tmp, __val, 8); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_RELEASE: __cuda_store_release_64_device(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_64_device(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); - case __ATOMIC_SEQ_CST: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_volatile_64_device(__ptr, __tmp); break; - default: assert(0); - } - ) - ) -} -template = 0> -_CCCL_DEVICE void __atomic_store_cuda(_Type *__ptr, _Type *__val, int __memorder, __thread_scope_device_tag) { - uint64_t __tmp = 0; - memcpy(&__tmp, __val, 8); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_RELEASE: __cuda_store_release_64_device(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_64_device(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); - case __ATOMIC_SEQ_CST: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_volatile_64_device(__ptr, __tmp); break; - default: assert(0); - } - ) - ) -} -template static inline _CCCL_DEVICE void __cuda_fetch_and_acq_rel_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.acq_rel.gpu.b32 
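The deleted load/store helpers above show the pre-sm_70 fallback that the new dispatcher still has to provide on its `NV_IS_DEVICE` branch: without `.acquire`/`.release` qualifiers, the ordering is approximated by a volatile access bracketed with a device-scope `membar.gl`, and the value is shuttled through a `uint32_t`/`uint64_t` temporary via `memcpy` so the PTX only ever sees a plain integer register. A standalone sketch of that fallback (hypothetical function names):

// Sketch of the legacy pre-sm_70 fallback; not the patch's literal code.
__device__ unsigned legacy_load_acquire_device(const volatile unsigned* p)
{
  unsigned v;
  asm volatile("ld.volatile.b32 %0,[%1];" : "=r"(v) : "l"(p) : "memory");
  asm volatile("membar.gl;" ::: "memory");   // fence *after* the load approximates acquire
  return v;
}

__device__ void legacy_store_release_device(volatile unsigned* p, unsigned v)
{
  asm volatile("membar.gl;" ::: "memory");   // fence *before* the store approximates release
  asm volatile("st.volatile.b32 [%0],%1;" :: "l"(p), "r"(v) : "memory");
}

As in the deleted switches above, a seq_cst access additionally issues the fence before the operation.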
%0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_acquire_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.acquire.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_relaxed_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.relaxed.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_release_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.release.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_volatile_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_and_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_and_volatile_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_and_volatile_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_32_device(__ptr, __tmp, __tmp); break; - default: assert(0); 
- } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_or_acq_rel_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.acq_rel.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_acquire_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.acquire.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_relaxed_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.relaxed.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_release_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.release.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_volatile_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_or_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_or_volatile_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); 
__cuda_fetch_or_volatile_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_xor_acq_rel_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.acq_rel.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_acquire_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.acquire.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_relaxed_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.relaxed.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_release_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.release.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_volatile_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_xor_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_xor_volatile_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case 
__ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_xor_volatile_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acq_rel_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.gpu.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acquire_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.gpu.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_relaxed_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.gpu.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.gpu.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.gpu.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_f32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_f32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_f32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_f32_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_f32_device(__ptr, *__expected, __old, __desired); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_compare_exchange_volatile_f32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_f32_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(void *__ptr, _Type *__expected, const _Type __desired, bool, int 
__success_memorder, int __failure_memorder, __thread_scope_device_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_f32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_f32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_f32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_f32_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_f32_device(__ptr, *__expected, __old, __desired); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_compare_exchange_volatile_f32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_f32_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acq_rel_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acquire_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_relaxed_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_u32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_u32_device(__ptr, *__expected, __old, 
__desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_u32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_u32_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_u32_device(__ptr, *__expected, __old, __desired); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_compare_exchange_volatile_u32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_u32_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_u32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_u32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_u32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_u32_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_u32_device(__ptr, *__expected, __old, __desired); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_compare_exchange_volatile_u32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_u32_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template static inline _CCCL_DEVICE void __cuda_exchange_acq_rel_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.gpu.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_acquire_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.gpu.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_relaxed_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.gpu.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_release_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.release.gpu.b32 %0,[%1],%2;" : "=f"(__dst) : 
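The f32/u32 compare-exchange helpers being removed here are the primitive the rest of the backend leans on: any read-modify-write that has no dedicated PTX instruction is synthesized as a CAS loop. A usage-level sketch of that idiom (not from the patch), written against libcu++'s public `cuda::atomic_ref`; `atomic_fetch_multiply` is a hypothetical helper name:

// Hypothetical example of building an RMW operation from compare-exchange.
#include <cuda/atomic>

__device__ float atomic_fetch_multiply(float* addr, float factor)
{
  cuda::atomic_ref<float, cuda::thread_scope_device> ref(*addr);
  float old = ref.load(cuda::std::memory_order_relaxed);
  // On failure, compare_exchange writes the observed value back into `old`,
  // so the loop retries with a fresh snapshot until the CAS succeeds.
  while (!ref.compare_exchange_weak(old, old * factor, cuda::std::memory_order_relaxed))
  {
  }
  return old;
}

With the `.b128` paths added earlier in this hunk, the same loop shape extends to 16-byte trivially copyable types on hardware that supports 128-bit atomics.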
"l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_volatile_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.gpu.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(volatile void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_device_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_f32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_f32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_exchange_volatile_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_f32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_device_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_f32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_f32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_exchange_volatile_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_f32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template static inline _CCCL_DEVICE void __cuda_exchange_acq_rel_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_acquire_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_relaxed_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.gpu.b32 
%0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_release_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.release.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_volatile_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(volatile void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_device_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_u32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_exchange_volatile_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_device_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_u32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_exchange_volatile_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template static inline _CCCL_DEVICE void __cuda_fetch_add_acq_rel_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_acquire_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm 
volatile("atom.add.acquire.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_relaxed_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.relaxed.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_release_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.release.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_volatile_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_f32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_f32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_f32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_f32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_f32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_f32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_add_acq_rel_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm 
volatile("atom.add.acq_rel.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_acquire_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acquire.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_relaxed_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.relaxed.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_release_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.release.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_volatile_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: 
__cuda_fetch_add_volatile_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_max_acq_rel_s32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acq_rel.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_acquire_s32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acquire.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_relaxed_s32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.relaxed.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_release_s32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.release.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_volatile_s32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_s32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_s32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_max_volatile_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_s32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_s32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: 
__cuda_fetch_max_volatile_s32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_max_volatile_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_s32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_max_acq_rel_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acq_rel.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_acquire_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acquire.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_relaxed_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.relaxed.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_release_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.release.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_volatile_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_u32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_max_volatile_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, 
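The max/min helpers being deleted here come in `.s32` and `.u32` flavours because PTX `atom.max`/`atom.min` are typed, so the generated code has to pick the signed or unsigned variant from the operand type. A minimal sketch of that split (hypothetical names, relaxed ordering, device scope, sm_70+):

// Sketch only: signed vs. unsigned atom.max selected by overload.
__device__ int atomic_max_relaxed_device(int* p, int v)
{
  int old;
  asm volatile("atom.max.relaxed.gpu.s32 %0,[%1],%2;" : "=r"(old) : "l"(p), "r"(v) : "memory");
  return old;
}

__device__ unsigned atomic_max_relaxed_device(unsigned* p, unsigned v)
{
  unsigned old;
  asm volatile("atom.max.relaxed.gpu.u32 %0,[%1],%2;" : "=r"(old) : "l"(p), "r"(v) : "memory");
  return old;
}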
( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_u32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_max_volatile_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_min_acq_rel_s32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acq_rel.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_acquire_s32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acquire.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_relaxed_s32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.relaxed.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_release_s32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.release.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_volatile_s32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_s32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_s32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_min_volatile_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_s32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_s32_device(__ptr, __tmp, __tmp); break; - case 
__ATOMIC_RELEASE: __cuda_fetch_min_release_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_s32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_s32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_min_volatile_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_s32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_min_acq_rel_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acq_rel.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_acquire_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acquire.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_relaxed_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.relaxed.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_release_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.release.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_volatile_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_u32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_min_volatile_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case 
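The s32/u32 min and max wrappers above are, as far as I can tell, the device-scope backend behind the fetch_min/fetch_max extensions that libcu++ exposes on cuda::atomic and cuda::atomic_ref. A usage-level sketch, not part of the patch; the kernel and variable names are made up:

    #include <cuda/atomic>

    __global__ void track_minimum(int* slot, const int* values, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        cuda::atomic_ref<int, cuda::thread_scope_device> lowest(*slot);
        // Relaxed is enough for a pure min-reduction; stronger orders are only
        // needed when the result also publishes other memory.
        lowest.fetch_min(values[i], cuda::std::memory_order_relaxed);
      }
    }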
__ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_u32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_min_volatile_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acq_rel_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acq_rel.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acquire_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acquire.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_relaxed_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.relaxed.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_release_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.release.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_volatile_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_f32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_f32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_sub_volatile_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_f32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> 
-_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_f32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_f32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_sub_volatile_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_f32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acq_rel_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acq_rel.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acquire_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acquire.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_relaxed_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.relaxed.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_release_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.release.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_volatile_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_u32_device(__ptr, __tmp, __tmp); 
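The __cuda_fetch_sub_* wrappers in this hunk have no dedicated PTX instruction to map to: they negate the operand and issue atom.add instead. For unsigned integers that is exact because negation is modular; for f32/f64 it is plain IEEE negation. A minimal standalone sketch of the same trick, with a hypothetical name and assuming an SM70+ target for the .relaxed qualifier:

    __device__ unsigned my_fetch_sub_relaxed_device(unsigned* p, unsigned v) {
      unsigned old;
      unsigned neg = 0u - v;  // modular negation: adding -v (mod 2^32) subtracts v
      asm volatile("atom.add.relaxed.gpu.u32 %0,[%1],%2;"
                   : "=r"(old) : "l"(p), "r"(neg) : "memory");
      return old;  // value the location held before the subtraction
    }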
__cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_sub_volatile_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_u32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_sub_volatile_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_and_acq_rel_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.acq_rel.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_acquire_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.acquire.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_relaxed_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.relaxed.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_release_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.release.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_volatile_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_and_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: 
_CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_and_volatile_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_and_volatile_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_or_acq_rel_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.acq_rel.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_acquire_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.acquire.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_relaxed_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.relaxed.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_release_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.release.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_volatile_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_or_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_64_device(__ptr, __tmp, __tmp); break; - case 
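A caller-level sketch for the 64-bit and/or wrappers, not part of the patch: atomically clearing a bit in a device-visible resource mask through libcu++'s public cuda::atomic_ref; names are illustrative.

    #include <cuda/atomic>

    __device__ void release_resource(unsigned long long* mask, int bit) {
      cuda::atomic_ref<unsigned long long, cuda::thread_scope_device> m(*mask);
      // Release ordering so work done while holding the resource is visible to
      // whichever thread later observes the cleared bit.
      m.fetch_and(~(1ull << bit), cuda::std::memory_order_release);
    }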
__ATOMIC_RELAXED: __cuda_fetch_or_relaxed_64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_or_volatile_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_or_volatile_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_xor_acq_rel_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.acq_rel.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_acquire_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.acquire.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_relaxed_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.relaxed.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_release_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.release.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_volatile_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_xor_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_64_device(__ptr, __tmp, __tmp); break; - 
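The matching publish side, again only as an illustration of the sort of caller the b64 or wrappers serve: set a completion bit with fetch_or(release) and read it back with an acquire load.

    #include <cuda/atomic>

    __device__ void mark_done(unsigned long long* flags, int slot) {
      cuda::atomic_ref<unsigned long long, cuda::thread_scope_device> f(*flags);
      f.fetch_or(1ull << slot, cuda::std::memory_order_release);
    }

    __device__ bool is_done(unsigned long long* flags, int slot) {
      cuda::atomic_ref<unsigned long long, cuda::thread_scope_device> f(*flags);
      return (f.load(cuda::std::memory_order_acquire) >> slot) & 1ull;
    }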
case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_xor_volatile_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_xor_volatile_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acq_rel_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.gpu.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acquire_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.gpu.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_relaxed_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.gpu.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.gpu.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.gpu.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE bool 
__atomic_compare_exchange_cuda(volatile void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_f64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_f64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_f64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_f64_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_f64_device(__ptr, *__expected, __old, __desired); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_compare_exchange_volatile_f64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_f64_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_f64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_f64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_f64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_f64_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_f64_device(__ptr, *__expected, __old, __desired); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_compare_exchange_volatile_f64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_f64_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acq_rel_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : 
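The f64 compare-exchange wrappers above reuse atom.cas.b64 with double register constraints, since CAS only cares about the bit pattern. At the caller level that enables read-modify-write loops for operations with no native atomic; a hedged sketch against libcu++'s public <cuda/atomic> interface (the function name is made up):

    #include <cuda/atomic>

    __device__ double atomic_scale(double* p, double factor) {
      cuda::atomic_ref<double, cuda::thread_scope_device> a(*p);
      double expected = a.load(cuda::std::memory_order_relaxed);
      // On failure compare_exchange_weak refreshes `expected` with the current
      // value, so the loop simply retries against up-to-date data.
      while (!a.compare_exchange_weak(expected, expected * factor,
                                      cuda::std::memory_order_relaxed)) {
      }
      return expected;  // the value that was actually replaced
    }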
"l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acquire_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_relaxed_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_u64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_u64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_u64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_u64_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_u64_device(__ptr, *__expected, __old, __desired); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_compare_exchange_volatile_u64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_u64_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_u64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_u64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_u64_device(__ptr, 
*__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_u64_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_u64_device(__ptr, *__expected, __old, __desired); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_compare_exchange_volatile_u64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_u64_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template static inline _CCCL_DEVICE void __cuda_exchange_acq_rel_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.gpu.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_acquire_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.gpu.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_relaxed_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.gpu.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_release_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.release.gpu.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_volatile_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.gpu.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(volatile void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_device_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_f64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_f64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_exchange_volatile_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_f64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_device_tag) { - _Type __tmp = *__val; - 
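exchange() is the simplest of these read-modify-writes: swap in a new 64-bit value and get the old one back in a single atomic step. A usage sketch, not part of the patch and with invented names, for claiming a shared slot:

    #include <cuda/atomic>

    __device__ unsigned long long claim_slot(unsigned long long* slot,
                                             unsigned long long mine) {
      cuda::atomic_ref<unsigned long long, cuda::thread_scope_device> s(*slot);
      // acq_rel: publish `mine` to later readers and observe what the previous
      // owner left behind.
      return s.exchange(mine, cuda::std::memory_order_acq_rel);
    }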
NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_f64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_f64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_exchange_volatile_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_f64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template static inline _CCCL_DEVICE void __cuda_exchange_acq_rel_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_acquire_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_relaxed_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_release_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.release.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_volatile_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(volatile void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_device_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_exchange_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_u64_device(__ptr, __tmp, 
__tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_device_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_exchange_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template static inline _CCCL_DEVICE void __cuda_fetch_add_acq_rel_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_acquire_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acquire.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_relaxed_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.relaxed.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_release_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.release.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_volatile_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_f64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: 
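The f64 add wrappers above are what a device-scope floating-point fetch_add lowers to. A typical caller, sketched against the public <cuda/atomic> interface (kernel name and indexing are assumptions):

    #include <cuda/atomic>

    __global__ void accumulate(double* total, const double* partials, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        cuda::atomic_ref<double, cuda::thread_scope_device> t(*total);
        // Relaxed ordering: only the final numeric value is communicated.
        t.fetch_add(partials[i], cuda::std::memory_order_relaxed);
      }
    }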
__cuda_fetch_add_volatile_f64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_f64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_f64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_f64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_f64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_add_acq_rel_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_acquire_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acquire.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_relaxed_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.relaxed.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_release_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.release.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_volatile_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, 
( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_max_acq_rel_s64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acq_rel.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_acquire_s64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acquire.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_relaxed_s64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.relaxed.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_release_s64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.release.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_volatile_s64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_s64_device(__ptr, __tmp, __tmp); break; - case 
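For the u64 add wrappers, the canonical use is a monotonically increasing counter; relaxed ordering suffices because the ticket value itself is the only thing being communicated. Illustrative only, names invented:

    #include <cuda/atomic>

    __device__ unsigned long long next_ticket(unsigned long long* counter) {
      cuda::atomic_ref<unsigned long long, cuda::thread_scope_device> c(*counter);
      return c.fetch_add(1, cuda::std::memory_order_relaxed);
    }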
__ATOMIC_RELEASE: __cuda_fetch_max_release_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_s64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_s64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_max_volatile_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_s64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_s64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_s64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_max_volatile_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_s64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_max_acq_rel_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acq_rel.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_acquire_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acquire.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_relaxed_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.relaxed.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_release_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.release.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_volatile_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case 
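Likewise, the 64-bit max wrappers back the fetch_max extension; a small usage sketch (hypothetical names) for recording a high-water mark such as the largest queue depth any thread has observed:

    #include <cuda/atomic>

    __device__ void record_high_water(long long* high, long long observed) {
      cuda::atomic_ref<long long, cuda::thread_scope_device> h(*high);
      h.fetch_max(observed, cuda::std::memory_order_relaxed);
    }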
__ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_max_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_max_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_min_acq_rel_s64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acq_rel.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_acquire_s64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acquire.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_relaxed_s64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.relaxed.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_release_s64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.release.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_volatile_s64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int 
__memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_s64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_s64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_min_volatile_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_s64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_s64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_s64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_min_volatile_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_s64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_min_acq_rel_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acq_rel.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_acquire_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acquire.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_relaxed_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.relaxed.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_release_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.release.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_volatile_u64_device(_CUDA_A __ptr, _CUDA_B& 
__dst, _CUDA_C __op) { asm volatile("atom.min.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_min_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_min_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acq_rel_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acq_rel.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acquire_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acquire.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_relaxed_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.relaxed.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void 
__cuda_fetch_sub_release_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.release.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_volatile_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_f64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_f64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_sub_volatile_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_f64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_f64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_f64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_sub_volatile_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_f64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acq_rel_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acq_rel.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acquire_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acquire.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); 
} -template static inline _CCCL_DEVICE void __cuda_fetch_sub_relaxed_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.relaxed.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_release_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.release.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_volatile_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_sub_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_sub_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template -_CCCL_DEVICE _Type* __atomic_fetch_add_cuda(_Type *volatile *__ptr, ptrdiff_t __val, int __memorder, __thread_scope_device_tag) { - _Type* __ret; - uint64_t __tmp = 0; - memcpy(&__tmp, 
&__val, 8); - __tmp *= sizeof(_Type); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_device(__ptr, __tmp, __tmp); break; - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(&__ret, &__tmp, 8); - return __ret; -} -template -_CCCL_DEVICE _Type* __atomic_fetch_sub_cuda(_Type *volatile *__ptr, ptrdiff_t __val, int __memorder, __thread_scope_device_tag) { - _Type* __ret; - uint64_t __tmp = 0; - memcpy(&__tmp, &__val, 8); - __tmp = -__tmp; - __tmp *= sizeof(_Type); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_device(__ptr, __tmp, __tmp); break; - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(&__ret, &__tmp, 8); - return __ret; -} -template -_CCCL_DEVICE _Type* __atomic_fetch_add_cuda(_Type **__ptr, ptrdiff_t __val, int __memorder, __thread_scope_device_tag) { - _Type* __ret; - uint64_t __tmp = 0; - memcpy(&__tmp, &__val, 8); - __tmp *= sizeof(_Type); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_device(__ptr, __tmp, __tmp); break; - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); 
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(&__ret, &__tmp, 8); - return __ret; + +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_block_tag) +{ asm volatile("atom.exch.acquire.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.acquire.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_device_tag) +{ asm volatile("atom.exch.acquire.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_system_tag) +{ asm volatile("atom.exch.acquire.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_block_tag) +{ asm volatile("atom.exch.relaxed.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.relaxed.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_device_tag) +{ asm volatile("atom.exch.relaxed.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag) +{ asm volatile("atom.exch.relaxed.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_block_tag) +{ asm volatile("atom.exch.release.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.release.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, 
__atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_device_tag) +{ asm volatile("atom.exch.release.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_system_tag) +{ asm volatile("atom.exch.release.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_block_tag) +{ asm volatile("atom.exch.acq_rel.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.acq_rel.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_device_tag) +{ asm volatile("atom.exch.acq_rel.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_system_tag) +{ asm volatile("atom.exch.acq_rel.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_block_tag) +{ asm volatile("atom.exch.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_device_tag) +{ asm volatile("atom.exch.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_system_tag) +{ asm volatile("atom.exch.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.exch.acquire.cta.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.acquire.cluster.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, 
__atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.exch.acquire.gpu.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.exch.acquire.sys.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.exch.relaxed.cta.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.relaxed.cluster.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.exch.relaxed.gpu.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.exch.relaxed.sys.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.exch.release.cta.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.release.cluster.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.exch.release.gpu.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.exch.release.sys.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.exch.acq_rel.cta.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.acq_rel.cluster.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, 
_Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.exch.acq_rel.gpu.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.exch.acq_rel.sys.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.exch.cta.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.cluster.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.exch.gpu.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.exch.sys.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.exch.acquire.cta.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.acquire.cluster.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.exch.acquire.gpu.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.exch.acquire.sys.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.exch.relaxed.cta.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.relaxed.cluster.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& 
__old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.exch.relaxed.gpu.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.exch.relaxed.sys.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.exch.release.cta.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.release.cluster.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.exch.release.gpu.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.exch.release.sys.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.exch.acq_rel.cta.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.acq_rel.cluster.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.exch.acq_rel.gpu.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.exch.acq_rel.sys.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.exch.cta.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.cluster.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, 
_Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.exch.gpu.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.exch.sys.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_block_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.acquire.cta.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_cluster_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.acquire.cluster.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_device_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.acquire.gpu.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_system_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.acquire.sys.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_block_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.relaxed.cta.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_cluster_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.relaxed.cluster.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_device_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.relaxed.gpu.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), 
"l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_system_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.relaxed.sys.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_block_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.release.cta.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_cluster_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.release.cluster.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_device_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.release.gpu.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_system_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.release.sys.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_block_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.acq_rel.cta.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_cluster_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.acq_rel.cluster.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_device_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.acq_rel.gpu.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : 
"memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_system_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.acq_rel.sys.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_block_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.cta.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_cluster_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.cluster.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_device_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.gpu.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_system_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.sys.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); } -template -_CCCL_DEVICE _Type* __atomic_fetch_sub_cuda(_Type **__ptr, ptrdiff_t __val, int __memorder, __thread_scope_device_tag) { - _Type* __ret; - uint64_t __tmp = 0; - memcpy(&__tmp, &__val, 8); - __tmp = -__tmp; - __tmp *= sizeof(_Type); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_device(__ptr, __tmp, __tmp); break; - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - 
   memcpy(&__ret, &__tmp, 8);
-  return __ret;
+
+template <class _Type, class _Tag, class _Sco>
+struct __cuda_atomic_bind_exchange {
+  _Type* __ptr;
+  _Type* __old;
+  _Type* __new;
+
+  template <class _Atomic_Memorder>
+  inline _CCCL_DEVICE void operator()(_Atomic_Memorder) {
+    __cuda_atomic_exchange(__ptr, *__old, *__new, _Atomic_Memorder{}, _Tag{}, _Sco{});
+  }
+};
+template <class _Type, class _Sco>
+static inline _CCCL_DEVICE void __atomic_exchange_cuda(_Type* __ptr, _Type& __old, _Type __new, int __memorder, _Sco)
+{
+  using __proxy_t   = typename __atomic_cuda_deduce_bitwise<_Type>::__type;
+  using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag;
+  __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr);
+  __proxy_t* __old_proxy = reinterpret_cast<__proxy_t*>(&__old);
+  __proxy_t* __new_proxy = reinterpret_cast<__proxy_t*>(&__new);
+  __cuda_atomic_bind_exchange<__proxy_t, __proxy_tag, _Sco> __bound_swap{__ptr_proxy, __old_proxy, __new_proxy};
+  __cuda_atomic_exchange_memory_order_dispatch(__bound_swap, __memorder, _Sco{});
+}
+template <class _Type, class _Sco>
+static inline _CCCL_DEVICE void __atomic_exchange_cuda(_Type volatile* __ptr, _Type& __old, _Type __new, int __memorder, _Sco)
+{
+  using __proxy_t   = typename __atomic_cuda_deduce_bitwise<_Type>::__type;
+  using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag;
+  __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr));
+  __proxy_t* __old_proxy = reinterpret_cast<__proxy_t*>(&__old);
+  __proxy_t* __new_proxy = reinterpret_cast<__proxy_t*>(&__new);
+  __cuda_atomic_bind_exchange<__proxy_t, __proxy_tag, _Sco> __bound_swap{__ptr_proxy, __old_proxy, __new_proxy};
+  __cuda_atomic_exchange_memory_order_dispatch(__bound_swap, __memorder, _Sco{});
 }
-static inline _CCCL_DEVICE void __cuda_membar_system() { asm volatile("membar.sys;":::"memory"); }
-static inline _CCCL_DEVICE void __cuda_fence_acq_rel_system() { asm volatile("fence.acq_rel.sys;":::"memory"); }
-static inline _CCCL_DEVICE void __cuda_fence_sc_system() { asm volatile("fence.sc.sys;":::"memory"); }
-static inline _CCCL_DEVICE void __atomic_thread_fence_cuda(int __memorder, __thread_scope_system_tag) {
+
+template <class _Fn, class _Sco>
+static inline _CCCL_DEVICE void __cuda_atomic_fetch_memory_order_dispatch(_Fn& __cuda_fetch, int __memorder, _Sco) {
   NV_DISPATCH_TARGET(
     NV_PROVIDES_SM_70, (
       switch (__memorder) {
-        case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); break;
+        case __ATOMIC_SEQ_CST: __cuda_atomic_fence(_Sco{}, __atomic_cuda_seq_cst{}); _CCCL_FALLTHROUGH();
         case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
-        case __ATOMIC_ACQUIRE: _CCCL_FALLTHROUGH();
-        case __ATOMIC_ACQ_REL: _CCCL_FALLTHROUGH();
-        case __ATOMIC_RELEASE: __cuda_fence_acq_rel_system(); break;
-        case __ATOMIC_RELAXED: break;
+        case __ATOMIC_ACQUIRE: __cuda_fetch(__atomic_cuda_acquire{}); break;
+        case __ATOMIC_ACQ_REL: __cuda_fetch(__atomic_cuda_acq_rel{}); break;
+        case __ATOMIC_RELEASE: __cuda_fetch(__atomic_cuda_release{}); break;
+        case __ATOMIC_RELAXED: __cuda_fetch(__atomic_cuda_relaxed{}); break;
         default: assert(0);
       }
     ),
     NV_IS_DEVICE, (
       switch (__memorder) {
         case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH();
+        case __ATOMIC_ACQ_REL: __cuda_atomic_membar(_Sco{}); _CCCL_FALLTHROUGH();
         case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
-        case __ATOMIC_ACQUIRE: _CCCL_FALLTHROUGH();
-        case __ATOMIC_ACQ_REL: _CCCL_FALLTHROUGH();
-        case __ATOMIC_RELEASE: __cuda_membar_system(); break;
-        case __ATOMIC_RELAXED: break;
+        case __ATOMIC_ACQUIRE: __cuda_fetch(__atomic_cuda_volatile{}); __cuda_atomic_membar(_Sco{}); break;
+        case __ATOMIC_RELEASE: __cuda_atomic_membar(_Sco{});
__cuda_fetch(__atomic_cuda_volatile{}); break; + case __ATOMIC_RELAXED: __cuda_fetch(__atomic_cuda_volatile{}); break; default: assert(0); } ) ) } -template static inline _CCCL_DEVICE void __cuda_load_acquire_32_system(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.acquire.sys.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_load_relaxed_32_system(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.relaxed.sys.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_load_volatile_32_system(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.volatile.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_load_cuda(const volatile _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_system_tag) { - uint32_t __tmp = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_acquire_32_system(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_32_system(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_volatile_32_system(__ptr, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_32_system(__ptr, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template = 0> -_CCCL_DEVICE void __atomic_load_cuda(const _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_system_tag) { - uint32_t __tmp = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_acquire_32_system(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_32_system(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_volatile_32_system(__ptr, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_32_system(__ptr, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template static inline _CCCL_DEVICE void __cuda_load_acquire_64_system(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.acquire.sys.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_load_relaxed_64_system(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.relaxed.sys.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_load_volatile_64_system(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.volatile.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_load_cuda(const volatile _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_system_tag) { - uint64_t __tmp = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: 
__cuda_load_acquire_64_system(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_64_system(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_volatile_64_system(__ptr, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_64_system(__ptr, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template = 0> -_CCCL_DEVICE void __atomic_load_cuda(const _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_system_tag) { - uint64_t __tmp = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_acquire_64_system(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_64_system(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_volatile_64_system(__ptr, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_64_system(__ptr, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template static inline _CCCL_DEVICE void __cuda_store_relaxed_32_system(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.relaxed.sys.b32 [%0], %1;" :: "l"(__ptr),"r"(__src) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_store_release_32_system(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.release.sys.b32 [%0], %1;" :: "l"(__ptr),"r"(__src) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_store_volatile_32_system(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.volatile.b32 [%0], %1;" :: "l"(__ptr),"r"(__src) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_store_cuda(volatile _Type *__ptr, _Type *__val, int __memorder, __thread_scope_system_tag) { - uint32_t __tmp = 0; - memcpy(&__tmp, __val, 4); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_RELEASE: __cuda_store_release_32_system(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_32_system(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); - case __ATOMIC_SEQ_CST: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_volatile_32_system(__ptr, __tmp); break; - default: assert(0); - } - ) - ) -} -template = 0> -_CCCL_DEVICE void __atomic_store_cuda(_Type *__ptr, _Type *__val, int __memorder, __thread_scope_system_tag) { - uint32_t __tmp = 0; - memcpy(&__tmp, __val, 4); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_RELEASE: __cuda_store_release_32_system(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_32_system(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); - case __ATOMIC_SEQ_CST: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: 
__cuda_store_volatile_32_system(__ptr, __tmp); break; - default: assert(0); - } - ) - ) -} -template static inline _CCCL_DEVICE void __cuda_store_relaxed_64_system(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.relaxed.sys.b64 [%0], %1;" :: "l"(__ptr),"l"(__src) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_store_release_64_system(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.release.sys.b64 [%0], %1;" :: "l"(__ptr),"l"(__src) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_store_volatile_64_system(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.volatile.b64 [%0], %1;" :: "l"(__ptr),"l"(__src) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_store_cuda(volatile _Type *__ptr, _Type *__val, int __memorder, __thread_scope_system_tag) { - uint64_t __tmp = 0; - memcpy(&__tmp, __val, 8); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_RELEASE: __cuda_store_release_64_system(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_64_system(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); - case __ATOMIC_SEQ_CST: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_volatile_64_system(__ptr, __tmp); break; - default: assert(0); - } - ) - ) -} -template = 0> -_CCCL_DEVICE void __atomic_store_cuda(_Type *__ptr, _Type *__val, int __memorder, __thread_scope_system_tag) { - uint64_t __tmp = 0; - memcpy(&__tmp, __val, 8); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_RELEASE: __cuda_store_release_64_system(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_64_system(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); - case __ATOMIC_SEQ_CST: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_volatile_64_system(__ptr, __tmp); break; - default: assert(0); - } - ) - ) -} -template static inline _CCCL_DEVICE void __cuda_fetch_and_acq_rel_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.acq_rel.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_acquire_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.acquire.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_relaxed_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.relaxed.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_release_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.release.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_volatile_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_and_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - 
NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_and_volatile_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_and_volatile_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_or_acq_rel_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.acq_rel.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_acquire_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.acquire.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_relaxed_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.relaxed.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_release_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.release.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_volatile_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } 
-template = 0> -_CCCL_DEVICE _Type __atomic_fetch_or_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_or_volatile_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_or_volatile_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_xor_acq_rel_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.acq_rel.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_acquire_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.acquire.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_relaxed_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.relaxed.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_release_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.release.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void 
__cuda_fetch_xor_volatile_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_xor_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_xor_volatile_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_xor_volatile_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acq_rel_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.sys.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acquire_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.sys.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_relaxed_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.sys.b32 %0,[%1],%2,%3;" : "=f"(__dst) : 
"l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.sys.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.sys.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_f32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_f32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_f32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_f32_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_f32_system(__ptr, *__expected, __old, __desired); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_compare_exchange_volatile_f32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_f32_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_f32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_f32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_f32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_f32_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_f32_system(__ptr, *__expected, 
__old, __desired); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_compare_exchange_volatile_f32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_f32_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acq_rel_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acquire_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_relaxed_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_u32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_u32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_u32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_u32_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_u32_system(__ptr, *__expected, __old, __desired); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_compare_exchange_volatile_u32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_u32_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, 
__thread_scope_system_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_u32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_u32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_u32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_u32_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_u32_system(__ptr, *__expected, __old, __desired); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_compare_exchange_volatile_u32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_u32_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template static inline _CCCL_DEVICE void __cuda_exchange_acq_rel_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.sys.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_acquire_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.sys.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_relaxed_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.sys.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_release_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.release.sys.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_volatile_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.sys.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(volatile void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_system_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_f32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case 
__ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_f32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_exchange_volatile_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_f32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_system_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_f32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_f32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_exchange_volatile_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_f32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template static inline _CCCL_DEVICE void __cuda_exchange_acq_rel_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_acquire_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_relaxed_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_release_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.release.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_volatile_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(volatile void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_system_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: 
__cuda_exchange_relaxed_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_u32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_exchange_volatile_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_system_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_u32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_exchange_volatile_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template static inline _CCCL_DEVICE void __cuda_fetch_add_acq_rel_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.sys.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_acquire_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acquire.sys.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_relaxed_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.relaxed.sys.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_release_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.release.sys.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_volatile_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.sys.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: 
__cuda_fetch_add_acquire_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_f32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_f32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_f32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_f32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_f32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_f32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_add_acq_rel_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_acquire_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acquire.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_relaxed_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.relaxed.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_release_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.release.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_volatile_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; 
- NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_max_acq_rel_s32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acq_rel.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_acquire_s32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acquire.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_relaxed_s32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.relaxed.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_release_s32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.release.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_volatile_s32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.sys.s32 %0,[%1],%2;" : 
"=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_s32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_s32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_max_volatile_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_s32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_s32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_s32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_max_volatile_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_s32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_max_acq_rel_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acq_rel.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_acquire_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acquire.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_relaxed_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.relaxed.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_release_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.release.sys.u32 %0,[%1],%2;" : 
"=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_volatile_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_u32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_max_volatile_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_u32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_max_volatile_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_min_acq_rel_s32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acq_rel.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_acquire_s32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acquire.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_relaxed_s32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.relaxed.sys.s32 %0,[%1],%2;" : "=r"(__dst) : 
"l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_release_s32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.release.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_volatile_s32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_s32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_s32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_min_volatile_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_s32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_s32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_s32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_min_volatile_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_s32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_min_acq_rel_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acq_rel.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_acquire_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acquire.sys.u32 %0,[%1],%2;" : "=r"(__dst) : 
"l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_relaxed_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.relaxed.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_release_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.release.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_volatile_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_u32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_min_volatile_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_u32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_min_volatile_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acq_rel_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acq_rel.sys.f32 %0,[%1],%2;" : "=f"(__dst) : 
"l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acquire_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acquire.sys.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_relaxed_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.relaxed.sys.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_release_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.release.sys.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_volatile_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.sys.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_f32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_f32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_sub_volatile_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_f32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_f32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_f32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_sub_volatile_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: 
__cuda_fetch_sub_volatile_f32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acq_rel_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acq_rel.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acquire_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acquire.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_relaxed_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.relaxed.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_release_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.release.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_volatile_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_u32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_sub_volatile_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case 
__ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_u32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_sub_volatile_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_and_acq_rel_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.acq_rel.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_acquire_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.acquire.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_relaxed_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.relaxed.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_release_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.release.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_volatile_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_and_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_and_volatile_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_64_system(__ptr, __tmp, __tmp); break; 
- default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_and_volatile_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_or_acq_rel_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.acq_rel.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_acquire_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.acquire.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_relaxed_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.relaxed.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_release_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.release.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_volatile_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_or_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_or_volatile_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_64_system(__ptr, __tmp, __tmp); break; - 
case __ATOMIC_RELEASE: __cuda_fetch_or_release_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_or_volatile_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_xor_acq_rel_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.acq_rel.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_acquire_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.acquire.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_relaxed_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.relaxed.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_release_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.release.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_volatile_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_xor_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_xor_volatile_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: 
_CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_xor_volatile_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acq_rel_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.sys.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acquire_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.sys.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_relaxed_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.sys.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.sys.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.sys.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_f64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_f64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_f64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_f64_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: 
__cuda_compare_exchange_volatile_f64_system(__ptr, *__expected, __old, __desired); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_compare_exchange_volatile_f64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_f64_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_f64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_f64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_f64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_f64_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_f64_system(__ptr, *__expected, __old, __desired); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_compare_exchange_volatile_f64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_f64_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acq_rel_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acquire_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_relaxed_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile void *__ptr, _Type *__expected, const _Type 
__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_u64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_u64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_u64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_u64_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_u64_system(__ptr, *__expected, __old, __desired); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_compare_exchange_volatile_u64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_u64_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_u64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_u64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_u64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_u64_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_u64_system(__ptr, *__expected, __old, __desired); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_compare_exchange_volatile_u64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_u64_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template static inline _CCCL_DEVICE void __cuda_exchange_acq_rel_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.sys.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_acquire_f64_system(_CUDA_A 
__ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.sys.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_relaxed_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.sys.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_release_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.release.sys.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_volatile_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.sys.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(volatile void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_system_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_f64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_f64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_exchange_volatile_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_f64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_system_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_f64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_f64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_exchange_volatile_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_f64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template static inline _CCCL_DEVICE void 
__cuda_exchange_acq_rel_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_acquire_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_relaxed_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_release_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.release.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_volatile_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(volatile void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_system_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_exchange_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_system_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); 
__cuda_exchange_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template static inline _CCCL_DEVICE void __cuda_fetch_add_acq_rel_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_acquire_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acquire.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_relaxed_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.relaxed.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_release_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.release.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_volatile_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_f64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_f64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_f64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_f64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); 
_CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_f64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_f64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_add_acq_rel_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_acquire_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acquire.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_relaxed_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.relaxed.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_release_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.release.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_volatile_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: 
__cuda_fetch_add_relaxed_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_max_acq_rel_s64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acq_rel.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_acquire_s64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acquire.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_relaxed_s64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.relaxed.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_release_s64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.release.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_volatile_s64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_s64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_s64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_max_volatile_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_s64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_s64_system(__ptr, __tmp, __tmp); 
break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_s64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_s64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_max_volatile_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_s64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_max_acq_rel_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acq_rel.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_acquire_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acquire.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_relaxed_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.relaxed.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_release_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.release.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_volatile_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_max_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch 
(__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_max_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_min_acq_rel_s64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acq_rel.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_acquire_s64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acquire.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_relaxed_s64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.relaxed.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_release_s64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.release.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_volatile_s64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_s64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_s64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_min_volatile_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_s64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return 
__tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_s64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_s64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_min_volatile_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_s64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_min_acq_rel_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acq_rel.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_acquire_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acquire.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_relaxed_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.relaxed.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_release_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.release.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_volatile_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: 
__cuda_membar_system(); __cuda_fetch_min_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_min_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acq_rel_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acq_rel.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acquire_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acquire.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_relaxed_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.relaxed.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_release_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.release.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_volatile_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_f64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case 
__ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_f64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_sub_volatile_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_f64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_f64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_f64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_sub_volatile_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_f64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; + +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_f32, __thread_scope_block_tag) +{ asm volatile("atom.add.acquire.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_f32, __thread_scope_block_tag) +{ asm volatile("atom.add.relaxed.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_f32, __thread_scope_block_tag) +{ asm volatile("atom.add.release.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_f32, __thread_scope_block_tag) +{ asm volatile("atom.add.acq_rel.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_f32, __thread_scope_block_tag) +{ asm volatile("atom.add.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_f32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.acquire.cluster.f32 %0,[%1],%2;" : 
"=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_f32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.relaxed.cluster.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_f32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.release.cluster.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_f32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.acq_rel.cluster.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_f32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.cluster.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_f32, __thread_scope_device_tag) +{ asm volatile("atom.add.acquire.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_f32, __thread_scope_device_tag) +{ asm volatile("atom.add.relaxed.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_f32, __thread_scope_device_tag) +{ asm volatile("atom.add.release.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_f32, __thread_scope_device_tag) +{ asm volatile("atom.add.acq_rel.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_f32, __thread_scope_device_tag) +{ asm volatile("atom.add.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_f32, __thread_scope_system_tag) +{ asm volatile("atom.add.acquire.sys.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_f32, __thread_scope_system_tag) +{ asm volatile("atom.add.relaxed.sys.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_f32, __thread_scope_system_tag) +{ asm volatile("atom.add.release.sys.f32 %0,[%1],%2;" : 
"=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_f32, __thread_scope_system_tag) +{ asm volatile("atom.add.acq_rel.sys.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_f32, __thread_scope_system_tag) +{ asm volatile("atom.add.sys.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_f64, __thread_scope_block_tag) +{ asm volatile("atom.add.acquire.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_f64, __thread_scope_block_tag) +{ asm volatile("atom.add.relaxed.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_f64, __thread_scope_block_tag) +{ asm volatile("atom.add.release.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_f64, __thread_scope_block_tag) +{ asm volatile("atom.add.acq_rel.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_f64, __thread_scope_block_tag) +{ asm volatile("atom.add.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_f64, __thread_scope_cluster_tag) +{ asm volatile("atom.add.acquire.cluster.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_f64, __thread_scope_cluster_tag) +{ asm volatile("atom.add.relaxed.cluster.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_f64, __thread_scope_cluster_tag) +{ asm volatile("atom.add.release.cluster.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_f64, __thread_scope_cluster_tag) +{ asm volatile("atom.add.acq_rel.cluster.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_f64, __thread_scope_cluster_tag) +{ asm volatile("atom.add.cluster.f64 %0,[%1],%2;" : "=d"(__dst) : 
"l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_f64, __thread_scope_device_tag) +{ asm volatile("atom.add.acquire.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_f64, __thread_scope_device_tag) +{ asm volatile("atom.add.relaxed.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_f64, __thread_scope_device_tag) +{ asm volatile("atom.add.release.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_f64, __thread_scope_device_tag) +{ asm volatile("atom.add.acq_rel.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_f64, __thread_scope_device_tag) +{ asm volatile("atom.add.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_f64, __thread_scope_system_tag) +{ asm volatile("atom.add.acquire.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_f64, __thread_scope_system_tag) +{ asm volatile("atom.add.relaxed.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_f64, __thread_scope_system_tag) +{ asm volatile("atom.add.release.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_f64, __thread_scope_system_tag) +{ asm volatile("atom.add.acq_rel.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_f64, __thread_scope_system_tag) +{ asm volatile("atom.add.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.add.acquire.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.add.relaxed.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : 
"memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.add.release.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.add.acq_rel.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.add.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.acquire.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.relaxed.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.release.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.acq_rel.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.add.acquire.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.add.relaxed.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.add.release.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.add.acq_rel.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : 
"memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.add.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.add.acquire.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.add.relaxed.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.add.release.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.add.acq_rel.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.add.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.add.acquire.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.add.relaxed.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.add.release.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.add.acq_rel.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.add.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.add.acquire.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static 
inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.add.relaxed.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.add.release.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.add.acq_rel.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.add.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.add.acquire.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.add.relaxed.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.add.release.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.add.acq_rel.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.add.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.add.acquire.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.add.relaxed.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.add.release.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline 
_CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.add.acq_rel.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.add.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.add.acquire.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.add.relaxed.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.add.release.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.add.acq_rel.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.add.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.acquire.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.relaxed.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.release.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.acq_rel.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE 
void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.add.acquire.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.add.relaxed.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.add.release.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.add.acq_rel.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.add.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.add.acquire.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.add.relaxed.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.add.release.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.add.acq_rel.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.add.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } + +template +struct __cuda_atomic_bind_fetch_add { + _Type* __ptr; + _Type* __dst; + _Type* __op; + + template + inline _CCCL_DEVICE void operator()(_Atomic_Memorder) { + __cuda_atomic_fetch_add(__ptr, *__dst, *__op, _Atomic_Memorder{}, _Tag{}, _Sco{}); + } +}; +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + constexpr auto __skip_v = __atomic_ptr_skip_t<_Type>::__skip; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_arithmetic<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_arithmetic<_Type>::__tag; + _Type __dst{}; + 
__proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_add<__proxy_t, __proxy_tag, _Sco> __bound_add{__ptr_proxy, __dst_proxy, __op_proxy}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_add, __memorder, _Sco{}); + return __dst; +} +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{ + constexpr auto __skip_v = __atomic_ptr_skip_t<_Type>::__skip; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_arithmetic<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_arithmetic<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_add<__proxy_t, __proxy_tag, _Sco> __bound_add{__ptr_proxy, __dst_proxy, __op_proxy}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_add, __memorder, _Sco{}); + return __dst; } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acq_rel_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acq_rel.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acquire_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acquire.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_relaxed_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.relaxed.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_release_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.release.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_volatile_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); 
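// The per-order, per-scope helpers being removed here (the __cuda_fetch_sub_*_u64_system
// family and their NV_DISPATCH_TARGET switches) are replaced by the tag-dispatched
// machinery above: __atomic_fetch_add_cuda scales the operand for pointer element types
// via __atomic_ptr_skip_t, reinterprets the storage through the proxy type from
// __atomic_cuda_deduce_arithmetic, and hands the bound functor to
// __cuda_atomic_fetch_memory_order_dispatch, which turns the runtime __memorder into one
// of the memory-order tags.
// Minimal illustrative sketch (device code only; __counter and the constant operand are
// assumed, not taken from this patch):
//   int __old = __atomic_fetch_add_cuda(__counter, 1, __ATOMIC_RELAXED,
//                                       __thread_scope_device_tag{});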
__cuda_fetch_sub_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; + +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.and.acquire.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.and.relaxed.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.and.release.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.and.acq_rel.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.and.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.and.acquire.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.and.relaxed.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.and.release.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.and.acq_rel.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.and.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.and.acquire.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& 
__dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.and.relaxed.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.and.release.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.and.acq_rel.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.and.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.and.acquire.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.and.relaxed.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.and.release.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.and.acq_rel.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.and.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.and.acquire.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.and.relaxed.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.and.release.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, 
__atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.and.acq_rel.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.and.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.and.acquire.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.and.relaxed.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.and.release.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.and.acq_rel.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.and.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.and.acquire.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.and.relaxed.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.and.release.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.and.acq_rel.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.and.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, 
__atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.and.acquire.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.and.relaxed.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.and.release.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.and.acq_rel.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.and.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } + +template +struct __cuda_atomic_bind_fetch_and { + _Type* __ptr; + _Type* __dst; + _Type* __op; + + template + inline _CCCL_DEVICE void operator()(_Atomic_Memorder) { + __cuda_atomic_fetch_and(__ptr, *__dst, *__op, _Atomic_Memorder{}, _Tag{}, _Sco{}); + } +}; +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + constexpr auto __skip_v = 1; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_and<__proxy_t, __proxy_tag, _Sco> __bound_and{__ptr_proxy, __dst_proxy, __op_proxy}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_and, __memorder, _Sco{}); + return __dst; +} +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{ + constexpr auto __skip_v = 1; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_and<__proxy_t, __proxy_tag, _Sco> __bound_and{__ptr_proxy, __dst_proxy, __op_proxy}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_and, __memorder, _Sco{}); + return __dst; } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_u64_system(__ptr, 
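// __atomic_fetch_and_cuda mirrors the fetch_add front end but deduces its proxy type
// through __atomic_cuda_deduce_bitwise (b32/b64 operands) and applies no pointer
// scaling (__skip_v is fixed to 1).
// Minimal illustrative sketch (device code only; __mask is assumed):
//   unsigned __prev = __atomic_fetch_and_cuda(__mask, 0xFFu, __ATOMIC_ACQ_REL,
//                                             __thread_scope_block_tag{});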
__tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_sub_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; + +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.max.acquire.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.max.relaxed.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.max.release.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.max.acq_rel.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.max.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.max.acquire.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.max.relaxed.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.max.release.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.max.acq_rel.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : 
"memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.max.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.max.acquire.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.max.relaxed.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.max.release.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.max.acq_rel.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.max.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.max.acquire.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.max.relaxed.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.max.release.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.max.acq_rel.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.max.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.max.acquire.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static 
inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.max.relaxed.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.max.release.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.max.acq_rel.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.max.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.max.acquire.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.max.relaxed.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.max.release.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.max.acq_rel.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.max.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.max.acquire.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.max.relaxed.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.max.release.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline 
_CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.max.acq_rel.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.max.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.max.acquire.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.max.relaxed.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.max.release.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.max.acq_rel.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.max.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.max.acquire.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.max.relaxed.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.max.release.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.max.acq_rel.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.max.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void 
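// __cuda_atomic_fetch_max is emitted separately for unsigned (u32/u64) and signed
// (s32/s64) operand tags because PTX "atom.max" compares according to the stated
// operand type; the __atomic_fetch_max_cuda front end below selects the proxy type
// through __atomic_cuda_deduce_minmax.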
__cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.max.acquire.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.max.relaxed.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.max.release.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.max.acq_rel.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.max.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.max.acquire.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.max.relaxed.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.max.release.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.max.acq_rel.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.max.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.max.acquire.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.max.relaxed.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void 
__cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.max.release.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.max.acq_rel.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.max.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s64, __thread_scope_block_tag) +{ asm volatile("atom.max.acquire.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s64, __thread_scope_block_tag) +{ asm volatile("atom.max.relaxed.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s64, __thread_scope_block_tag) +{ asm volatile("atom.max.release.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s64, __thread_scope_block_tag) +{ asm volatile("atom.max.acq_rel.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s64, __thread_scope_block_tag) +{ asm volatile("atom.max.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s64, __thread_scope_cluster_tag) +{ asm volatile("atom.max.acquire.cluster.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s64, __thread_scope_cluster_tag) +{ asm volatile("atom.max.relaxed.cluster.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s64, __thread_scope_cluster_tag) +{ asm volatile("atom.max.release.cluster.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s64, __thread_scope_cluster_tag) +{ asm volatile("atom.max.acq_rel.cluster.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void 
__cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s64, __thread_scope_cluster_tag) +{ asm volatile("atom.max.cluster.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s64, __thread_scope_device_tag) +{ asm volatile("atom.max.acquire.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s64, __thread_scope_device_tag) +{ asm volatile("atom.max.relaxed.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s64, __thread_scope_device_tag) +{ asm volatile("atom.max.release.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s64, __thread_scope_device_tag) +{ asm volatile("atom.max.acq_rel.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s64, __thread_scope_device_tag) +{ asm volatile("atom.max.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s64, __thread_scope_system_tag) +{ asm volatile("atom.max.acquire.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s64, __thread_scope_system_tag) +{ asm volatile("atom.max.relaxed.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s64, __thread_scope_system_tag) +{ asm volatile("atom.max.release.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s64, __thread_scope_system_tag) +{ asm volatile("atom.max.acq_rel.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s64, __thread_scope_system_tag) +{ asm volatile("atom.max.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } + +template +struct __cuda_atomic_bind_fetch_max { + _Type* __ptr; + _Type* __dst; + _Type* __op; + + template + inline _CCCL_DEVICE void operator()(_Atomic_Memorder) { + __cuda_atomic_fetch_max(__ptr, *__dst, *__op, _Atomic_Memorder{}, _Tag{}, _Sco{}); + } +}; +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type* __ptr, _Up __op, int 
__memorder, _Sco) +{ + constexpr auto __skip_v = 1; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_minmax<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_minmax<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_max<__proxy_t, __proxy_tag, _Sco> __bound_max{__ptr_proxy, __dst_proxy, __op_proxy}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_max, __memorder, _Sco{}); + return __dst; +} +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{ + constexpr auto __skip_v = 1; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_minmax<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_minmax<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_max<__proxy_t, __proxy_tag, _Sco> __bound_max{__ptr_proxy, __dst_proxy, __op_proxy}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_max, __memorder, _Sco{}); + return __dst; } -template -_CCCL_DEVICE _Type* __atomic_fetch_add_cuda(_Type *volatile *__ptr, ptrdiff_t __val, int __memorder, __thread_scope_system_tag) { - _Type* __ret; - uint64_t __tmp = 0; - memcpy(&__tmp, &__val, 8); - __tmp *= sizeof(_Type); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_system(__ptr, __tmp, __tmp); break; - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(&__ret, &__tmp, 8); - return __ret; + +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.min.acquire.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.min.relaxed.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, 
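// Illustrative sketch of the dispatch pattern used by __atomic_fetch_max_cuda above: the PTX
// wrappers are chosen by tag types, but __memorder arrives as a runtime int, so the real code
// packs its arguments into __cuda_atomic_bind_fetch_max and lets
// __cuda_atomic_fetch_memory_order_dispatch switch on __memorder once and invoke the functor
// with the matching tag. The stand-alone analogue below uses hypothetical names (relaxed_t,
// acquire_t, dispatch_order_sketch) and only two orderings; the acquire case is merely
// approximated with a trailing fence, in the spirit of the pre-SM70 fallback visible in the
// removed code, and is not the library's actual lowering.
struct relaxed_t {};
struct acquire_t {};

__device__ inline int fetch_max_sketch(int* ptr, int op, relaxed_t)
{
  return atomicMax(ptr, op); // relaxed, device-scope max via the CUDA intrinsic
}
__device__ inline int fetch_max_sketch(int* ptr, int op, acquire_t)
{
  int old = atomicMax(ptr, op);
  __threadfence(); // approximate acquire ordering with a device-scope fence
  return old;
}

template <class Fn>
__device__ inline int dispatch_order_sketch(Fn&& fn, int memorder)
{
  switch (memorder)
  {
    case __ATOMIC_ACQUIRE:
      return fn(acquire_t{}); // the runtime constant becomes a compile-time tag here
    default:
      return fn(relaxed_t{});
  }
}

__device__ inline int fetch_max_dispatched_sketch(int* ptr, int op, int memorder)
{
  return dispatch_order_sketch([&](auto order) { return fetch_max_sketch(ptr, op, order); }, memorder);
}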
__atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.min.release.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.min.acq_rel.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.min.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.min.acquire.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.min.relaxed.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.min.release.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.min.acq_rel.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.min.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.min.acquire.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.min.relaxed.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.min.release.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.min.acq_rel.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, 
__atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.min.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.min.acquire.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.min.relaxed.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.min.release.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.min.acq_rel.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.min.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.min.acquire.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.min.relaxed.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.min.release.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.min.acq_rel.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.min.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.min.acquire.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, 
__thread_scope_cluster_tag) +{ asm volatile("atom.min.relaxed.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.min.release.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.min.acq_rel.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.min.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.min.acquire.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.min.relaxed.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.min.release.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.min.acq_rel.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.min.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.min.acquire.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.min.relaxed.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.min.release.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u64, 
__thread_scope_system_tag) +{ asm volatile("atom.min.acq_rel.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.min.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.min.acquire.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.min.relaxed.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.min.release.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.min.acq_rel.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.min.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.min.acquire.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.min.relaxed.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.min.release.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.min.acq_rel.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.min.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s32, 
__thread_scope_device_tag) +{ asm volatile("atom.min.acquire.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.min.relaxed.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.min.release.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.min.acq_rel.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.min.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.min.acquire.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.min.relaxed.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.min.release.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.min.acq_rel.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.min.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s64, __thread_scope_block_tag) +{ asm volatile("atom.min.acquire.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s64, __thread_scope_block_tag) +{ asm volatile("atom.min.relaxed.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s64, __thread_scope_block_tag) +{ 
asm volatile("atom.min.release.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s64, __thread_scope_block_tag) +{ asm volatile("atom.min.acq_rel.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s64, __thread_scope_block_tag) +{ asm volatile("atom.min.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s64, __thread_scope_cluster_tag) +{ asm volatile("atom.min.acquire.cluster.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s64, __thread_scope_cluster_tag) +{ asm volatile("atom.min.relaxed.cluster.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s64, __thread_scope_cluster_tag) +{ asm volatile("atom.min.release.cluster.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s64, __thread_scope_cluster_tag) +{ asm volatile("atom.min.acq_rel.cluster.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s64, __thread_scope_cluster_tag) +{ asm volatile("atom.min.cluster.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s64, __thread_scope_device_tag) +{ asm volatile("atom.min.acquire.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s64, __thread_scope_device_tag) +{ asm volatile("atom.min.relaxed.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s64, __thread_scope_device_tag) +{ asm volatile("atom.min.release.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s64, __thread_scope_device_tag) +{ asm volatile("atom.min.acq_rel.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s64, __thread_scope_device_tag) +{ asm 
volatile("atom.min.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s64, __thread_scope_system_tag) +{ asm volatile("atom.min.acquire.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s64, __thread_scope_system_tag) +{ asm volatile("atom.min.relaxed.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s64, __thread_scope_system_tag) +{ asm volatile("atom.min.release.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s64, __thread_scope_system_tag) +{ asm volatile("atom.min.acq_rel.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s64, __thread_scope_system_tag) +{ asm volatile("atom.min.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } + +template +struct __cuda_atomic_bind_fetch_min { + _Type* __ptr; + _Type* __dst; + _Type* __op; + + template + inline _CCCL_DEVICE void operator()(_Atomic_Memorder) { + __cuda_atomic_fetch_min(__ptr, *__dst, *__op, _Atomic_Memorder{}, _Tag{}, _Sco{}); + } +}; +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + constexpr auto __skip_v = 1; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_minmax<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_minmax<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_min<__proxy_t, __proxy_tag, _Sco> __bound_min{__ptr_proxy, __dst_proxy, __op_proxy}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_min, __memorder, _Sco{}); + return __dst; +} +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{ + constexpr auto __skip_v = 1; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_minmax<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_minmax<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_min<__proxy_t, __proxy_tag, _Sco> __bound_min{__ptr_proxy, __dst_proxy, __op_proxy}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_min, __memorder, _Sco{}); + return __dst; } -template -_CCCL_DEVICE _Type* __atomic_fetch_sub_cuda(_Type *volatile *__ptr, ptrdiff_t __val, int __memorder, __thread_scope_system_tag) { - _Type* __ret; - uint64_t __tmp = 0; - memcpy(&__tmp, &__val, 8); - __tmp 
= -__tmp; - __tmp *= sizeof(_Type); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_system(__ptr, __tmp, __tmp); break; - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(&__ret, &__tmp, 8); - return __ret; + +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.or.acquire.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.or.relaxed.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.or.release.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.or.acq_rel.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.or.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.or.acquire.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.or.relaxed.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.or.release.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template 
+static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.or.acq_rel.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.or.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.or.acquire.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.or.relaxed.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.or.release.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.or.acq_rel.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.or.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.or.acquire.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.or.relaxed.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.or.release.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.or.acq_rel.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.or.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void 
__cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.or.acquire.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.or.relaxed.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.or.release.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.or.acq_rel.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.or.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.or.acquire.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.or.relaxed.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.or.release.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.or.acq_rel.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.or.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.or.acquire.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.or.relaxed.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* 
__ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.or.release.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.or.acq_rel.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.or.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.or.acquire.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.or.relaxed.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.or.release.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.or.acq_rel.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.or.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } + +template +struct __cuda_atomic_bind_fetch_or { + _Type* __ptr; + _Type* __dst; + _Type* __op; + + template + inline _CCCL_DEVICE void operator()(_Atomic_Memorder) { + __cuda_atomic_fetch_or(__ptr, *__dst, *__op, _Atomic_Memorder{}, _Tag{}, _Sco{}); + } +}; +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + constexpr auto __skip_v = 1; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_or<__proxy_t, __proxy_tag, _Sco> __bound_or{__ptr_proxy, __dst_proxy, __op_proxy}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_or, __memorder, _Sco{}); + return __dst; +} +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{ + constexpr auto __skip_v = 1; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename 
__atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_or<__proxy_t, __proxy_tag, _Sco> __bound_or{__ptr_proxy, __dst_proxy, __op_proxy}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_or, __memorder, _Sco{}); + return __dst; } -template -_CCCL_DEVICE _Type* __atomic_fetch_add_cuda(_Type **__ptr, ptrdiff_t __val, int __memorder, __thread_scope_system_tag) { - _Type* __ret; - uint64_t __tmp = 0; - memcpy(&__tmp, &__val, 8); - __tmp *= sizeof(_Type); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_system(__ptr, __tmp, __tmp); break; - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(&__ret, &__tmp, 8); - return __ret; + +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.xor.acquire.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.xor.relaxed.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.xor.release.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.xor.acq_rel.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.xor.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm 
volatile("atom.xor.acquire.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.xor.relaxed.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.xor.release.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.xor.acq_rel.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.xor.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.xor.acquire.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.xor.relaxed.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.xor.release.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.xor.acq_rel.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.xor.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.xor.acquire.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.xor.relaxed.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm 
volatile("atom.xor.release.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.xor.acq_rel.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.xor.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.xor.acquire.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.xor.relaxed.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.xor.release.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.xor.acq_rel.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.xor.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.xor.acquire.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.xor.relaxed.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.xor.release.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.xor.acq_rel.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm 
volatile("atom.xor.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.xor.acquire.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.xor.relaxed.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.xor.release.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.xor.acq_rel.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.xor.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.xor.acquire.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.xor.relaxed.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.xor.release.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.xor.acq_rel.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.xor.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } + +template +struct __cuda_atomic_bind_fetch_xor { + _Type* __ptr; + _Type* __dst; + _Type* __op; + + template + inline _CCCL_DEVICE void operator()(_Atomic_Memorder) { + __cuda_atomic_fetch_xor(__ptr, *__dst, *__op, _Atomic_Memorder{}, _Tag{}, _Sco{}); + } +}; +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + constexpr auto __skip_v = 1; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + 
using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_xor<__proxy_t, __proxy_tag, _Sco> __bound_xor{__ptr_proxy, __dst_proxy, __op_proxy}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_xor, __memorder, _Sco{}); + return __dst; +} +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{ + constexpr auto __skip_v = 1; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_xor<__proxy_t, __proxy_tag, _Sco> __bound_xor{__ptr_proxy, __dst_proxy, __op_proxy}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_xor, __memorder, _Sco{}); + return __dst; } -template -_CCCL_DEVICE _Type* __atomic_fetch_sub_cuda(_Type **__ptr, ptrdiff_t __val, int __memorder, __thread_scope_system_tag) { - _Type* __ret; - uint64_t __tmp = 0; - memcpy(&__tmp, &__val, 8); - __tmp = -__tmp; - __tmp *= sizeof(_Type); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_system(__ptr, __tmp, __tmp); break; - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(&__ret, &__tmp, 8); - return __ret; + +template +static inline _CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + return __atomic_fetch_add_cuda(__ptr, -__op, __memorder, _Sco{}); +} +template +static inline _CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{ + return __atomic_fetch_add_cuda(__ptr, -__op, __memorder, _Sco{}); } #endif // defined(_CCCL_CUDA_COMPILER) diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h new file mode 100644 index 0000000000..861e9f7b08 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h @@ -0,0 +1,155 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library 
for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_HELPER_H +#define _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_HELPER_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +enum class __atomic_cuda_memorder +{ + _relaxed, + _release, + _acquire, + _acq_rel, + _seq_cst, + _volatile, +}; + +template <__atomic_cuda_memorder _Order> +using __atomic_cuda_memorder_tag = integral_constant<__atomic_cuda_memorder, _Order>; + +using __atomic_cuda_relaxed = __atomic_cuda_memorder_tag<__atomic_cuda_memorder::_relaxed>; +using __atomic_cuda_release = __atomic_cuda_memorder_tag<__atomic_cuda_memorder::_release>; +using __atomic_cuda_acquire = __atomic_cuda_memorder_tag<__atomic_cuda_memorder::_acquire>; +using __atomic_cuda_acq_rel = __atomic_cuda_memorder_tag<__atomic_cuda_memorder::_acq_rel>; +using __atomic_cuda_seq_cst = __atomic_cuda_memorder_tag<__atomic_cuda_memorder::_seq_cst>; +using __atomic_cuda_volatile = __atomic_cuda_memorder_tag<__atomic_cuda_memorder::_volatile>; + +template +using __atomic_cuda_mmio_tag = integral_constant; + +using __atomic_cuda_mmio_enable = __atomic_cuda_mmio_tag; +using __atomic_cuda_mmio_disable = __atomic_cuda_mmio_tag; + +enum class __atomic_cuda_operand +{ + _f, + _s, + _u, + _b, +}; + +template <__atomic_cuda_operand _Op, size_t _Size> +struct __atomic_cuda_operand_tag +{}; + +using __atomic_cuda_operand_f16 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_f, 16>; +using __atomic_cuda_operand_s16 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_s, 16>; +using __atomic_cuda_operand_u16 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_u, 16>; +using __atomic_cuda_operand_b16 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, 16>; +using __atomic_cuda_operand_f32 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_f, 32>; +using __atomic_cuda_operand_s32 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_s, 32>; +using __atomic_cuda_operand_u32 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_u, 32>; +using __atomic_cuda_operand_b32 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, 32>; +using __atomic_cuda_operand_f64 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_f, 64>; +using __atomic_cuda_operand_s64 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_s, 64>; +using __atomic_cuda_operand_u64 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_u, 64>; +using __atomic_cuda_operand_b64 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, 64>; +using __atomic_cuda_operand_f128 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_f, 128>; +using __atomic_cuda_operand_s128 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_s, 128>; +using __atomic_cuda_operand_u128 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_u, 128>; +using __atomic_cuda_operand_b128 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, 
128>; + +template +struct __atomic_cuda_operand_deduction +{ + using __type = _AtomicType; + using __tag = _OpTag; +}; + +struct __atomic_longlong2 +{ + uint64_t __x; + uint64_t __y; +}; + +template +using __atomic_cuda_deduce_bitwise = + _If, + _If, + _If, + __atomic_cuda_operand_deduction<__atomic_longlong2, __atomic_cuda_operand_b128>>>>; + +template +using __atomic_cuda_deduce_arithmetic = + _If<_CCCL_TRAIT(is_floating_point, _Type), + _If, + __atomic_cuda_operand_deduction>, + _If<_CCCL_TRAIT(is_signed, _Type), + _If, + __atomic_cuda_operand_deduction>, // There is no atom.add.s64 + _If, + __atomic_cuda_operand_deduction>>>; + +template +using __atomic_cuda_deduce_minmax = + _If<_CCCL_TRAIT(is_signed, _Type), + _If, + __atomic_cuda_operand_deduction>, + _If, + __atomic_cuda_operand_deduction>>; + +template +using __atomic_enable_if_native_bitwise = bool; + +template +using __atomic_enable_if_native_arithmetic = typename enable_if<_CCCL_TRAIT(is_scalar, _Type), bool>::type; + +template +using __atomic_enable_if_not_native_arithmetic = typename enable_if::type; + +template +using __atomic_enable_if_native_minmax = typename enable_if<_CCCL_TRAIT(is_integral, _Type), bool>::type; + +template +using __atomic_enable_if_not_native_minmax = typename enable_if::type; + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H diff --git a/libcudacxx/include/cuda/std/__atomic/functions/host.h b/libcudacxx/include/cuda/std/__atomic/functions/host.h index e6aeaa36fc..736fd0b0b0 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/host.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/host.h @@ -21,6 +21,7 @@ # pragma system_header #endif // no system header +#include #include #include #include @@ -109,30 +110,6 @@ inline bool __atomic_compare_exchange_weak_host( __atomic_failure_order_to_int(__failure)); } -template -struct __atomic_ptr_skip -{ - static constexpr auto __skip = 1; -}; - -template -struct __atomic_ptr_skip<_Tp*> -{ - static constexpr auto __skip = sizeof(_Tp); -}; - -// FIXME: Haven't figured out what the spec says about using arrays with -// atomic_fetch_add. Force a failure rather than creating bad behavior. 
-template -struct __atomic_ptr_skip<_Tp[]> -{}; -template -struct __atomic_ptr_skip<_Tp[n]> -{}; - -template -using __atomic_ptr_skip_t = __atomic_ptr_skip<__remove_cvref_t<_Tp>>; - template ::value, int> = 0> inline __remove_cv_t<_Tp> __atomic_fetch_add_host(_Tp* __a, _Td __delta, memory_order __order) { diff --git a/libcudacxx/include/cuda/std/__atomic/scopes.h b/libcudacxx/include/cuda/std/__atomic/scopes.h index 70af777d5c..22637c186e 100644 --- a/libcudacxx/include/cuda/std/__atomic/scopes.h +++ b/libcudacxx/include/cuda/std/__atomic/scopes.h @@ -44,6 +44,8 @@ struct __thread_scope_thread_tag {}; struct __thread_scope_block_tag {}; +struct __thread_scope_cluster_tag +{}; struct __thread_scope_device_tag {}; struct __thread_scope_system_tag diff --git a/libcudacxx/test/atomic_codegen/CMakeLists.txt b/libcudacxx/test/atomic_codegen/CMakeLists.txt index 856318015f..095fa41cf7 100644 --- a/libcudacxx/test/atomic_codegen/CMakeLists.txt +++ b/libcudacxx/test/atomic_codegen/CMakeLists.txt @@ -1,5 +1,4 @@ -# For every atomic API compile the TU and check if the SASS matches the expected result -add_custom_target(libcudacxx.test.atomic_codegen) +add_custom_target(libcudacxx.test.atomics.ptx) find_program(filecheck "FileCheck" REQUIRED) find_program(cuobjdump "cuobjdump" REQUIRED) @@ -7,6 +6,7 @@ find_program(bash "bash" REQUIRED) file(GLOB libcudacxx_atomic_codegen_tests "*.cu") +# For every atomic API compile the TU and check if the SASS/PTX matches the expected result foreach(test_path IN LISTS libcudacxx_atomic_codegen_tests) cmake_path(GET test_path FILENAME test_file) cmake_path(REMOVE_EXTENSION test_file LAST_ONLY OUTPUT_VARIABLE test_name) @@ -18,11 +18,11 @@ foreach(test_path IN LISTS libcudacxx_atomic_codegen_tests) ## Important for testing the local headers target_include_directories(atomic_codegen_${test_name} PRIVATE "${libcudacxx_SOURCE_DIR}/include") - add_dependencies(libcudacxx.test.atomic_codegen atomic_codegen_${test_name}) + add_dependencies(libcudacxx.test.atomics.ptx atomic_codegen_${test_name}) # Add output path to object directory add_custom_command( - TARGET libcudacxx.test.atomic_codegen + TARGET libcudacxx.test.atomics.ptx POST_BUILD COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/dump_and_check.bash $ ${test_path} SM8X ) diff --git a/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu index ff2850009f..d97636d647 100644 --- a/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu @@ -11,11 +11,11 @@ __global__ void add_relaxed_device_non_volatile(int* data, int* out, int n) ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*add_relaxed_device_non_volatile.*]]( ; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], [[[FUNCTION]]_param_1]; +; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], [[[FUNCTION]]_param_1]; ; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2]; -; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#EXPECTED]]; +; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#RESULT]]; ; SM8X-NEXT: // -; SM8X-NEXT: atom.add.relaxed.gpu.u32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]]; +; SM8X-NEXT: atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]]; ; SM8X-NEXT: // ; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; ; SM8X-NEXT: ret; diff --git a/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu 
b/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu index 12eb48622b..983c8e9fac 100644 --- a/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu @@ -13,7 +13,7 @@ __global__ void store_relaxed_device_non_volatile(int* data, int in) ; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; ; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_1]; ; SM8X-NEXT: // -; SM8X-NEXT: st.relaxed.gpu.b32 [%rd[[#ATOM]]], %r[[#INPUT]]; +; SM8X-NEXT: st.relaxed.gpu.b32 [%rd[[#ATOM]]],%r[[#INPUT]]; ; SM8X-NEXT: // ; SM8X-NEXT: ret; diff --git a/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu index d32696e826..9d1ffaefa1 100644 --- a/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu @@ -11,12 +11,12 @@ __global__ void sub_relaxed_device_non_volatile(int* data, int* out, int n) ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*sub_relaxed_device_non_volatile.*]]( ; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], [[[FUNCTION]]_param_1]; +; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], [[[FUNCTION]]_param_1]; ; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2]; -; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#EXPECTED]]; +; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#RESULT]]; ; SM8X-NEXT: neg.s32 %r[[#NEG:]], %r[[#INPUT]]; ; SM8X-NEXT: // -; SM8X-NEXT: atom.add.relaxed.gpu.u32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#NEG]]; +; SM8X-NEXT: atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#NEG]]; ; SM8X-NEXT: // ; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; ; SM8X-NEXT: ret; From 47b8f5ccdf46358b27fbf156b5dab509fc6ebdac Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Wed, 7 Aug 2024 12:40:56 -0700 Subject: [PATCH 08/33] [CUDAX] add `__launch_transform` to transform arguments to `cudax::launch` prior to launching the kernel (#2202) * add `__launch_transform` to transform arguments to `cudax::launch` prior to launching the kernel --- .../cuda/experimental/__detail/utility.cuh | 20 ++++ .../cuda/experimental/__launch/launch.cuh | 94 +++++++++++++------ .../__launch/launch_transform.cuh | 83 ++++++++++++++++ .../__utility/ensure_current_device.cuh | 2 +- cudax/test/launch/launch_smoke.cu | 56 +++++++++++ 5 files changed, 223 insertions(+), 32 deletions(-) create mode 100644 cudax/include/cuda/experimental/__launch/launch_transform.cuh diff --git a/cudax/include/cuda/experimental/__detail/utility.cuh b/cudax/include/cuda/experimental/__detail/utility.cuh index 874075b107..738a5d6244 100644 --- a/cudax/include/cuda/experimental/__detail/utility.cuh +++ b/cudax/include/cuda/experimental/__detail/utility.cuh @@ -11,8 +11,28 @@ #ifndef __CUDAX_DETAIL_UTILITY_H #define __CUDAX_DETAIL_UTILITY_H +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + namespace cuda::experimental { +namespace detail +{ +struct __ignore +{ + template + _CCCL_HOST_DEVICE constexpr __ignore(Args&&...) 
noexcept + {} +}; +} // namespace detail + struct uninit_t { explicit uninit_t() = default; diff --git a/cudax/include/cuda/experimental/__launch/launch.cuh b/cudax/include/cuda/experimental/__launch/launch.cuh index 1a49cafa40..f4aee8a173 100644 --- a/cudax/include/cuda/experimental/__launch/launch.cuh +++ b/cudax/include/cuda/experimental/__launch/launch.cuh @@ -16,6 +16,7 @@ #include #include +#include #include #if _CCCL_STD_VER >= 2017 @@ -120,18 +121,25 @@ template & conf, const Kernel& kernel, Args... args) { - [[maybe_unused]] __ensure_current_device __dev_setter(stream); + __ensure_current_device __dev_setter(stream); cudaError_t status; - if constexpr (::cuda::std::is_invocable_v, Args...>) + if constexpr (::cuda::std::is_invocable_v, as_kernel_arg_t...>) { - auto launcher = detail::kernel_launcher, Kernel, Args...>; - status = detail::launch_impl(stream, conf, launcher, conf, kernel, args...); + auto launcher = detail::kernel_launcher, Kernel, as_kernel_arg_t...>; + status = detail::launch_impl( + stream, + conf, + launcher, + conf, + kernel, + static_cast>(detail::__launch_transform(stream, args))...); } else { - static_assert(::cuda::std::is_invocable_v); - auto launcher = detail::kernel_launcher_no_config; - status = detail::launch_impl(stream, conf, launcher, kernel, args...); + static_assert(::cuda::std::is_invocable_v...>); + auto launcher = detail::kernel_launcher_no_config...>; + status = detail::launch_impl( + stream, conf, launcher, kernel, static_cast>(detail::__launch_transform(stream, args))...); } if (status != cudaSuccess) { @@ -183,18 +191,29 @@ void launch( template void launch(::cuda::stream_ref stream, const hierarchy_dimensions& dims, const Kernel& kernel, Args... args) { - [[maybe_unused]] __ensure_current_device __dev_setter(stream); + __ensure_current_device __dev_setter(stream); cudaError_t status; - if constexpr (::cuda::std::is_invocable_v, Args...>) + if constexpr (::cuda::std::is_invocable_v, as_kernel_arg_t...>) { - auto launcher = detail::kernel_launcher, Kernel, Args...>; - status = detail::launch_impl(stream, kernel_config(dims), launcher, dims, kernel, args...); + auto launcher = detail::kernel_launcher, Kernel, as_kernel_arg_t...>; + status = detail::launch_impl( + stream, + kernel_config(dims), + launcher, + dims, + kernel, + static_cast>(detail::__launch_transform(stream, args))...); } else { - static_assert(::cuda::std::is_invocable_v); - auto launcher = detail::kernel_launcher_no_config; - status = detail::launch_impl(stream, kernel_config(dims), launcher, kernel, args...); + static_assert(::cuda::std::is_invocable_v...>); + auto launcher = detail::kernel_launcher_no_config...>; + status = detail::launch_impl( + stream, + kernel_config(dims), + launcher, + kernel, + static_cast>(detail::__launch_transform(stream, args))...); } if (status != cudaSuccess) { @@ -248,10 +267,14 @@ void launch(::cuda::stream_ref stream, void (*kernel)(kernel_config, ExpArgs...), ActArgs&&... args) { - [[maybe_unused]] __ensure_current_device __dev_setter(stream); - cudaError_t status = [&](ExpArgs... 
args) { - return detail::launch_impl(stream, conf, kernel, conf, args...); - }(std::forward(args)...); + __ensure_current_device __dev_setter(stream); + cudaError_t status = detail::launch_impl( + stream, // + conf, + kernel, + conf, + static_cast>(detail::__launch_transform(stream, std::forward(args)))...); + if (status != cudaSuccess) { ::cuda::__throw_cuda_error(status, "Failed to launch a kernel"); @@ -303,10 +326,14 @@ void launch(::cuda::stream_ref stream, void (*kernel)(hierarchy_dimensions, ExpArgs...), ActArgs&&... args) { - [[maybe_unused]] __ensure_current_device __dev_setter(stream); - cudaError_t status = [&](ExpArgs... args) { - return detail::launch_impl(stream, kernel_config(dims), kernel, dims, args...); - }(std::forward(args)...); + __ensure_current_device __dev_setter(stream); + cudaError_t status = detail::launch_impl( + stream, + kernel_config(dims), + kernel, + dims, + static_cast>(detail::__launch_transform(stream, std::forward(args)))...); + if (status != cudaSuccess) { ::cuda::__throw_cuda_error(status, "Failed to launch a kernel"); @@ -320,7 +347,6 @@ void launch(::cuda::stream_ref stream, * Kernel function is a function with __global__ annotation. * Function might or might not accept the configuration as its first argument. * - * * @par Snippet * @code * #include @@ -359,10 +385,13 @@ void launch(::cuda::stream_ref stream, void (*kernel)(ExpArgs...), ActArgs&&... args) { - [[maybe_unused]] __ensure_current_device __dev_setter(stream); - cudaError_t status = [&](ExpArgs... args) { - return detail::launch_impl(stream, conf, kernel, args...); - }(std::forward(args)...); + __ensure_current_device __dev_setter(stream); + cudaError_t status = detail::launch_impl( + stream, // + conf, + kernel, + static_cast>(detail::__launch_transform(stream, std::forward(args)))...); + if (status != cudaSuccess) { ::cuda::__throw_cuda_error(status, "Failed to launch a kernel"); @@ -412,10 +441,13 @@ template void launch( ::cuda::stream_ref stream, const hierarchy_dimensions& dims, void (*kernel)(ExpArgs...), ActArgs&&... args) { - [[maybe_unused]] __ensure_current_device __dev_setter(stream); - cudaError_t status = [&](ExpArgs... args) { - return detail::launch_impl(stream, kernel_config(dims), kernel, args...); - }(std::forward(args)...); + __ensure_current_device __dev_setter(stream); + cudaError_t status = detail::launch_impl( + stream, + kernel_config(dims), + kernel, + static_cast>(detail::__launch_transform(stream, std::forward(args)))...); + if (status != cudaSuccess) { ::cuda::__throw_cuda_error(status, "Failed to launch a kernel"); diff --git a/cudax/include/cuda/experimental/__launch/launch_transform.cuh b/cudax/include/cuda/experimental/__launch/launch_transform.cuh new file mode 100644 index 0000000000..4692cf9376 --- /dev/null +++ b/cudax/include/cuda/experimental/__launch/launch_transform.cuh @@ -0,0 +1,83 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX__LAUNCH_LAUNCH_TRANSFORM +#define _CUDAX__LAUNCH_LAUNCH_TRANSFORM +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include + +#if _CCCL_STD_VER >= 2017 +namespace cuda::experimental +{ +namespace detail +{ +// Types should define overloads of __cudax_launch_transform that are find-able +// by ADL in order to customize how cudax::launch handles that type. The +// overload below, which simply returns the argument unmodified, is the overload +// that gets chosen if no other overload matches. It takes __ignore as the first +// argument to make this overload less preferred than other overloads that take +// a stream_ref as the first argument. +template +_CCCL_NODISCARD constexpr _Arg&& __cudax_launch_transform(__ignore, _Arg&& __arg) noexcept +{ + return _CUDA_VSTD::forward<_Arg>(__arg); +} + +template +using __launch_transform_direct_result_t = + decltype(__cudax_launch_transform(::cuda::stream_ref{}, _CUDA_VSTD::declval<_Arg>())); + +struct __fn +{ + template + _CCCL_NODISCARD __launch_transform_direct_result_t<_Arg> operator()(::cuda::stream_ref __stream, _Arg&& __arg) const + { + // This call is unqualified to allow ADL + return __cudax_launch_transform(__stream, _CUDA_VSTD::forward<_Arg>(__arg)); + } +}; + +template +struct __as_kernel_arg +{ + using type = _CUDA_VSTD::decay_t<__launch_transform_direct_result_t<_Arg>>; +}; + +template +struct __as_kernel_arg< + _Arg, + _CUDA_VSTD::void_t>::__as_kernel_arg>> +{ + using type = typename _CUDA_VSTD::decay_t<__launch_transform_direct_result_t<_Arg>>::__as_kernel_arg; +}; + +_CCCL_GLOBAL_CONSTANT __fn __launch_transform{}; +} // namespace detail + +template +using as_kernel_arg_t = typename detail::__as_kernel_arg<_Arg>::type; + +} // namespace cuda::experimental + +#endif // _CCCL_STD_VER >= 2017 +#endif // !_CUDAX__LAUNCH_LAUNCH_TRANSFORM diff --git a/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh b/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh index 2431d02818..839adafb96 100644 --- a/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh +++ b/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh @@ -33,7 +33,7 @@ namespace cuda::experimental //! @brief RAII helper which on construction sets the current device to the specified one or one a //! stream was created under. It sets the state back on destruction. //! -struct __ensure_current_device +struct [[maybe_unused]] __ensure_current_device { //! @brief Construct a new `__ensure_current_device` object and switch to the specified //! device. 
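Illustrative sketch only (not part of the patch): how a user-defined type can opt in to the ADL customization point added above. `my_lib`, `device_span_arg`, and `my_buffer` are hypothetical names introduced here for illustration; only `__cudax_launch_transform`, `cuda::stream_ref`, and `cudax::launch` come from the patch / CCCL. The smoke test in the following diff exercises the same mechanism with an `__as_kernel_arg` typedef; this minimal variant simply returns the kernel-argument type directly.

```cpp
#include <cstddef>

#include <cuda/stream_ref> // assumed libcu++ header providing cuda::stream_ref

namespace my_lib
{
// Trivially copyable view type that the kernel actually receives.
struct device_span_arg
{
  float* data;
  std::size_t size;
};

// Owning, host-side handle that users pass to cudax::launch.
struct my_buffer
{
  float* device_ptr;
  std::size_t count;

  // Found by ADL when cudax::launch transforms its arguments; the (decayed) result
  // type of this overload is what the kernel is actually invoked with.
  friend device_span_arg __cudax_launch_transform(::cuda::stream_ref, const my_buffer& buf) noexcept
  {
    return device_span_arg{buf.device_ptr, buf.count};
  }
};
} // namespace my_lib

// Usage, assuming `void kernel(my_lib::device_span_arg)` is a __global__ function
// and `config`/`stream` are a cudax kernel configuration and stream:
//   cudax::launch(stream, config, kernel, my_lib::my_buffer{ptr, n});
```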
diff --git a/cudax/test/launch/launch_smoke.cu b/cudax/test/launch/launch_smoke.cu index 810e65c390..29d84d2e7c 100644 --- a/cudax/test/launch/launch_smoke.cu +++ b/cudax/test/launch/launch_smoke.cu @@ -104,6 +104,50 @@ struct dynamic_smem_span } }; +struct launch_transform_to_int_convertible +{ + int value_; + + struct int_convertible + { + cudaStream_t stream_; + int value_; + + int_convertible(cudaStream_t stream, int value) noexcept + : stream_(stream) + , value_(value) + { + // Check that the constructor runs before the kernel is launched + CHECK_FALSE(kernel_run_proof); + } + + // Immovable to ensure that __launch_transform doesn't copy the returned + // object + int_convertible(int_convertible&&) = delete; + + ~int_convertible() noexcept + { + // Check that the destructor runs after the kernel is launched + CUDART(cudaStreamSynchronize(stream_)); + CHECK(kernel_run_proof); + } + + using __as_kernel_arg = int; + + // This is the value that will be passed to the kernel + explicit operator int() const + { + return value_; + } + }; + + _CCCL_NODISCARD_FRIEND int_convertible + __cudax_launch_transform(::cuda::stream_ref stream, launch_transform_to_int_convertible self) noexcept + { + return int_convertible(stream.get(), self.value_); + } +}; + // Needs a separe function for Windows extended lambda void launch_smoke_test() { @@ -127,10 +171,14 @@ void launch_smoke_test() check_kernel_run(stream); cudax::launch(stream, dims_or_conf, kernel_int_argument, 1); check_kernel_run(stream); + cudax::launch(stream, dims_or_conf, kernel_int_argument, launch_transform_to_int_convertible{1}); + check_kernel_run(stream); cudax::launch(stream, dims_or_conf, functor_int_argument(), dummy); check_kernel_run(stream); cudax::launch(stream, dims_or_conf, functor_int_argument(), 1); check_kernel_run(stream); + cudax::launch(stream, dims_or_conf, functor_int_argument(), launch_transform_to_int_convertible{1}); + check_kernel_run(stream); cudax::launch(stream, dims_or_conf, kernel_int_argument, 1U); check_kernel_run(stream); @@ -150,11 +198,15 @@ void launch_smoke_test() check_kernel_run(stream); cudax::launch(stream, config, functor_instance, ::cuda::std::move(grid_size)); check_kernel_run(stream); + cudax::launch(stream, config, functor_instance, launch_transform_to_int_convertible{grid_size}); + check_kernel_run(stream); cudax::launch(stream, config, kernel_instance, grid_size); check_kernel_run(stream); cudax::launch(stream, config, kernel_instance, ::cuda::std::move(grid_size)); check_kernel_run(stream); + cudax::launch(stream, config, kernel_instance, launch_transform_to_int_convertible{grid_size}); + check_kernel_run(stream); cudax::launch(stream, config, functor_instance, static_cast(grid_size)); check_kernel_run(stream); @@ -171,11 +223,15 @@ void launch_smoke_test() check_kernel_run(stream); cudax::launch(stream, dimensions, functor_instance, ::cuda::std::move(grid_size)); check_kernel_run(stream); + cudax::launch(stream, dimensions, functor_instance, launch_transform_to_int_convertible{grid_size}); + check_kernel_run(stream); cudax::launch(stream, dimensions, kernel_instance, grid_size); check_kernel_run(stream); cudax::launch(stream, dimensions, kernel_instance, ::cuda::std::move(grid_size)); check_kernel_run(stream); + cudax::launch(stream, dimensions, kernel_instance, launch_transform_to_int_convertible{grid_size}); + check_kernel_run(stream); cudax::launch(stream, dimensions, functor_instance, static_cast(grid_size)); check_kernel_run(stream); From 39fd05e334dda0c5e4f4f75cd7ac44591e3ffdcd Mon Sep 
17 00:00:00 2001 From: pciolkosz Date: Thu, 8 Aug 2024 00:32:06 -0700 Subject: [PATCH 09/33] Cleanup common testing headers and correct asserts in launch testing (#2204) * Cleanup common testing headers * Add test/common to cmake and fix formatting --- cudax/test/CMakeLists.txt | 1 + .../host_device.cuh} | 65 +---------------- cudax/test/common/testing.cuh | 73 +++++++++++++++++++ cudax/test/common/utility.cuh | 7 +- cudax/test/device/device_smoke.cu | 2 +- cudax/test/event/event_smoke.cu | 3 +- .../test/hierarchy/hierarchy_custom_types.cu | 2 +- cudax/test/hierarchy/hierarchy_smoke.cu | 2 +- cudax/test/launch/configuration.cu | 2 +- cudax/test/launch/launch_smoke.cu | 12 +-- cudax/test/stream/get_stream.cu | 2 +- cudax/test/stream/stream_smoke.cu | 2 +- cudax/test/utility/driver_api.cu | 2 +- cudax/test/utility/ensure_current_device.cu | 2 +- 14 files changed, 98 insertions(+), 79 deletions(-) rename cudax/test/{hierarchy/testing_common.cuh => common/host_device.cuh} (63%) create mode 100644 cudax/test/common/testing.cuh diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index 4752f8b964..cda6623668 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -26,6 +26,7 @@ function(cudax_add_catch2_test target_name_var test_name cn_target) # ARGN=test set(test_sources ${ARGN}) add_executable(${test_target} ${test_sources}) + target_include_directories(${test_target} PRIVATE "common") target_link_libraries(${test_target} PRIVATE ${cn_target} Catch2::Catch2 catch2_main) target_link_libraries(${test_target} PRIVATE ${cn_target} cudax::Thrust) target_compile_options(${test_target} PRIVATE "-DLIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE") diff --git a/cudax/test/hierarchy/testing_common.cuh b/cudax/test/common/host_device.cuh similarity index 63% rename from cudax/test/hierarchy/testing_common.cuh rename to cudax/test/common/host_device.cuh index 96dcef7369..b20ff7e923 100644 --- a/cudax/test/hierarchy/testing_common.cuh +++ b/cudax/test/common/host_device.cuh @@ -8,67 +8,10 @@ // //===----------------------------------------------------------------------===// -#ifndef __TESTING_COMMON_H__ -#define __TESTING_COMMON_H__ +#ifndef __COMMON_HOST_DEVICE_H__ +#define __COMMON_HOST_DEVICE_H__ -#include - -#include -#include -#include - -#include - -namespace cudax = cuda::experimental; - -#define CUDART(call) REQUIRE((call) == cudaSuccess) - -inline void __device__ cudax_require_impl( - bool condition, const char* condition_text, const char* filename, unsigned int linenum, const char* funcname) -{ - if (!condition) - { - // TODO do warp aggregate prints for easier readibility? 
- printf("%s:%u: %s: block: [%d,%d,%d], thread: [%d,%d,%d] Condition `%s` failed.\n", - filename, - linenum, - funcname, - blockIdx.x, - blockIdx.y, - blockIdx.z, - threadIdx.x, - threadIdx.y, - threadIdx.z, - condition_text); - __trap(); - } -} - -// TODO make it work on NVC++ -#ifdef __CUDA_ARCH__ -# define CUDAX_REQUIRE(condition) cudax_require_impl(condition, #condition, __FILE__, __LINE__, __PRETTY_FUNCTION__); -#else -# define CUDAX_REQUIRE REQUIRE -#endif - -bool constexpr __host__ __device__ operator==(const dim3& lhs, const dim3& rhs) -{ - return (lhs.x == rhs.x) && (lhs.y == rhs.y) && (lhs.z == rhs.z); -} - -namespace Catch -{ -template <> -struct StringMaker -{ - static std::string convert(dim3 const& dims) - { - std::ostringstream oss; - oss << "(" << dims.x << ", " << dims.y << ", " << dims.z << ")"; - return oss.str(); - } -}; -} // namespace Catch +#include "testing.cuh" template void __global__ lambda_launcher(const Dims dims, const Lambda lambda) @@ -155,4 +98,4 @@ void apply_each(const Fn& fn, const Tuple& tuple) tuple); } -#endif +#endif // __COMMON_HOST_DEVICE_H__ diff --git a/cudax/test/common/testing.cuh b/cudax/test/common/testing.cuh new file mode 100644 index 0000000000..ca4537fd78 --- /dev/null +++ b/cudax/test/common/testing.cuh @@ -0,0 +1,73 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __COMMON_TESTING_H__ +#define __COMMON_TESTING_H__ + +#include + +#include +#include +#include + +#include + +namespace cudax = cuda::experimental; + +#define CUDART(call) REQUIRE((call) == cudaSuccess) + +inline void __device__ cudax_require_impl( + bool condition, const char* condition_text, const char* filename, unsigned int linenum, const char* funcname) +{ + if (!condition) + { + // TODO do warp aggregate prints for easier readibility? 
+ printf("%s:%u: %s: block: [%d,%d,%d], thread: [%d,%d,%d] Condition `%s` failed.\n", + filename, + linenum, + funcname, + blockIdx.x, + blockIdx.y, + blockIdx.z, + threadIdx.x, + threadIdx.y, + threadIdx.z, + condition_text); + __trap(); + } +} + +// TODO make it work on NVC++ +#ifdef __CUDA_ARCH__ +# define CUDAX_REQUIRE(condition) cudax_require_impl(condition, #condition, __FILE__, __LINE__, __PRETTY_FUNCTION__); +#else +# define CUDAX_REQUIRE REQUIRE +#endif + +bool constexpr __host__ __device__ operator==(const dim3& lhs, const dim3& rhs) +{ + return (lhs.x == rhs.x) && (lhs.y == rhs.y) && (lhs.z == rhs.z); +} + +namespace Catch +{ +template <> +struct StringMaker +{ + static std::string convert(dim3 const& dims) + { + std::ostringstream oss; + oss << "(" << dims.x << ", " << dims.y << ", " << dims.z << ")"; + return oss.str(); + } +}; +} // namespace Catch + +#endif // __COMMON_TESTING_H__ diff --git a/cudax/test/common/utility.cuh b/cudax/test/common/utility.cuh index 64a54e1b48..991d8fd25f 100644 --- a/cudax/test/common/utility.cuh +++ b/cudax/test/common/utility.cuh @@ -8,6 +8,9 @@ // //===----------------------------------------------------------------------===// +#ifndef __COMMON_UTILITY_H__ +#define __COMMON_UTILITY_H__ + #include // cuda_runtime_api needs to come first @@ -18,8 +21,7 @@ #include // IWYU pragma: keep (needed for placement new) -// TODO unify the common testing header -#include "../hierarchy/testing_common.cuh" +#include "testing.cuh" namespace { @@ -174,3 +176,4 @@ inline void empty_driver_stack() } // namespace test } // namespace +#endif // __COMMON_UTILITY_H__ diff --git a/cudax/test/device/device_smoke.cu b/cudax/test/device/device_smoke.cu index b98d05fc3b..d13400c8db 100644 --- a/cudax/test/device/device_smoke.cu +++ b/cudax/test/device/device_smoke.cu @@ -10,8 +10,8 @@ #include -#include "../hierarchy/testing_common.cuh" #include "cuda/std/__type_traits/is_same.h" +#include namespace { diff --git a/cudax/test/event/event_smoke.cu b/cudax/test/event/event_smoke.cu index ae5286a4f7..ddf9b271d1 100644 --- a/cudax/test/event/event_smoke.cu +++ b/cudax/test/event/event_smoke.cu @@ -10,9 +10,8 @@ #include -#include "../common/utility.cuh" -#include "../hierarchy/testing_common.cuh" #include +#include namespace { diff --git a/cudax/test/hierarchy/hierarchy_custom_types.cu b/cudax/test/hierarchy/hierarchy_custom_types.cu index 5b06959eea..f35a4914ce 100644 --- a/cudax/test/hierarchy/hierarchy_custom_types.cu +++ b/cudax/test/hierarchy/hierarchy_custom_types.cu @@ -10,8 +10,8 @@ #include -#include "testing_common.cuh" #include +#include struct custom_level : public cudax::hierarchy_level { diff --git a/cudax/test/hierarchy/hierarchy_smoke.cu b/cudax/test/hierarchy/hierarchy_smoke.cu index f6f55cf9f0..b43a077b79 100644 --- a/cudax/test/hierarchy/hierarchy_smoke.cu +++ b/cudax/test/hierarchy/hierarchy_smoke.cu @@ -10,8 +10,8 @@ #include -#include "testing_common.cuh" #include +#include namespace cg = cooperative_groups; diff --git a/cudax/test/launch/configuration.cu b/cudax/test/launch/configuration.cu index 9e7f98df1b..693d00ce16 100644 --- a/cudax/test/launch/configuration.cu +++ b/cudax/test/launch/configuration.cu @@ -14,7 +14,7 @@ #include #undef cudaLaunchKernelEx -#include "../hierarchy/testing_common.cuh" +#include static cudaLaunchConfig_t expectedConfig; static bool replacementCalled = false; diff --git a/cudax/test/launch/launch_smoke.cu b/cudax/test/launch/launch_smoke.cu index 29d84d2e7c..e9c6e7730a 100644 --- a/cudax/test/launch/launch_smoke.cu +++ 
b/cudax/test/launch/launch_smoke.cu @@ -11,7 +11,7 @@ #include -#include "../hierarchy/testing_common.cuh" +#include __managed__ bool kernel_run_proof = false; @@ -37,7 +37,7 @@ struct functor_taking_config __device__ void operator()(Config conf, int grid_size) { static_assert(conf.dims.static_count(cudax::thread, cudax::block) == BlockSize); - assert(conf.dims.count(cudax::block, cudax::grid) == grid_size); + CUDAX_REQUIRE(conf.dims.count(cudax::block, cudax::grid) == grid_size); kernel_run_proof = true; } }; @@ -49,7 +49,7 @@ struct functor_taking_dims __device__ void operator()(Dimensions dims, int grid_size) { static_assert(dims.static_count(cudax::thread, cudax::block) == BlockSize); - assert(dims.count(cudax::block, cudax::grid) == grid_size); + CUDAX_REQUIRE(dims.count(cudax::block, cudax::grid) == grid_size); kernel_run_proof = true; } }; @@ -84,7 +84,7 @@ struct dynamic_smem_single { auto& dynamic_smem = cudax::dynamic_smem_ref(conf); static_assert(::cuda::std::is_same_v); - assert(__isShared(&dynamic_smem)); + CUDAX_REQUIRE(__isShared(&dynamic_smem)); kernel_run_proof = true; } }; @@ -98,8 +98,8 @@ struct dynamic_smem_span auto dynamic_smem = cudax::dynamic_smem_span(conf); static_assert(decltype(dynamic_smem)::extent == Extent); static_assert(::cuda::std::is_same_v); - assert(dynamic_smem.size() == size); - assert(__isShared(&dynamic_smem[1])); + CUDAX_REQUIRE(dynamic_smem.size() == size); + CUDAX_REQUIRE(__isShared(&dynamic_smem[1])); kernel_run_proof = true; } }; diff --git a/cudax/test/stream/get_stream.cu b/cudax/test/stream/get_stream.cu index 0654c3be39..80834bdd0f 100644 --- a/cudax/test/stream/get_stream.cu +++ b/cudax/test/stream/get_stream.cu @@ -10,8 +10,8 @@ #include -#include "../common/utility.cuh" #include +#include TEST_CASE("Can call get_stream on a cudaStream_t", "[stream]") { diff --git a/cudax/test/stream/stream_smoke.cu b/cudax/test/stream/stream_smoke.cu index cbee352080..50e55352a4 100644 --- a/cudax/test/stream/stream_smoke.cu +++ b/cudax/test/stream/stream_smoke.cu @@ -11,8 +11,8 @@ #include #include -#include "../common/utility.cuh" #include +#include constexpr auto one_thread_dims = cudax::make_hierarchy(cudax::block_dims<1>(), cudax::grid_dims<1>()); diff --git a/cudax/test/utility/driver_api.cu b/cudax/test/utility/driver_api.cu index e5fd64d14f..5955802fe1 100644 --- a/cudax/test/utility/driver_api.cu +++ b/cudax/test/utility/driver_api.cu @@ -10,7 +10,7 @@ #include -#include "../hierarchy/testing_common.cuh" +#include TEST_CASE("Call each driver api", "[utility]") { diff --git a/cudax/test/utility/ensure_current_device.cu b/cudax/test/utility/ensure_current_device.cu index 89efc7d4f6..cdf8effcd6 100644 --- a/cudax/test/utility/ensure_current_device.cu +++ b/cudax/test/utility/ensure_current_device.cu @@ -13,7 +13,7 @@ #include #include -#include "../common/utility.cuh" +#include namespace driver = cuda::experimental::detail::driver; From c9a7b6ad1b41b7b382ee6945ca7b44b687aa29b9 Mon Sep 17 00:00:00 2001 From: pciolkosz Date: Thu, 8 Aug 2024 00:41:04 -0700 Subject: [PATCH 10/33] [CUDAX] Add an API to get device_ref from stream and add comparison operator to device_ref (#2203) * Add a way to compare device_refs * Add a way to query device_ref from a stream * Fix Windows missing cast * Apply suggestions from code review Co-authored-by: Michael Schellenberger Costa * Disallow device comparision with int --------- Co-authored-by: Michael Schellenberger Costa --- .../cuda/experimental/__device/device.cuh | 8 ++++++ 
.../cuda/experimental/__device/device_ref.cuh | 28 +++++++++++++++++++ .../cuda/experimental/__event/event_ref.cuh | 8 +++--- .../cuda/experimental/__stream/stream.cuh | 13 +++++++++ cudax/test/device/device_smoke.cu | 16 +++++++++++ cudax/test/stream/stream_smoke.cu | 12 ++++++++ 6 files changed, 81 insertions(+), 4 deletions(-) diff --git a/cudax/include/cuda/experimental/__device/device.cuh b/cudax/include/cuda/experimental/__device/device.cuh index 5532e8f59b..145ce4c10e 100644 --- a/cudax/include/cuda/experimental/__device/device.cuh +++ b/cudax/include/cuda/experimental/__device/device.cuh @@ -117,6 +117,14 @@ private: device(const device&) = delete; device& operator=(device&&) = delete; device& operator=(const device&) = delete; + + friend bool operator==(const device& __lhs, int __rhs) = delete; + friend bool operator==(int __lhs, const device& __rhs) = delete; + +#if _CCCL_STD_VER <= 2017 + friend bool operator!=(const device& __lhs, int __rhs) = delete; + friend bool operator!=(int __lhs, const device& __rhs) = delete; +#endif // _CCCL_STD_VER <= 2017 }; namespace detail diff --git a/cudax/include/cuda/experimental/__device/device_ref.cuh b/cudax/include/cuda/experimental/__device/device_ref.cuh index 91e4e90caa..5c7b89779e 100644 --- a/cudax/include/cuda/experimental/__device/device_ref.cuh +++ b/cudax/include/cuda/experimental/__device/device_ref.cuh @@ -54,6 +54,34 @@ public: return __id_; } + //! @brief Compares two `device_ref`s for equality + //! + //! @note Allows comparison with `int` due to implicit conversion to + //! `device_ref`. + //! + //! @param __lhs The first `device_ref` to compare + //! @param __rhs The second `device_ref` to compare + //! @return true if `lhs` and `rhs` refer to the same device ordinal + _CCCL_NODISCARD_FRIEND constexpr bool operator==(device_ref __lhs, device_ref __rhs) noexcept + { + return __lhs.__id_ == __rhs.__id_; + } + +#if _CCCL_STD_VER <= 2017 + //! @brief Compares two `device_ref`s for inequality + //! + //! @note Allows comparison with `int` due to implicit conversion to + //! `device_ref`. + //! + //! @param __lhs The first `device_ref` to compare + //! @param __rhs The second `device_ref` to compare + //! @return true if `lhs` and `rhs` refer to different device ordinal + _CCCL_NODISCARD_FRIEND constexpr bool operator!=(device_ref __lhs, device_ref __rhs) noexcept + { + return __lhs.__id_ != __rhs.__id_; + } +#endif // _CCCL_STD_VER <= 2017 + //! @brief Retrieve the specified attribute for the device //! //! @param __attr The attribute to query. See `device::attrs` for the available diff --git a/cudax/include/cuda/experimental/__event/event_ref.cuh b/cudax/include/cuda/experimental/__event/event_ref.cuh index 3b0ccc6dbc..bf1c1b398c 100644 --- a/cudax/include/cuda/experimental/__event/event_ref.cuh +++ b/cudax/include/cuda/experimental/__event/event_ref.cuh @@ -111,8 +111,8 @@ public: //! @note Allows comparison with `cudaEvent_t` due to implicit conversion to //! `event_ref`. //! - //! @param lhs The first `event_ref` to compare - //! @param rhs The second `event_ref` to compare + //! @param __lhs The first `event_ref` to compare + //! @param __rhs The second `event_ref` to compare //! @return true if `lhs` and `rhs` refer to the same `cudaEvent_t` object. _CCCL_NODISCARD_FRIEND constexpr bool operator==(event_ref __lhs, event_ref __rhs) noexcept { @@ -124,8 +124,8 @@ public: //! @note Allows comparison with `cudaEvent_t` due to implicit conversion to //! `event_ref`. //! - //! @param lhs The first `event_ref` to compare - //! 
@param rhs The second `event_ref` to compare + //! @param __lhs The first `event_ref` to compare + //! @param __rhs The second `event_ref` to compare //! @return true if `lhs` and `rhs` refer to different `cudaEvent_t` objects. _CCCL_NODISCARD_FRIEND constexpr bool operator!=(event_ref __lhs, event_ref __rhs) noexcept { diff --git a/cudax/include/cuda/experimental/__stream/stream.cuh b/cudax/include/cuda/experimental/__stream/stream.cuh index 0ba125269b..27f0f698db 100644 --- a/cudax/include/cuda/experimental/__stream/stream.cuh +++ b/cudax/include/cuda/experimental/__stream/stream.cuh @@ -161,6 +161,19 @@ struct stream : stream_ref wait(__tmp); } + //! @brief Get device under which this stream was created. + //! + //! @throws cuda_error if device check fails + device_ref device() const + { + // Because the stream can come from_native_handle, we can't just loop over devices comparing contexts, + // lower to CUDART for this instead + __ensure_current_device __dev_setter(*this); + int result; + _CCCL_TRY_CUDA_API(cudaGetDevice, "Could not get device from a stream", &result); + return result; + } + //! @brief Construct an `stream` object from a native `cudaStream_t` handle. //! //! @param __handle The native handle diff --git a/cudax/test/device/device_smoke.cu b/cudax/test/device/device_smoke.cu index d13400c8db..f725bc7f35 100644 --- a/cudax/test/device/device_smoke.cu +++ b/cudax/test/device/device_smoke.cu @@ -35,6 +35,16 @@ TEST_CASE("Smoke", "[device]") using cudax::device; using cudax::device_ref; + SECTION("Compare") + { + CUDAX_REQUIRE(device_ref{0} == device_ref{0}); + CUDAX_REQUIRE(device_ref{0} == 0); + CUDAX_REQUIRE(0 == device_ref{0}); + CUDAX_REQUIRE(device_ref{1} != device_ref{0}); + CUDAX_REQUIRE(device_ref{1} != 2); + CUDAX_REQUIRE(1 != device_ref{2}); + } + SECTION("Attributes") { ::test_device_attribute(); @@ -272,13 +282,19 @@ TEST_CASE("global devices vector", "[device]") CUDAX_REQUIRE(cudax::devices.size() == static_cast(cudax::devices.end() - cudax::devices.begin())); CUDAX_REQUIRE(0 == cudax::devices[0].get()); + CUDAX_REQUIRE(cudax::device_ref{0} == cudax::devices[0]); + CUDAX_REQUIRE(0 == (*cudax::devices.begin()).get()); + CUDAX_REQUIRE(cudax::device_ref{0} == *cudax::devices.begin()); + CUDAX_REQUIRE(0 == cudax::devices.begin()->get()); CUDAX_REQUIRE(0 == cudax::devices.begin()[0].get()); if (cudax::devices.size() > 1) { CUDAX_REQUIRE(1 == cudax::devices[1].get()); + CUDAX_REQUIRE(cudax::device_ref{0} != cudax::devices[1].get()); + CUDAX_REQUIRE(1 == (*std::next(cudax::devices.begin())).get()); CUDAX_REQUIRE(1 == std::next(cudax::devices.begin())->get()); CUDAX_REQUIRE(1 == cudax::devices.begin()[1].get()); diff --git a/cudax/test/stream/stream_smoke.cu b/cudax/test/stream/stream_smoke.cu index 50e55352a4..90d7743810 100644 --- a/cudax/test/stream/stream_smoke.cu +++ b/cudax/test/stream/stream_smoke.cu @@ -102,3 +102,15 @@ TEST_CASE("Stream priority", "[stream]") cudax::stream stream(0, priority); CUDAX_REQUIRE(stream.priority() == priority); } + +TEST_CASE("Stream get device", "[stream]") +{ + cudax::stream dev0_stream(cudax::device_ref{0}); + CUDAX_REQUIRE(dev0_stream.device() == 0); + + cudaSetDevice(static_cast(cudax::devices.size() - 1)); + cudaStream_t stream_handle; + CUDART(cudaStreamCreate(&stream_handle)); + auto stream_cudart = cudax::stream::from_native_handle(stream_handle); + CUDAX_REQUIRE(stream_cudart.device() == *std::prev(cudax::devices.end())); +} From 3ebf8cc873500d5e22ab0a8957b2baa32035f583 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: 
Thu, 8 Aug 2024 11:52:11 -0500 Subject: [PATCH 11/33] Update devcontainer docs for WSL (#2200) * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * [pre-commit.ci] auto code formatting * Why was 6 afraid of 7? --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .devcontainer/README.md | 209 +++++++++++++++++++++++----------------- 1 file changed, 121 insertions(+), 88 deletions(-) diff --git a/.devcontainer/README.md b/.devcontainer/README.md index d854931292..17486a4065 100644 --- a/.devcontainer/README.md +++ b/.devcontainer/README.md @@ -1,26 +1,32 @@ > **Note** -> The instructions in this README are specific to Linux development environments. Instructions for Windows are coming soon! +> The instructions in this README are specific to Linux development environments (including WSL on Windows). Instructions for native Windows development (e.g., `msvc`) are coming soon! [![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/NVIDIA/cccl?quickstart=1&devcontainer_path=.devcontainer%2Fdevcontainer.json) # CCCL Dev Containers -CCCL uses [Development Containers](https://containers.dev/) to provide consistent and convenient development environments for both local development and for CI. This guide covers setup in [Visual Studio Code](#quickstart-vscode-recommended) and [Docker](#quickstart-docker-manual-approach). The guide also provides additional instructions in case you want use WSL. +CCCL uses [Dev Containers](https://containers.dev/) to provide consistent and convenient development environments for both local development and for CI. -## Table of Contents -1. [Quickstart: VSCode (Recommended)](#vscode) -2. [Quickstart: Docker (Manual Approach)](#docker) -3. [Quickstart: Using WSL](#wsl) +VSCode offers the most convenient experience with Dev Containers due to its tight native integration, however, our containers are also fully usable without VSCode by leveraging Docker directly. -## Quickstart: VSCode (Recommended) +## Table of Contents +1. [Quickstart: VSCode on Linux (Recommended)](#vscode) +2. [Quickstart: VSCode on WSL (Recommended for Windows)](#wsl) +3. [Quickstart: Docker on Linux (Manual Approach)](#docker) +## Quickstart: VSCode on Linux (Recommended) ### Prerequisites - [Visual Studio Code](https://code.visualstudio.com/) - [Remote - Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) - [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) - [Docker](https://docs.docker.com/engine/install/) - This is only for completeness because it should already be implicitly installed by the Dev Containers extension -### Steps +#### GPU Prerequisites (only needed for executing tests that require a GPU) +- Supported NVIDIA GPU +- [NVIDIA Driver](https://www.nvidia.com/Download/index.aspx?lang=en-us) +- [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) + +### Steps 1. Clone the Repository ```bash @@ -32,7 +38,7 @@ CCCL uses [Development Containers](https://containers.dev/) to provide consisten ![Shows "Reopen in Container" prompt when opening the cccl directory in VScode.](./img/reopen_in_container.png) - - Alternatively, use the Command Palette to start a Dev Container. Press `Ctrl+Shift+P` to open the Command Palette. 
Type "Remote-Containers: Reopen in Container" and select it. + - Alternatively, use `ctrl+shift+p` to open the Command Palette and type "Remote-Containers: Reopen in Container" and select it. ![Shows "Reopen in Container" in command pallete.](./img/open_in_container_manual.png) @@ -42,11 +48,14 @@ CCCL uses [Development Containers](https://containers.dev/) to provide consisten 5. VSCode will initialize the selected Dev Container. This can take a few minutes the first time. -6. Once initialized, the local `cccl/` directory is mirrored into the container to ensure any changes are persistent. +6. (Optional) Authenticate with GitHub + - After container startup, you will be asked if you would like to authenticate with GitHub. This is for access to CCCL's distributed `sccache` storage. If you are not an NVIDIA employee, you can safely ignore this step. For more information, see the [`sccache`](#sccache) section below. -7. Done! See the [contributing guide](../CONTRIBUTING.md#building-and-testing) for instructions on how to build and run tests. +7. Once initialized, the local `cccl/` directory is mirrored into the container to ensure any changes are persistent. -### (Optional) Authenticate with GitHub for `sccache` +8. Done! See the [contributing guide](../CONTRIBUTING.md#building-and-testing) for instructions on how to build and run tests. + +### (Optional) Authenticate with GitHub for `sccache` After starting the container, there will be a prompt to authenticate with GitHub. This grants access to a [`sccache`](https://github.com/mozilla/sccache) server shared with CI and greatly accelerates local build times. This is currently limited to NVIDIA employees belonging to the `NVIDIA` or `rapidsai` GitHub organizations. @@ -60,11 +69,110 @@ To manually trigger this authentication, execute the `devcontainer-utils-vault-s For more information about the sccache configuration and authentication, see the documentation at [`rapidsai/devcontainers`](https://github.com/rapidsai/devcontainers/blob/branch-23.10/USAGE.md#build-caching-with-sccache). +## Quickstart: VSCode on WSL (Recommended for Windows) + +Windows Subsystem for Linux (WSL) enables you to run a Linux environment directly in Windows. +This isn't for native Windows development (e.g., compiling with `msvc`), but effectively a more convenient option than setting up a dual-boot Linux/Windows machine. +Apart from the initial setup of WSL, the process for using CCCL's Dev Containers in WSL is effectively the same as the instructions for Linux, because WSL _is_ Linux. 
+ +### Prerequisites +- Windows OS that supports WSL 2 (Windows 11 or newer) +- [Windows Subsystem for Linux v2 (WSL 2)](https://learn.microsoft.com/en-us/windows/wsl/install) +- [Visual Studio Code](https://code.visualstudio.com/) (installed on Windows host) +- [VSCode Remote Development Extension Pack](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.vscode-remote-extensionpack) (installed on Windows host) + - Includes [Dev Containers](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) and [WSL](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-wsl) extensions +- [Docker](https://docs.docker.com/engine/install/) - (Will be installed automatically by the Remote Development extension) + +#### GPU Prerequisites (only needed for executing tests that require a GPU) +- Supported NVIDIA GPU +- [NVIDIA Driver](https://www.nvidia.com/Download/index.aspx?lang=en-us) (installed on Windows host) +- [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) (**installed inside WSL**) + +For more details see the official NVIDIA [Getting Started with CUDA on WSL guide](https://docs.nvidia.com/cuda/wsl-user-guide/index.html#getting-started-with-cuda-on-wsl-2). + +### Install WSL on your Windows host +Refer to [Microsoft's documentation](https://learn.microsoft.com/en-us/windows/wsl/install) for the full instructions to install WSL2. + +
+ Click here for the TL;DR version +1. Run `Powershell` as an administrator +![image](https://github.com/user-attachments/assets/2c985887-ca6c-46bc-9e1b-f235ccfd8513) + +2. Install WSL 2 by running: +```bash +> wsl --install +``` +3. Restart your computer +4. If this is your first time installing WSL, upon restarting, it will prompt you to create a username/password to use inside WSL. +5. Verify `wsl` was succesfully installed by opening Powershell again and run +```bash +> wsl -l -v + NAME STATE VERSION +* Ubuntu Running 2 +``` +5. Launch `wsl` and verify your Linux environment +``` +# In Powershell, start WSL, which will drop you into a terminal session running in Linux +> wsl + +# In the new terminal session, verify your Linux environment by changing to your home directory +# and displaying the current directory. This should show `/home/*YOUR USER NAME*` +> cd ~ +> pwd +/home/jhemstad +``` + +Congratulations! You now have WSL installed and can use it as you would a normal Ubuntu/Linux installation. +This is sufficient for *building* CCCL's tests, if you have a GPU on your system and you would like to use it to run the tests, continue below: + +6. (Optional) Install `nvidia-container-toolkit` +See [here](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-apt) for full instructions. + +**Important:** `nvidia-container-toolkit` needs to be installed inside WSL (not on the Windows host). The following commands should be run within the Linux environment. + +```bash +$ curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ + && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list +$ sudo apt-get update +$ sudo apt-get install -y nvidia-container-toolkit +``` + +Then configure Docker to use the `nvidia-container-toolkit`: +```bash +$ sudo nvidia-ctk runtime configure --runtime=docker +$ sudo systemctl restart docker +``` + +7. (Optional) Verify your GPU is available inside WSL +Use `nvidia-smi` inside of WSL to verify that your GPU is correctly configured and available from inside the container. +If not, verify that the NVIDIA GPU driver is correctly installed on your Windows host and `nvidia-container-toolkit` was successfully installed inside of WSL. +```bash +$ nvidia-smi +``` +
+
+### Connect VSCode to WSL
+1. Launch VSCode on your Windows host
+
+2. Connect VSCode to your WSL instance
+- Press `Ctrl + Shift + P` to open the Command Palette, type "WSL", and select "WSL: Connect to WSL"
+  - If you don't see this option, you need to install the [WSL VSCode Extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-wsl) (comes with the [Remote Development pack](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.vscode-remote-extensionpack))
+![image](https://github.com/user-attachments/assets/3e0e6af7-4251-4ce9-9204-589ad7daa12a)
+  - To verify VSCode is connected to WSL, you should see the following in the bottom left corner: ![Shows the WSL: Ubuntu status for a successful connection to WSL.](https://github.com/user-attachments/assets/26dbba61-cc96-4ac3-8200-fdb26a8e4a4b)
+
+3. VSCode is now attached to WSL, which is equivalent to running in a native Linux environment. You can now proceed as described in the [section above](#vscode-devcontainer-steps).
+
 ## Quickstart: Docker (Manual Approach)
 
 ### Prerequisites
+- [Docker](https://docs.docker.com/engine/install/)
+
+#### GPU Prerequisites (only needed for executing tests that require a GPU)
+- Supported NVIDIA GPU
+- [NVIDIA Driver](https://www.nvidia.com/Download/index.aspx?lang=en-us)
 - [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
-- [Docker](https://docs.docker.com/desktop/install/linux-install/)
 
 ### Steps
 1. Clone the repository and use the [`launch.sh`](./launch.sh) script to launch the default container environment
@@ -122,78 +230,3 @@ Click the badge above or [click here](https://codespaces.new/NVIDIA/cccl?quickst
 For more information, see the `.devcontainer/make_devcontainers.sh --help` message.
 
 **Note**: When adding or updating supported environments, modify `matrix.yaml` and then rerun this script to synchronize the `devcontainer` configurations.
-
-## Quickstart: Using WSL
-
-> [!NOTE]
-> _Make sure you have the Nvidia driver installed on your Windows host before moving further_. Type in `nvidia-smi` for verification.
-
-### Install WSL on your Windows host
-
-> [!WARNING]
-> Disclaimer: This guide was developed for WSL 2 on Windows 11.
-
-1. Launch a Windows terminal (_e.g. Powershell_) as an administrator.
-
-2. Install WSL 2 by running:
-```bash
-wsl --install
-```
-This should probably install Ubuntu distro as a default.
-
-3. Restart your computer and run `wsl -l -v` on a Windows terminal to verify installation.
-
-

-### Install prerequisites and VS Code extensions
- -4. Launch your WSL/Ubuntu terminal by running `wsl` in Powershell. - -5. Install the [WSL extension](ms-vscode-remote.remote-wsl) on VS Code. - - - `Ctrl + Shift + P` and select `WSL: Connect to WSL` (it will prompt you to install the WSL extension). - - - Make sure you are connected to WSL with VS Code by checking the bottom left corner of the VS Code window (should indicate "WSL: Ubuntu" in our case). - -6. Install the [Dev Containers extension](ms-vscode-remote.remote-containers) on VS Code. - - - In a vanilla system you should be prompted to install `Docker` at this point, accept it. If it hangs you might have to restart VS Code after that. - -7. Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html). **Make sure you install the WSL 2 version and not the native Linux one**. This builds on top of Docker so make sure you have Docker properly installed (run `docker --version`). - -8. Open `/etc/docker/daemon.json` from within your WSL system (if the file does not exist, create it) and add the following: - -```json -{ - "runtimes": { - "nvidia": { - "path": "nvidia-container-runtime", - "runtimeArgs": [] - } - } -} -``` - -then run `sudo systemctl restart docker.service`. - ---- -### Build CCCL in WSL using Dev Containers - -9. Still on your WSL terminal run `git clone https://github.com/NVIDIA/cccl.git` - - -10. Open the CCCL cloned repo in VS Code ( `Ctrl + Shift + P `, select `File: Open Folder...` and select the path where your CCCL clone is located). - -11. If prompted, choose `Reopen in Container`. - - - If you are not prompted just type `Ctrl + Shift + P` and `Dev Containers: Open Folder in Container ...`. - -12. Verify that Dev Container was configured properly by running `nvidia-smi` in your Dev Container terminal. For a proper configuration it is important for the steps in [Install prerequisites and VS Code extensions](#prereqs) to be followed in a precise order. - -From that point on, the guide aligns with our [existing Dev Containers native Linux guide](https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md) with just one minor potential alteration: - -13. If WSL was launched without the X-server enabled, when asked to "authenticate Git with your Github credentials", if you answer **Yes**, the browser might not open automatically, with the following error message. - -> Failed opening a web browser at https://github.com/login/device - exec: "xdg-open,x-www-browser,www-browser,wslview": executable file not found in $PATH - Please try entering the URL in your browser manually - -In that case type in the address manually in your web browser https://github.com/login/device and fill in the one-time code. 
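+
+For reference, the condensed end-to-end flow for working on CCCL from a WSL terminal looks roughly like this (assuming the `code` command was added to your `PATH` by VSCode's WSL integration, which is the default):
+
+```bash
+# Clone CCCL inside WSL and open it in VSCode; when prompted, choose "Reopen in Container"
+git clone https://github.com/NVIDIA/cccl.git
+cd cccl
+code .
+```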
From f95f2113c32cc228df165137ad62743a180731f0 Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Thu, 8 Aug 2024 17:33:11 -0700 Subject: [PATCH 12/33] add `cudax::distribute(numElements)` as a way to evenly distribute elements over thread blocks (#2210) --- .../__hierarchy/hierarchy_dimensions.cuh | 26 +++++++++++++++++++ cudax/test/hierarchy/hierarchy_smoke.cu | 10 +++++++ 2 files changed, 36 insertions(+) diff --git a/cudax/include/cuda/experimental/__hierarchy/hierarchy_dimensions.cuh b/cudax/include/cuda/experimental/__hierarchy/hierarchy_dimensions.cuh index 48d4b38b1d..3cbad5f0a9 100644 --- a/cudax/include/cuda/experimental/__hierarchy/hierarchy_dimensions.cuh +++ b/cudax/include/cuda/experimental/__hierarchy/hierarchy_dimensions.cuh @@ -839,6 +839,32 @@ constexpr auto hierarchy_add_level(const hierarchy_dimensions_fragment(level); } +/** + * @brief A shorthand for creating a hierarchy of CUDA threads by evenly + * distributing elements among blocks and threads. + * + * @par Snippet + * @code + * #include + * using namespace cuda::experimental; + * + * constexpr int threadsPerBlock = 256; + * auto dims = distribute(numElements); + * + * // Equivalent to: + * constexpr int threadsPerBlock = 256; + * int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + * auto dims = make_hierarchy(grid_dims(blocksPerGrid), block_dims()); + * @endcode + */ +template +constexpr auto distribute(int numElements) noexcept +{ + int blocksPerGrid = (numElements + _ThreadsPerBlock - 1) / _ThreadsPerBlock; + return ::cuda::experimental::make_hierarchy( + ::cuda::experimental::grid_dims(blocksPerGrid), ::cuda::experimental::block_dims<_ThreadsPerBlock>()); +} + } // namespace cuda::experimental #endif // _CCCL_STD_VER >= 2017 #endif // _CUDAX__HIERARCHY_HIERARCHY_DIMENSIONS diff --git a/cudax/test/hierarchy/hierarchy_smoke.cu b/cudax/test/hierarchy/hierarchy_smoke.cu index b43a077b79..fc78ca4504 100644 --- a/cudax/test/hierarchy/hierarchy_smoke.cu +++ b/cudax/test/hierarchy/hierarchy_smoke.cu @@ -512,3 +512,13 @@ TEST_CASE("Trivially constructable", "[hierarchy]") // static_assert(std::is_trivially_copyable_v(), // cudax::grid_dims<256>()))>); } + +TEST_CASE("cudax::distribute", "[hierarchy]") +{ + int numElements = 50000; + constexpr int threadsPerBlock = 256; + auto dims = cudax::distribute(numElements); + + CUDAX_REQUIRE(dims.count(cudax::thread, cudax::block) == 256); + CUDAX_REQUIRE(dims.count(cudax::block, cudax::grid) == (numElements + threadsPerBlock - 1) / threadsPerBlock); +} From 8e20c9a3cde9c725df40a91f83ea2ab66f5d40a3 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 9 Aug 2024 17:14:51 +0200 Subject: [PATCH 13/33] Rework mdspan concept emulation (#2213) It is proving difficult to handle for msvc and also the one we are using in libcu++ it much cleaner Gets #2160 compiling on MSVC --- .../cuda/std/__mdspan/default_accessor.h | 7 +- .../include/cuda/std/__mdspan/extents.h | 95 +++++----- .../include/cuda/std/__mdspan/layout_left.h | 27 ++- .../include/cuda/std/__mdspan/layout_right.h | 27 ++- .../include/cuda/std/__mdspan/layout_stride.h | 111 ++++++------ libcudacxx/include/cuda/std/__mdspan/macros.h | 162 +----------------- libcudacxx/include/cuda/std/__mdspan/mdspan.h | 160 +++++++---------- .../include/cuda/std/__mdspan/static_array.h | 20 +-- .../include/cuda/std/__mdspan/submdspan.h | 22 +-- .../views/mdspan/foo_customizations.hpp | 22 +-- 10 files changed, 216 insertions(+), 437 deletions(-) diff --git 
a/libcudacxx/include/cuda/std/__mdspan/default_accessor.h b/libcudacxx/include/cuda/std/__mdspan/default_accessor.h index 33bef7cb07..ea0924915d 100644 --- a/libcudacxx/include/cuda/std/__mdspan/default_accessor.h +++ b/libcudacxx/include/cuda/std/__mdspan/default_accessor.h @@ -72,10 +72,9 @@ struct default_accessor __MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr default_accessor() noexcept = default; - __MDSPAN_TEMPLATE_REQUIRES(class _OtherElementType, - /* requires */ (_CCCL_TRAIT(is_convertible, _OtherElementType (*)[], element_type (*)[]))) - __MDSPAN_INLINE_FUNCTION - constexpr default_accessor(default_accessor<_OtherElementType>) noexcept {} + _LIBCUDACXX_TEMPLATE(class _OtherElementType) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(is_convertible, _OtherElementType (*)[], element_type (*)[])) + __MDSPAN_INLINE_FUNCTION constexpr default_accessor(default_accessor<_OtherElementType>) noexcept {} __MDSPAN_INLINE_FUNCTION constexpr data_handle_type offset(data_handle_type __p, size_t __i) const noexcept diff --git a/libcudacxx/include/cuda/std/__mdspan/extents.h b/libcudacxx/include/cuda/std/__mdspan/extents.h index 58ab181afc..c8177542da 100644 --- a/libcudacxx/include/cuda/std/__mdspan/extents.h +++ b/libcudacxx/include/cuda/std/__mdspan/extents.h @@ -248,16 +248,13 @@ class extents __MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr extents() noexcept = default; // Converting constructor - __MDSPAN_TEMPLATE_REQUIRES( - class _OtherIndexType, - size_t... _OtherExtents, - /* requires */ - ( - /* multi-stage check to protect from invalid pack expansion when sizes don't match? */ - decltype(__detail::__check_compatible_extents( - integral_constant{}, - _CUDA_VSTD::integer_sequence{}, - _CUDA_VSTD::integer_sequence{}))::value)) + _LIBCUDACXX_TEMPLATE(class _OtherIndexType, size_t... _OtherExtents) + _LIBCUDACXX_REQUIRES( + /* multi-stage check to protect from invalid pack expansion when sizes don't match? */ + (decltype(__detail::__check_compatible_extents( + integral_constant{}, + _CUDA_VSTD::integer_sequence{}, + _CUDA_VSTD::integer_sequence{}))::value)) __MDSPAN_INLINE_FUNCTION __MDSPAN_CONDITIONAL_EXPLICIT( (((_Extents != dynamic_extent) && (_OtherExtents == dynamic_extent)) || ...) @@ -287,23 +284,23 @@ class extents } # ifdef __NVCC__ - __MDSPAN_TEMPLATE_REQUIRES( - class... _Integral, - /* requires */ ( - // TODO: check whether the other version works with newest NVCC, doesn't with 11.4 - // NVCC seems to pick up rank_dynamic from the wrong extents type??? - __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Integral, index_type) /* && ... */) - && __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Integral) /* && ... */) && - // NVCC chokes on the fold thingy here so wrote the workaround - ((sizeof...(_Integral) == __detail::__count_dynamic_extents<_Extents...>::val) - || (sizeof...(_Integral) == sizeof...(_Extents))))) + _LIBCUDACXX_TEMPLATE(class... _Integral) + _LIBCUDACXX_REQUIRES( + // TODO: check whether the other version works with newest NVCC, doesn't with 11.4 + // NVCC seems to pick up rank_dynamic from the wrong extents type??? + __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Integral, index_type) /* && ... */) + _LIBCUDACXX_AND __MDSPAN_FOLD_AND( + _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Integral) /* && ... 
*/) _LIBCUDACXX_AND + // NVCC chokes on the fold thingy here so wrote the workaround + ((sizeof...(_Integral) == __detail::__count_dynamic_extents<_Extents...>::val) + || (sizeof...(_Integral) == sizeof...(_Extents)))) # else - __MDSPAN_TEMPLATE_REQUIRES( - class... _Integral, - /* requires */ ( - __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Integral, index_type) /* && ... */) - && __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Integral) /* && ... */) - && ((sizeof...(_Integral) == rank_dynamic()) || (sizeof...(_Integral) == rank())))) + _LIBCUDACXX_TEMPLATE(class... _Integral) + _LIBCUDACXX_REQUIRES( + __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Integral, index_type) /* && ... */) + _LIBCUDACXX_AND __MDSPAN_FOLD_AND( + _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Integral) /* && ... */) + _LIBCUDACXX_AND((sizeof...(_Integral) == rank_dynamic()) || (sizeof...(_Integral) == rank()))) # endif __MDSPAN_INLINE_FUNCTION explicit constexpr extents(_Integral... __exts) noexcept @@ -337,21 +334,16 @@ class extents # ifdef __NVCC__ // NVCC seems to pick up rank_dynamic from the wrong extents type??? // NVCC chokes on the fold thingy here so wrote the workaround - __MDSPAN_TEMPLATE_REQUIRES( - class _IndexType, - size_t _Np, - /* requires */ - (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _IndexType, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _IndexType) - && ((_Np == __detail::__count_dynamic_extents<_Extents...>::val) || (_Np == sizeof...(_Extents))))) + _LIBCUDACXX_TEMPLATE(class _IndexType, size_t _Np) + _LIBCUDACXX_REQUIRES( + _CCCL_TRAIT(_CUDA_VSTD::is_convertible, _IndexType, index_type) + _LIBCUDACXX_AND _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _IndexType) + _LIBCUDACXX_AND((_Np == __detail::__count_dynamic_extents<_Extents...>::val) || (_Np == sizeof...(_Extents)))) # else - __MDSPAN_TEMPLATE_REQUIRES( - class _IndexType, - size_t _Np, - /* requires */ - (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _IndexType, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _IndexType) - && (_Np == rank() || _Np == rank_dynamic()))) + _LIBCUDACXX_TEMPLATE(class _IndexType, size_t _Np) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _IndexType, index_type) + _LIBCUDACXX_AND _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _IndexType) + _LIBCUDACXX_AND(_Np == rank() || _Np == rank_dynamic())) # endif __MDSPAN_CONDITIONAL_EXPLICIT(_Np != rank_dynamic()) __MDSPAN_INLINE_FUNCTION @@ -386,21 +378,16 @@ class extents # ifdef __NVCC__ // NVCC seems to pick up rank_dynamic from the wrong extents type??? 
// NVCC chokes on the fold thingy here so wrote the workaround - __MDSPAN_TEMPLATE_REQUIRES( - class _IndexType, - size_t _Np, - /* requires */ - (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _IndexType, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _IndexType) - && ((_Np == __detail::__count_dynamic_extents<_Extents...>::val) || (_Np == sizeof...(_Extents))))) + _LIBCUDACXX_TEMPLATE(class _IndexType, size_t _Np) + _LIBCUDACXX_REQUIRES( + _CCCL_TRAIT(_CUDA_VSTD::is_convertible, _IndexType, index_type) + _LIBCUDACXX_AND _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _IndexType) + _LIBCUDACXX_AND((_Np == __detail::__count_dynamic_extents<_Extents...>::val) || (_Np == sizeof...(_Extents)))) # else - __MDSPAN_TEMPLATE_REQUIRES( - class _IndexType, - size_t _Np, - /* requires */ - (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _IndexType, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _IndexType) - && (_Np == rank() || _Np == rank_dynamic()))) + _LIBCUDACXX_TEMPLATE(class _IndexType, size_t _Np) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _IndexType, index_type) + _LIBCUDACXX_AND _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _IndexType) + _LIBCUDACXX_AND(_Np == rank() || _Np == rank_dynamic())) # endif __MDSPAN_CONDITIONAL_EXPLICIT(_Np != rank_dynamic()) __MDSPAN_INLINE_FUNCTION diff --git a/libcudacxx/include/cuda/std/__mdspan/layout_left.h b/libcudacxx/include/cuda/std/__mdspan/layout_left.h index b0cdde455c..9d0842515c 100644 --- a/libcudacxx/include/cuda/std/__mdspan/layout_left.h +++ b/libcudacxx/include/cuda/std/__mdspan/layout_left.h @@ -121,8 +121,8 @@ class layout_left::mapping : __extents(__exts) {} - __MDSPAN_TEMPLATE_REQUIRES(class _OtherExtents, - /* requires */ (_CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, _OtherExtents))) + _LIBCUDACXX_TEMPLATE(class _OtherExtents) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, _OtherExtents)) __MDSPAN_CONDITIONAL_EXPLICIT((!_CUDA_VSTD::is_convertible<_OtherExtents, extents_type>::value)) // needs two () due // to comma __MDSPAN_INLINE_FUNCTION constexpr mapping( @@ -135,9 +135,9 @@ class layout_left::mapping */ } - __MDSPAN_TEMPLATE_REQUIRES(class _OtherExtents, - /* requires */ (_CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, _OtherExtents) - && (extents_type::rank() <= 1))) + _LIBCUDACXX_TEMPLATE(class _OtherExtents) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, _OtherExtents) + _LIBCUDACXX_AND(extents_type::rank() <= 1)) __MDSPAN_CONDITIONAL_EXPLICIT((!_CUDA_VSTD::is_convertible<_OtherExtents, extents_type>::value)) // needs two () due // to comma __MDSPAN_INLINE_FUNCTION constexpr mapping( @@ -150,8 +150,8 @@ class layout_left::mapping */ } - __MDSPAN_TEMPLATE_REQUIRES(class _OtherExtents, - /* requires */ (_CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, _OtherExtents))) + _LIBCUDACXX_TEMPLATE(class _OtherExtents) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, _OtherExtents)) __MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0)) __MDSPAN_INLINE_FUNCTION constexpr mapping( layout_stride::mapping<_OtherExtents> const& __other) // NOLINT(google-explicit-constructor) @@ -190,11 +190,10 @@ class layout_left::mapping //-------------------------------------------------------------------------------- - __MDSPAN_TEMPLATE_REQUIRES( - class... 
_Indices, - /* requires */ ((sizeof...(_Indices) == extents_type::rank()) - && __MDSPAN_FOLD_AND((_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Indices, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Indices))))) + _LIBCUDACXX_TEMPLATE(class... _Indices) + _LIBCUDACXX_REQUIRES((sizeof...(_Indices) == extents_type::rank()) _LIBCUDACXX_AND __MDSPAN_FOLD_AND( + (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Indices, index_type) + && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Indices)))) _CCCL_HOST_DEVICE constexpr index_type operator()(_Indices... __idxs) const noexcept { // Immediately cast incoming indices to `index_type` @@ -227,8 +226,8 @@ class layout_left::mapping return true; } - __MDSPAN_TEMPLATE_REQUIRES(class _Ext = _Extents, - /* requires */ (_Ext::rank() > 0)) + _LIBCUDACXX_TEMPLATE(class _Ext = _Extents) + _LIBCUDACXX_REQUIRES((_Ext::rank() > 0)) __MDSPAN_INLINE_FUNCTION constexpr index_type stride(rank_type __i) const noexcept { diff --git a/libcudacxx/include/cuda/std/__mdspan/layout_right.h b/libcudacxx/include/cuda/std/__mdspan/layout_right.h index efe215f114..4dfd4a1e38 100644 --- a/libcudacxx/include/cuda/std/__mdspan/layout_right.h +++ b/libcudacxx/include/cuda/std/__mdspan/layout_right.h @@ -126,8 +126,8 @@ class layout_right::mapping : __extents(__exts) {} - __MDSPAN_TEMPLATE_REQUIRES(class _OtherExtents, - /* requires */ (_CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, _OtherExtents))) + _LIBCUDACXX_TEMPLATE(class _OtherExtents) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, _OtherExtents)) __MDSPAN_CONDITIONAL_EXPLICIT((!_CUDA_VSTD::is_convertible<_OtherExtents, extents_type>::value)) // needs two () due // to comma __MDSPAN_INLINE_FUNCTION constexpr mapping( @@ -140,9 +140,9 @@ class layout_right::mapping */ } - __MDSPAN_TEMPLATE_REQUIRES(class _OtherExtents, - /* requires */ (_CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, _OtherExtents) - && (extents_type::rank() <= 1))) + _LIBCUDACXX_TEMPLATE(class _OtherExtents) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, _OtherExtents) + _LIBCUDACXX_AND(extents_type::rank() <= 1)) __MDSPAN_CONDITIONAL_EXPLICIT((!_CUDA_VSTD::is_convertible<_OtherExtents, extents_type>::value)) // needs two () due // to comma __MDSPAN_INLINE_FUNCTION constexpr mapping( @@ -155,8 +155,8 @@ class layout_right::mapping */ } - __MDSPAN_TEMPLATE_REQUIRES(class _OtherExtents, - /* requires */ (_CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, _OtherExtents))) + _LIBCUDACXX_TEMPLATE(class _OtherExtents) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, _OtherExtents)) __MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0)) __MDSPAN_INLINE_FUNCTION constexpr mapping( layout_stride::mapping<_OtherExtents> const& __other) // NOLINT(google-explicit-constructor) @@ -195,11 +195,10 @@ class layout_right::mapping //-------------------------------------------------------------------------------- - __MDSPAN_TEMPLATE_REQUIRES( - class... _Indices, - /* requires */ ((sizeof...(_Indices) == extents_type::rank()) - && __MDSPAN_FOLD_AND((_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Indices, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Indices))))) + _LIBCUDACXX_TEMPLATE(class... 
_Indices) + _LIBCUDACXX_REQUIRES((sizeof...(_Indices) == extents_type::rank()) _LIBCUDACXX_AND __MDSPAN_FOLD_AND( + (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Indices, index_type) + && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Indices)))) _CCCL_HOST_DEVICE constexpr index_type operator()(_Indices... __idxs) const noexcept { return __compute_offset(__rank_count<0, extents_type::rank()>(), static_cast(__idxs)...); @@ -230,8 +229,8 @@ class layout_right::mapping return true; } - __MDSPAN_TEMPLATE_REQUIRES(class _Ext = _Extents, - /* requires */ (_Ext::rank() > 0)) + _LIBCUDACXX_TEMPLATE(class _Ext = _Extents) + _LIBCUDACXX_REQUIRES((_Ext::rank() > 0)) __MDSPAN_INLINE_FUNCTION constexpr index_type stride(rank_type __i) const noexcept { diff --git a/libcudacxx/include/cuda/std/__mdspan/layout_stride.h b/libcudacxx/include/cuda/std/__mdspan/layout_stride.h index 1818adff8a..d0a1ecad4b 100644 --- a/libcudacxx/include/cuda/std/__mdspan/layout_stride.h +++ b/libcudacxx/include/cuda/std/__mdspan/layout_stride.h @@ -93,8 +93,8 @@ struct layout_right namespace __detail { template -constexpr bool __is_mapping_of = - _CUDA_VSTD::is_same, _Mapping>::value; +_LIBCUDACXX_INLINE_VAR constexpr bool __is_mapping_of = + is_same, _Mapping>::value; # if __MDSPAN_USE_CONCEPTS && __MDSPAN_HAS_CXX_20 template @@ -298,17 +298,13 @@ struct layout_stride __MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping() noexcept = default; __MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping(mapping const&) noexcept = default; - __MDSPAN_TEMPLATE_REQUIRES( + // nvcc cannot deduce this constructor when using _LIBCUDACXX_REQUIRES + template < class _IntegralTypes, - /* requires */ ( - // MSVC 19.32 does not like using index_type here, requires the typename _Extents::index_type - // error C2641: cannot deduce template arguments for '_CUDA_VSTD::layout_stride::mapping' - _CCCL_TRAIT(_CUDA_VSTD::is_convertible, const remove_const_t<_IntegralTypes>&, typename _Extents::index_type) - && _CCCL_TRAIT( - _CUDA_VSTD::is_nothrow_constructible, typename _Extents::index_type, const remove_const_t<_IntegralTypes>&))) - __MDSPAN_INLINE_FUNCTION - constexpr mapping(extents_type const& __e, - _CUDA_VSTD::array<_IntegralTypes, extents_type::rank()> const& __s) noexcept + enable_if_t<_CCCL_TRAIT(is_convertible, const remove_const_t<_IntegralTypes>&, index_type), int> = 0, + enable_if_t<_CCCL_TRAIT(is_nothrow_constructible, index_type, const remove_const_t<_IntegralTypes>&), int> = 0> + __MDSPAN_INLINE_FUNCTION constexpr mapping( + extents_type const& __e, _CUDA_VSTD::array<_IntegralTypes, extents_type::rank()> const& __s) noexcept # ifndef _CCCL_HAS_NO_ATTRIBUTE_NO_UNIQUE_ADDRESS : __members{ # else @@ -331,17 +327,13 @@ struct layout_stride */ } - __MDSPAN_TEMPLATE_REQUIRES( + // nvcc cannot deduce this constructor when using _LIBCUDACXX_REQUIRES + template < class _IntegralTypes, - /* requires */ ( - // MSVC 19.32 does not like using index_type here, requires the typename _Extents::index_type - // error C2641: cannot deduce template arguments for '_CUDA_VSTD::layout_stride::mapping' - _CCCL_TRAIT(_CUDA_VSTD::is_convertible, const remove_const_t<_IntegralTypes>&, typename _Extents::index_type) - && _CCCL_TRAIT( - _CUDA_VSTD::is_nothrow_constructible, typename _Extents::index_type, const remove_const_t<_IntegralTypes>&))) - __MDSPAN_INLINE_FUNCTION - constexpr mapping(extents_type const& __e, - _CUDA_VSTD::span<_IntegralTypes, extents_type::rank()> const& __s) noexcept + enable_if_t<_CCCL_TRAIT(is_convertible, const 
remove_const_t<_IntegralTypes>&, index_type), int> = 0, + enable_if_t<_CCCL_TRAIT(is_nothrow_constructible, index_type, const remove_const_t<_IntegralTypes>&), int> = 0> + __MDSPAN_INLINE_FUNCTION constexpr mapping( + extents_type const& __e, _CUDA_VSTD::span<_IntegralTypes, extents_type::rank()> const& __s) noexcept # ifndef _CCCL_HAS_NO_ATTRIBUTE_NO_UNIQUE_ADDRESS : __members{ # else @@ -365,25 +357,25 @@ struct layout_stride } # if !(__MDSPAN_USE_CONCEPTS && __MDSPAN_HAS_CXX_20) - __MDSPAN_TEMPLATE_REQUIRES( - class _StridedLayoutMapping, - /* requires */ ( - _CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, typename _StridedLayoutMapping::extents_type) - && __detail::__is_mapping_of - && _StridedLayoutMapping::is_always_unique() && _StridedLayoutMapping::is_always_strided())) + _LIBCUDACXX_TEMPLATE(class _StridedLayoutMapping) + _LIBCUDACXX_REQUIRES( + _CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, typename _StridedLayoutMapping::extents_type) + _LIBCUDACXX_AND __detail::__is_mapping_of + _LIBCUDACXX_AND(_StridedLayoutMapping::is_always_unique()) + _LIBCUDACXX_AND(_StridedLayoutMapping::is_always_strided())) # else template requires(__detail::__layout_mapping_alike<_StridedLayoutMapping> - && _CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, typename _StridedLayoutMapping::extents_type) + && _CCCL_TRAIT(is_constructible, extents_type, typename _StridedLayoutMapping::extents_type) && _StridedLayoutMapping::is_always_unique() && _StridedLayoutMapping::is_always_strided()) # endif __MDSPAN_CONDITIONAL_EXPLICIT( - (!_CUDA_VSTD::is_convertible::value) + (!is_convertible::value) && (__detail::__is_mapping_of || __detail::__is_mapping_of || __detail::__is_mapping_of) ) // needs two () due to comma - __MDSPAN_INLINE_FUNCTION constexpr mapping( - _StridedLayoutMapping const& __other) noexcept // NOLINT(google-explicit-constructor) + __MDSPAN_INLINE_FUNCTION + constexpr mapping(_StridedLayoutMapping const& __other) noexcept // NOLINT(google-explicit-constructor) # ifndef _CCCL_HAS_NO_ATTRIBUTE_NO_UNIQUE_ADDRESS : __members{ # else @@ -440,12 +432,11 @@ struct layout_stride return __span_size; } - __MDSPAN_TEMPLATE_REQUIRES( - class... _Indices, - /* requires */ ( - sizeof...(_Indices) == _Extents::rank() - && __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Indices, index_type) /*&& ...*/) - && __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Indices) /*&& ...*/))) + _LIBCUDACXX_TEMPLATE(class... _Indices) + _LIBCUDACXX_REQUIRES( + (sizeof...(_Indices) == _Extents::rank()) + _LIBCUDACXX_AND __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_convertible, _Indices, index_type) /*&& ...*/) + _LIBCUDACXX_AND __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_nothrow_constructible, index_type, _Indices) /*&& ...*/)) __MDSPAN_FORCE_INLINE_FUNCTION constexpr index_type operator()(_Indices... 
__idxs) const noexcept { @@ -480,8 +471,8 @@ struct layout_stride return true; } - __MDSPAN_TEMPLATE_REQUIRES(class _Ext = _Extents, - /* requires */ (_Ext::rank() > 0)) + _LIBCUDACXX_TEMPLATE(class _Ext = _Extents) + _LIBCUDACXX_REQUIRES((_Ext::rank() > 0)) __MDSPAN_INLINE_FUNCTION constexpr index_type stride(rank_type __r) const noexcept { @@ -489,11 +480,11 @@ struct layout_stride } # if !(__MDSPAN_USE_CONCEPTS && __MDSPAN_HAS_CXX_20) - __MDSPAN_TEMPLATE_REQUIRES( - class _StridedLayoutMapping, - /* requires */ (__detail::__is_mapping_of - && (extents_type::rank() == _StridedLayoutMapping::extents_type::rank()) - && _StridedLayoutMapping::is_always_strided())) + _LIBCUDACXX_TEMPLATE(class _StridedLayoutMapping) + _LIBCUDACXX_REQUIRES( + __detail::__is_mapping_of _LIBCUDACXX_AND( + extents_type::rank() == _StridedLayoutMapping::extents_type::rank()) + _LIBCUDACXX_AND(_StridedLayoutMapping::is_always_strided())) # else template requires( @@ -515,30 +506,30 @@ struct layout_stride } // This one is not technically part of the proposal. Just here to make implementation a bit more optimal hopefully - __MDSPAN_TEMPLATE_REQUIRES(class _OtherExtents, - /* requires */ ((extents_type::rank() == _OtherExtents::rank()))) - __MDSPAN_INLINE_FUNCTION - friend constexpr bool operator==(mapping const& __lhs, mapping<_OtherExtents> const& __rhs) noexcept + _LIBCUDACXX_TEMPLATE(class _OtherExtents) + _LIBCUDACXX_REQUIRES((extents_type::rank() == _OtherExtents::rank())) + __MDSPAN_INLINE_FUNCTION friend constexpr bool + operator==(mapping const& __lhs, mapping<_OtherExtents> const& __rhs) noexcept { return __impl::_eq_impl(__lhs, __rhs); } # if !__MDSPAN_HAS_CXX_20 - __MDSPAN_TEMPLATE_REQUIRES( - class _StridedLayoutMapping, - /* requires */ (__detail::__is_mapping_of - && (extents_type::rank() == _StridedLayoutMapping::extents_type::rank()) - && _StridedLayoutMapping::is_always_strided())) - __MDSPAN_INLINE_FUNCTION - friend constexpr bool operator!=(const mapping& __x, const _StridedLayoutMapping& __y) noexcept + _LIBCUDACXX_TEMPLATE(class _StridedLayoutMapping) + _LIBCUDACXX_REQUIRES( + __detail::__is_mapping_of _LIBCUDACXX_AND( + extents_type::rank() == _StridedLayoutMapping::extents_type::rank()) + _LIBCUDACXX_AND(_StridedLayoutMapping::is_always_strided())) + __MDSPAN_INLINE_FUNCTION friend constexpr bool + operator!=(const mapping& __x, const _StridedLayoutMapping& __y) noexcept { return not(__x == __y); } - __MDSPAN_TEMPLATE_REQUIRES(class _OtherExtents, - /* requires */ ((extents_type::rank() == _OtherExtents::rank()))) - __MDSPAN_INLINE_FUNCTION - friend constexpr bool operator!=(mapping const& __lhs, mapping<_OtherExtents> const& __rhs) noexcept + _LIBCUDACXX_TEMPLATE(class _OtherExtents) + _LIBCUDACXX_REQUIRES((extents_type::rank() == _OtherExtents::rank())) + __MDSPAN_INLINE_FUNCTION friend constexpr bool + operator!=(mapping const& __lhs, mapping<_OtherExtents> const& __rhs) noexcept { return __impl::_not_eq_impl(__lhs, __rhs); } diff --git a/libcudacxx/include/cuda/std/__mdspan/macros.h b/libcudacxx/include/cuda/std/__mdspan/macros.h index 0eba30a718..0aa54e0330 100644 --- a/libcudacxx/include/cuda/std/__mdspan/macros.h +++ b/libcudacxx/include/cuda/std/__mdspan/macros.h @@ -54,6 +54,7 @@ # pragma system_header #endif // no system header +#include #include #include #include @@ -249,167 +250,6 @@ // end Preprocessor helpers }}}1 //============================================================================== -//============================================================================== -// 
{{{1 - -// These compatibility macros don't help with partial ordering, but they should do the trick -// for what we need to do with concepts in mdspan -# ifdef __MDSPAN_USE_CONCEPTS -# define __MDSPAN_CLOSE_ANGLE_REQUIRES(REQ) \ - > \ - requires REQ -# define __MDSPAN_FUNCTION_REQUIRES(PAREN_PREQUALS, FNAME, PAREN_PARAMS, QUALS, REQ) \ - __MDSPAN_PP_REMOVE_PARENS(PAREN_PREQUALS) \ - FNAME PAREN_PARAMS QUALS \ - requires REQ /**/ -# else -# define __MDSPAN_CLOSE_ANGLE_REQUIRES(REQ) , typename _CUDA_VSTD::enable_if<(REQ), int>::type = 0 > -# define __MDSPAN_FUNCTION_REQUIRES(PAREN_PREQUALS, FNAME, PAREN_PARAMS, QUALS, REQ) \ - __MDSPAN_TEMPLATE_REQUIRES( \ - class __function_requires_ignored = void, (_CUDA_VSTD::is_void<__function_requires_ignored>::value && REQ)) \ - __MDSPAN_PP_REMOVE_PARENS(PAREN_PREQUALS) FNAME PAREN_PARAMS QUALS /**/ -# endif - -# if defined(__MDSPAN_COMPILER_MSVC) -# define __MDSPAN_TEMPLATE_REQUIRES(...) \ - __MDSPAN_PP_CAT(__MDSPAN_PP_CAT(__MDSPAN_TEMPLATE_REQUIRES_, __MDSPAN_PP_COUNT(__VA_ARGS__))(__VA_ARGS__), ) \ - /**/ -# else -# define __MDSPAN_TEMPLATE_REQUIRES(...) \ - __MDSPAN_PP_EVAL(__MDSPAN_PP_CAT(__MDSPAN_TEMPLATE_REQUIRES_, __MDSPAN_PP_COUNT(__VA_ARGS__)), __VA_ARGS__) \ - /**/ -# endif - -# define __MDSPAN_TEMPLATE_REQUIRES_2(TP1, REQ) template end Concept emulation }}}1 -//============================================================================== - //============================================================================== // {{{1 diff --git a/libcudacxx/include/cuda/std/__mdspan/mdspan.h b/libcudacxx/include/cuda/std/__mdspan/mdspan.h index 1103663025..27e6a57a94 100644 --- a/libcudacxx/include/cuda/std/__mdspan/mdspan.h +++ b/libcudacxx/include/cuda/std/__mdspan/mdspan.h @@ -168,22 +168,21 @@ class mdspan __MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mdspan() requires( // Directly using rank_dynamic()>0 here doesn't work for nvcc - (extents_type::rank_dynamic() > 0) && _CCCL_TRAIT(_CUDA_VSTD::is_default_constructible, data_handle_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_default_constructible, mapping_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_default_constructible, accessor_type)) + (extents_type::rank_dynamic() > 0) && _CCCL_TRAIT(is_default_constructible, data_handle_type) + && _CCCL_TRAIT(is_default_constructible, mapping_type) + && _CCCL_TRAIT(is_default_constructible, accessor_type)) = default; # endif __MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mdspan(const mdspan&) = default; __MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mdspan(mdspan&&) = default; - __MDSPAN_TEMPLATE_REQUIRES( - class... _SizeTypes, - /* requires */ ( - __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SizeTypes, index_type) /* && ... */) - && __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _SizeTypes) /* && ... */) - && ((sizeof...(_SizeTypes) == rank()) || (sizeof...(_SizeTypes) == rank_dynamic())) - && _CCCL_TRAIT(_CUDA_VSTD::is_constructible, mapping_type, extents_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_default_constructible, accessor_type))) + _LIBCUDACXX_TEMPLATE(class... _SizeTypes) + _LIBCUDACXX_REQUIRES( + __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_convertible, _SizeTypes, index_type) /* && ... */) + _LIBCUDACXX_AND __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_nothrow_constructible, index_type, _SizeTypes) /* && ... 
*/) + _LIBCUDACXX_AND((sizeof...(_SizeTypes) == rank()) || (sizeof...(_SizeTypes) == rank_dynamic())) + _LIBCUDACXX_AND _CCCL_TRAIT(is_constructible, mapping_type, extents_type) + _LIBCUDACXX_AND _CCCL_TRAIT(is_default_constructible, accessor_type)) __MDSPAN_INLINE_FUNCTION explicit constexpr mdspan(data_handle_type __p, _SizeTypes... __dynamic_extents) // TODO @proposal-bug shouldn't I be allowed to do `move(__p)` here? @@ -193,15 +192,11 @@ class mdspan accessor_type())) {} - __MDSPAN_TEMPLATE_REQUIRES( - class _SizeType, - size_t _Np, - /* requires */ - (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SizeType, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _SizeType) - && ((_Np == rank()) || (_Np == rank_dynamic())) - && _CCCL_TRAIT(_CUDA_VSTD::is_constructible, mapping_type, extents_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_default_constructible, accessor_type))) + _LIBCUDACXX_TEMPLATE(class _SizeType, size_t _Np) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(is_convertible, _SizeType, index_type) _LIBCUDACXX_AND _CCCL_TRAIT( + is_nothrow_constructible, index_type, _SizeType) _LIBCUDACXX_AND((_Np == rank()) || (_Np == rank_dynamic())) + _LIBCUDACXX_AND _CCCL_TRAIT(is_constructible, mapping_type, extents_type) + _LIBCUDACXX_AND _CCCL_TRAIT(is_default_constructible, accessor_type)) __MDSPAN_CONDITIONAL_EXPLICIT(_Np != rank_dynamic()) __MDSPAN_INLINE_FUNCTION constexpr mdspan(data_handle_type __p, const _CUDA_VSTD::array<_SizeType, _Np>& __dynamic_extents) @@ -209,15 +204,11 @@ class mdspan __map_acc_pair_t(mapping_type(extents_type(__dynamic_extents)), accessor_type())) {} - __MDSPAN_TEMPLATE_REQUIRES( - class _SizeType, - size_t _Np, - /* requires */ - (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SizeType, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _SizeType) - && ((_Np == rank()) || (_Np == rank_dynamic())) - && _CCCL_TRAIT(_CUDA_VSTD::is_constructible, mapping_type, extents_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_default_constructible, accessor_type))) + _LIBCUDACXX_TEMPLATE(class _SizeType, size_t _Np) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(is_convertible, _SizeType, index_type) _LIBCUDACXX_AND _CCCL_TRAIT( + is_nothrow_constructible, index_type, _SizeType) _LIBCUDACXX_AND((_Np == rank()) || (_Np == rank_dynamic())) + _LIBCUDACXX_AND _CCCL_TRAIT(is_constructible, mapping_type, extents_type) + _LIBCUDACXX_AND _CCCL_TRAIT(is_default_constructible, accessor_type)) __MDSPAN_CONDITIONAL_EXPLICIT(_Np != rank_dynamic()) __MDSPAN_INLINE_FUNCTION constexpr mdspan(data_handle_type __p, _CUDA_VSTD::span<_SizeType, _Np> __dynamic_extents) @@ -225,23 +216,16 @@ class mdspan __map_acc_pair_t(mapping_type(extents_type(_CUDA_VSTD::as_const(__dynamic_extents))), accessor_type())) {} - __MDSPAN_FUNCTION_REQUIRES( - (__MDSPAN_INLINE_FUNCTION constexpr), - mdspan, - (data_handle_type __p, const extents_type& __exts), - , - /* requires */ - (_CCCL_TRAIT(_CUDA_VSTD::is_default_constructible, accessor_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_constructible, mapping_type, extents_type))) + _LIBCUDACXX_TEMPLATE(bool _Is_default_constructible = _CCCL_TRAIT(is_default_constructible, accessor_type)) + _LIBCUDACXX_REQUIRES( + _Is_default_constructible _LIBCUDACXX_AND _CCCL_TRAIT(is_constructible, mapping_type, extents_type)) + __MDSPAN_INLINE_FUNCTION constexpr mdspan(data_handle_type __p, const extents_type& __exts) : __members(_CUDA_VSTD::move(__p), __map_acc_pair_t(mapping_type(__exts), accessor_type())) {} - __MDSPAN_FUNCTION_REQUIRES( - (__MDSPAN_INLINE_FUNCTION constexpr), 
- mdspan, - (data_handle_type __p, const mapping_type& __m), - , - /* requires */ (_CCCL_TRAIT(_CUDA_VSTD::is_default_constructible, accessor_type))) + _LIBCUDACXX_TEMPLATE(bool _Is_default_constructible = _CCCL_TRAIT(is_default_constructible, accessor_type)) + _LIBCUDACXX_REQUIRES(_Is_default_constructible) + __MDSPAN_INLINE_FUNCTION constexpr mdspan(data_handle_type __p, const mapping_type& __m) : __members(_CUDA_VSTD::move(__p), __map_acc_pair_t(__m, accessor_type())) {} @@ -250,23 +234,17 @@ class mdspan : __members(_CUDA_VSTD::move(__p), __map_acc_pair_t(__m, __a)) {} - __MDSPAN_TEMPLATE_REQUIRES( - class _OtherElementType, - class _OtherExtents, - class _OtherLayoutPolicy, - class _OtherAccessor, - /* requires */ - (_CCCL_TRAIT( - _CUDA_VSTD::is_constructible, mapping_type, typename _OtherLayoutPolicy::template mapping<_OtherExtents>) - && _CCCL_TRAIT(_CUDA_VSTD::is_constructible, accessor_type, _OtherAccessor))) + _LIBCUDACXX_TEMPLATE(class _OtherElementType, class _OtherExtents, class _OtherLayoutPolicy, class _OtherAccessor) + _LIBCUDACXX_REQUIRES( + _CCCL_TRAIT(is_constructible, mapping_type, typename _OtherLayoutPolicy::template mapping<_OtherExtents>) + _LIBCUDACXX_AND _CCCL_TRAIT(is_constructible, accessor_type, _OtherAccessor)) __MDSPAN_INLINE_FUNCTION constexpr mdspan(const mdspan<_OtherElementType, _OtherExtents, _OtherLayoutPolicy, _OtherAccessor>& __other) : __members(__other.__ptr_ref(), __map_acc_pair_t(__other.__mapping_ref(), __other.__accessor_ref())) { - static_assert( - _CCCL_TRAIT(_CUDA_VSTD::is_constructible, data_handle_type, typename _OtherAccessor::data_handle_type), - "Incompatible data_handle_type for mdspan construction"); - static_assert(_CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, _OtherExtents), + static_assert(_CCCL_TRAIT(is_constructible, data_handle_type, typename _OtherAccessor::data_handle_type), + "Incompatible data_handle_type for mdspan construction"); + static_assert(_CCCL_TRAIT(is_constructible, extents_type, _OtherExtents), "Incompatible extents for mdspan construction"); /* * TODO: Check precondition @@ -287,12 +265,11 @@ class mdspan // [mdspan.basic.mapping], mdspan mapping domain multidimensional index to access codomain element # if __MDSPAN_USE_BRACKET_OPERATOR - __MDSPAN_TEMPLATE_REQUIRES( - class... _SizeTypes, - /* requires */ ( - __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SizeTypes, index_type) /* && ... */) - && __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _SizeTypes) /* && ... */) - && (rank() == sizeof...(_SizeTypes)))) + _LIBCUDACXX_TEMPLATE(class... _SizeTypes) + _LIBCUDACXX_REQUIRES( + __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_convertible, _SizeTypes, index_type) /* && ... */) + _LIBCUDACXX_AND __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_nothrow_constructible, index_type, _SizeTypes) /* && ... */) + _LIBCUDACXX_AND(rank() == sizeof...(_SizeTypes))) __MDSPAN_FORCE_INLINE_FUNCTION constexpr reference operator[](_SizeTypes... 
__indices) const { @@ -300,20 +277,18 @@ class mdspan } # endif - __MDSPAN_TEMPLATE_REQUIRES( - class _SizeType, - /* requires */ (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SizeType, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _SizeType))) + _LIBCUDACXX_TEMPLATE(class _SizeType) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(is_convertible, _SizeType, index_type) + _LIBCUDACXX_AND _CCCL_TRAIT(is_nothrow_constructible, index_type, _SizeType)) __MDSPAN_FORCE_INLINE_FUNCTION constexpr reference operator[](const _CUDA_VSTD::array<_SizeType, rank()>& __indices) const { return __impl::template __callop(*this, __indices); } - __MDSPAN_TEMPLATE_REQUIRES( - class _SizeType, - /* requires */ (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SizeType, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _SizeType))) + _LIBCUDACXX_TEMPLATE(class _SizeType) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(is_convertible, _SizeType, index_type) + _LIBCUDACXX_AND _CCCL_TRAIT(is_nothrow_constructible, index_type, _SizeType)) __MDSPAN_FORCE_INLINE_FUNCTION constexpr reference operator[](_CUDA_VSTD::span<_SizeType, rank()> __indices) const { @@ -321,10 +296,9 @@ class mdspan } # if !__MDSPAN_USE_BRACKET_OPERATOR - __MDSPAN_TEMPLATE_REQUIRES(class _Index, - /* requires */ (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Index, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Index) - && extents_type::rank() == 1)) + _LIBCUDACXX_TEMPLATE(class _Index) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(is_convertible, _Index, index_type) _LIBCUDACXX_AND _CCCL_TRAIT( + is_nothrow_constructible, index_type, _Index) _LIBCUDACXX_AND(extents_type::rank() == 1)) __MDSPAN_FORCE_INLINE_FUNCTION constexpr reference operator[](_Index __idx) const { @@ -333,32 +307,29 @@ class mdspan # endif # if __MDSPAN_USE_PAREN_OPERATOR - __MDSPAN_TEMPLATE_REQUIRES( - class... _SizeTypes, - /* requires */ ( - __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SizeTypes, index_type) /* && ... */) - && __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _SizeTypes) /* && ... */) - && extents_type::rank() == sizeof...(_SizeTypes))) + _LIBCUDACXX_TEMPLATE(class... _SizeTypes) + _LIBCUDACXX_REQUIRES( + __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_convertible, _SizeTypes, index_type) /* && ... */) + _LIBCUDACXX_AND __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_nothrow_constructible, index_type, _SizeTypes) /* && ... */) + _LIBCUDACXX_AND(extents_type::rank() == sizeof...(_SizeTypes))) __MDSPAN_FORCE_INLINE_FUNCTION constexpr reference operator()(_SizeTypes... 
__indices) const { return __accessor_ref().access(__ptr_ref(), __mapping_ref()(__indices...)); } - __MDSPAN_TEMPLATE_REQUIRES( - class _SizeType, - /* requires */ (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SizeType, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _SizeType))) + _LIBCUDACXX_TEMPLATE(class _SizeType) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(is_convertible, _SizeType, index_type) + _LIBCUDACXX_AND _CCCL_TRAIT(is_nothrow_constructible, index_type, _SizeType)) __MDSPAN_FORCE_INLINE_FUNCTION constexpr reference operator()(const _CUDA_VSTD::array<_SizeType, rank()>& __indices) const { return __impl::template __callop(*this, __indices); } - __MDSPAN_TEMPLATE_REQUIRES( - class _SizeType, - /* requires */ (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SizeType, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _SizeType))) + _LIBCUDACXX_TEMPLATE(class _SizeType) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(is_convertible, _SizeType, index_type) + _LIBCUDACXX_AND _CCCL_TRAIT(is_nothrow_constructible, index_type, _SizeType)) __MDSPAN_FORCE_INLINE_FUNCTION constexpr reference operator()(_CUDA_VSTD::span<_SizeType, rank()> __indices) const { @@ -470,17 +441,18 @@ class mdspan }; # if defined(__MDSPAN_USE_CLASS_TEMPLATE_ARGUMENT_DEDUCTION) -__MDSPAN_TEMPLATE_REQUIRES( - class _ElementType, - class... _SizeTypes, - /* requires */ __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_integral, _SizeTypes) /* && ... */) && (sizeof...(_SizeTypes) > 0)) +_LIBCUDACXX_TEMPLATE(class _ElementType, class... _SizeTypes) +_LIBCUDACXX_REQUIRES(__MDSPAN_FOLD_AND(_CCCL_TRAIT(is_integral, _SizeTypes) /* && ... */) + _LIBCUDACXX_AND(sizeof...(_SizeTypes) > 0)) _CCCL_HOST_DEVICE explicit mdspan(_ElementType*, _SizeTypes...) -> mdspan<_ElementType, dextents>; -__MDSPAN_TEMPLATE_REQUIRES(class _Pointer, (_CCCL_TRAIT(is_pointer, _CUDA_VSTD::remove_reference_t<_Pointer>))) +_LIBCUDACXX_TEMPLATE(class _Pointer) +_LIBCUDACXX_REQUIRES(_CCCL_TRAIT(is_pointer, _CUDA_VSTD::remove_reference_t<_Pointer>)) _CCCL_HOST_DEVICE mdspan(_Pointer&&) -> mdspan<_CUDA_VSTD::remove_pointer_t<_CUDA_VSTD::remove_reference_t<_Pointer>>, extents>; -__MDSPAN_TEMPLATE_REQUIRES(class _CArray, (_CCCL_TRAIT(is_array, _CArray) && (rank_v<_CArray> == 1))) +_LIBCUDACXX_TEMPLATE(class _CArray) +_LIBCUDACXX_REQUIRES(_CCCL_TRAIT(is_array, _CArray) _LIBCUDACXX_AND(rank_v<_CArray> == 1)) _CCCL_HOST_DEVICE mdspan(_CArray&) -> mdspan<_CUDA_VSTD::remove_all_extents_t<_CArray>, extents>>; diff --git a/libcudacxx/include/cuda/std/__mdspan/static_array.h b/libcudacxx/include/cuda/std/__mdspan/static_array.h index de511fe2e6..886f782065 100644 --- a/libcudacxx/include/cuda/std/__mdspan/static_array.h +++ b/libcudacxx/include/cuda/std/__mdspan/static_array.h @@ -162,21 +162,17 @@ class __partially_static_array_impl< {} __MDSPAN_INLINE_FUNCTION constexpr explicit __partially_static_array_impl( - _CUDA_VSTD::array<_Tp, sizeof...(_Idxs)> const& __vals) noexcept + array<_Tp, sizeof...(_Idxs)> const& __vals) noexcept : __partially_static_array_impl(__construct_psa_from_all_exts_values_tag, _CUDA_VSTD::get<_Idxs>(__vals)...) {} - // clang-format off - __MDSPAN_FUNCTION_REQUIRES( - (__MDSPAN_INLINE_FUNCTION constexpr explicit), - __partially_static_array_impl, - (_CUDA_VSTD::array<_Tp, __size_dynamic> const &__vals), noexcept, - /* requires */ - (sizeof...(_Idxs) != __size_dynamic) - ): __partially_static_array_impl( - __construct_psa_from_dynamic_exts_values_tag, - _CUDA_VSTD::get<_IdxsDynamicIdxs>(__vals)...) 
{} - // clang-format on + _LIBCUDACXX_TEMPLATE(bool _SizeMatches = (sizeof...(_Idxs) != __size_dynamic)) + _LIBCUDACXX_REQUIRES(_SizeMatches) + __MDSPAN_INLINE_FUNCTION constexpr explicit __partially_static_array_impl( + array<_Tp, __size_dynamic> const& __vals) noexcept + __partially_static_array_impl(__construct_psa_from_dynamic_exts_values_tag, + _CUDA_VSTD::get<_IdxsDynamicIdxs>(__vals)...) + {} template : true_type //============================================================================== -__MDSPAN_TEMPLATE_REQUIRES( - class _ET, - class _EXT, - class _LP, - class _AP, - class... _SliceSpecs, - /* requires */ - ((_CCCL_TRAIT(_CUDA_VSTD::is_same, _LP, layout_left) || _CCCL_TRAIT(_CUDA_VSTD::is_same, _LP, layout_right) - || __detail::_is_layout_stride<_LP>::value) - && __MDSPAN_FOLD_AND((_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SliceSpecs, size_t) - || _CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SliceSpecs, tuple) - || _CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SliceSpecs, full_extent_t)) /* && ... */) - && sizeof...(_SliceSpecs) == _EXT::rank())) +_LIBCUDACXX_TEMPLATE(class _ET, class _EXT, class _LP, class _AP, class... _SliceSpecs) +_LIBCUDACXX_REQUIRES( + (_CCCL_TRAIT(_CUDA_VSTD::is_same, _LP, layout_left) || _CCCL_TRAIT(_CUDA_VSTD::is_same, _LP, layout_right) + || __detail::_is_layout_stride<_LP>::value) + _LIBCUDACXX_AND __MDSPAN_FOLD_AND( + (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SliceSpecs, size_t) + || _CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SliceSpecs, tuple) + || _CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SliceSpecs, full_extent_t)) /* && ... */) + _LIBCUDACXX_AND(sizeof...(_SliceSpecs) == _EXT::rank())) __MDSPAN_INLINE_FUNCTION __MDSPAN_DEDUCE_RETURN_TYPE_SINGLE_LINE( (constexpr submdspan(mdspan<_ET, _EXT, _LP, _AP> const& __src, _SliceSpecs... 
__slices) noexcept), diff --git a/libcudacxx/test/libcudacxx/std/containers/views/mdspan/foo_customizations.hpp b/libcudacxx/test/libcudacxx/std/containers/views/mdspan/foo_customizations.hpp index 32e7c1cd84..fd84ddcc51 100644 --- a/libcudacxx/test/libcudacxx/std/containers/views/mdspan/foo_customizations.hpp +++ b/libcudacxx/test/libcudacxx/std/containers/views/mdspan/foo_customizations.hpp @@ -93,8 +93,8 @@ class layout_foo::mapping : __extents(__exts) {} - __MDSPAN_TEMPLATE_REQUIRES(class OtherExtents, - /* requires */ (_CCCL_TRAIT(cuda::std::is_constructible, extents_type, OtherExtents))) + _LIBCUDACXX_TEMPLATE(class OtherExtents) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(cuda::std::is_constructible, extents_type, OtherExtents)) __MDSPAN_CONDITIONAL_EXPLICIT((!cuda::std::is_convertible::value)) // needs two () due to // comma __MDSPAN_INLINE_FUNCTION constexpr mapping( @@ -107,18 +107,18 @@ class layout_foo::mapping */ } - __MDSPAN_TEMPLATE_REQUIRES(class OtherExtents, - /* requires */ (_CCCL_TRAIT(cuda::std::is_constructible, extents_type, OtherExtents))) - __MDSPAN_CONDITIONAL_EXPLICIT((!cuda::std::is_convertible::value)) // needs two () due to - // comma + _LIBCUDACXX_TEMPLATE(class OtherExtents) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(cuda::std::is_constructible, extents_type, OtherExtents)) + __MDSPAN_CONDITIONAL_EXPLICIT((!cuda::std::is_convertible::value)) // needs two () due + // to comma __MDSPAN_INLINE_FUNCTION constexpr mapping( cuda::std::layout_right::mapping const& other) noexcept // NOLINT(google-explicit-constructor) : __extents(other.extents()) {} - __MDSPAN_TEMPLATE_REQUIRES(class OtherExtents, - /* requires */ (_CCCL_TRAIT(cuda::std::is_constructible, extents_type, OtherExtents) - && (extents_type::rank() <= 1))) + _LIBCUDACXX_TEMPLATE(class OtherExtents) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(cuda::std::is_constructible, extents_type, OtherExtents) + && (extents_type::rank() <= 1)) __MDSPAN_CONDITIONAL_EXPLICIT((!cuda::std::is_convertible::value)) // needs two () due to // comma __MDSPAN_INLINE_FUNCTION constexpr mapping( @@ -126,8 +126,8 @@ class layout_foo::mapping : __extents(other.extents()) {} - __MDSPAN_TEMPLATE_REQUIRES(class OtherExtents, - /* requires */ (_CCCL_TRAIT(cuda::std::is_constructible, extents_type, OtherExtents))) + _LIBCUDACXX_TEMPLATE(class OtherExtents) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(cuda::std::is_constructible, extents_type, OtherExtents)) __MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0)) __MDSPAN_INLINE_FUNCTION constexpr mapping( cuda::std::layout_stride::mapping const& other) // NOLINT(google-explicit-constructor) From 74739348a20a0efd4189d5545643192cdc1830d6 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Fri, 9 Aug 2024 08:36:41 -0700 Subject: [PATCH 14/33] Un-doc functions taking debug_synchronous (#2209) * undoc functions taking debug_synchronous --- cub/cub/device/device_adjacent_difference.cuh | 8 +++++ cub/cub/device/device_histogram.cuh | 16 ++++++++++ cub/cub/device/device_merge_sort.cuh | 12 +++++++ cub/cub/device/device_partition.cuh | 6 ++++ cub/cub/device/device_reduce.cuh | 14 ++++++++ cub/cub/device/device_run_length_encode.cuh | 4 +++ cub/cub/device/device_scan.cuh | 28 ++++++++++++++++ .../device/device_segmented_radix_sort.cuh | 16 ++++++++++ cub/cub/device/device_segmented_reduce.cuh | 12 +++++++ cub/cub/device/device_segmented_sort.cuh | 32 +++++++++++++++++++ cub/cub/device/device_select.cuh | 12 +++++++ cub/cub/device/device_spmv.cuh | 2 ++ .../dispatch/dispatch_adjacent_difference.cuh | 8 +++-- 
.../device/dispatch/dispatch_histogram.cuh | 20 ++++++++---- cub/cub/device/dispatch/dispatch_reduce.cuh | 16 +++++++--- .../dispatch/dispatch_reduce_by_key.cuh | 6 ++-- cub/cub/device/dispatch/dispatch_rle.cuh | 6 ++-- cub/cub/device/dispatch/dispatch_scan.cuh | 8 +++-- .../device/dispatch/dispatch_scan_by_key.cuh | 8 +++-- .../dispatch/dispatch_segmented_sort.cuh | 10 ++++-- .../device/dispatch/dispatch_select_if.cuh | 6 ++-- .../device/dispatch/dispatch_spmv_orig.cuh | 14 +++++--- .../dispatch/dispatch_three_way_partition.cuh | 6 ++-- .../dispatch/dispatch_unique_by_key.cuh | 8 +++-- 24 files changed, 244 insertions(+), 34 deletions(-) diff --git a/cub/cub/device/device_adjacent_difference.cuh b/cub/cub/device/device_adjacent_difference.cuh index 750f7a974c..53d8cc2cfb 100644 --- a/cub/cub/device/device_adjacent_difference.cuh +++ b/cub/cub/device/device_adjacent_difference.cuh @@ -267,6 +267,7 @@ public: d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractLeftCopy( void* d_temp_storage, @@ -282,6 +283,7 @@ public: return SubtractLeftCopy(d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Subtracts the left element of each adjacent pair of elements residing within device-accessible memory. @@ -394,6 +396,7 @@ public: d_temp_storage, temp_storage_bytes, d_input, d_input, num_items, difference_op, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractLeft( void* d_temp_storage, @@ -408,6 +411,7 @@ public: return SubtractLeft(d_temp_storage, temp_storage_bytes, d_input, num_items, difference_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Subtracts the right element of each adjacent pair of elements residing within device-accessible memory. @@ -539,6 +543,7 @@ public: d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractRightCopy( void* d_temp_storage, @@ -554,6 +559,7 @@ public: return SubtractRightCopy(d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Subtracts the right element of each adjacent pair of elements residing within device-accessible memory. 
@@ -655,6 +661,7 @@ public: d_temp_storage, temp_storage_bytes, d_input, d_input, num_items, difference_op, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractRight( void* d_temp_storage, @@ -669,6 +676,7 @@ public: return SubtractRight(d_temp_storage, temp_storage_bytes, d_input, num_items, difference_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END diff --git a/cub/cub/device/device_histogram.cuh b/cub/cub/device/device_histogram.cuh index 989342d0a7..46f4bee557 100644 --- a/cub/cub/device/device_histogram.cuh +++ b/cub/cub/device/device_histogram.cuh @@ -206,6 +206,7 @@ struct DeviceHistogram stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven( void* d_temp_storage, @@ -232,6 +233,7 @@ struct DeviceHistogram num_samples, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes an intensity histogram from a sequence of data samples using equal-width bins. @@ -384,6 +386,7 @@ struct DeviceHistogram stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven( void* d_temp_storage, @@ -414,6 +417,7 @@ struct DeviceHistogram row_stride_bytes, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using @@ -584,6 +588,7 @@ struct DeviceHistogram stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange( void* d_temp_storage, @@ -1008,6 +1017,7 @@ struct DeviceHistogram return HistogramRange( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_samples, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. @@ -1147,6 +1157,7 @@ struct DeviceHistogram stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange( void* d_temp_storage, @@ -1175,6 +1186,7 @@ struct DeviceHistogram row_stride_bytes, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples @@ -1334,6 +1346,7 @@ struct DeviceHistogram stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, @@ -262,6 +263,7 @@ public: return SortPairs( d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS /** * @brief Sorts items using a merge sorting method. 
@@ -409,6 +411,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, @@ -581,6 +586,7 @@ public: return SortKeys( d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS private: // Internal version without NVTX range @@ -723,6 +729,7 @@ public: d_temp_storage, temp_storage_bytes, d_input_keys, d_output_keys, num_items, compare_op, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysCopy( void* d_temp_storage, @@ -739,6 +746,7 @@ public: return SortKeysCopy( d_temp_storage, temp_storage_bytes, d_input_keys, d_output_keys, num_items, compare_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS /** * @brief Sorts items using a merge sorting method. @@ -849,6 +857,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs( void* d_temp_storage, @@ -865,6 +874,7 @@ public: return StableSortPairs( d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS /** * @brief Sorts items using a merge sorting method. @@ -966,6 +976,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys( void* d_temp_storage, @@ -981,6 +992,7 @@ public: return StableSortKeys( d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS /** * @brief Sorts items using a merge sorting method. diff --git a/cub/cub/device/device_partition.cuh b/cub/cub/device/device_partition.cuh index 5c24a0ec20..08a2ae531f 100644 --- a/cub/cub/device/device_partition.cuh +++ b/cub/cub/device/device_partition.cuh @@ -206,6 +206,7 @@ struct DevicePartition stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged( void* d_temp_storage, @@ -223,6 +224,7 @@ struct DevicePartition return Flagged( d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! 
Uses the ``select_op`` functor to split the corresponding items from ``d_in`` into @@ -367,6 +369,7 @@ struct DevicePartition stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t If(void* d_temp_storage, @@ -384,6 +387,7 @@ struct DevicePartition return If( d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS private: template (num_items), reduction_op, init, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Reduce( void* d_temp_storage, @@ -222,6 +223,7 @@ struct DeviceReduce return Reduce( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, init, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide sum using the addition (``+``) operator. @@ -328,6 +330,7 @@ struct DeviceReduce stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Sum(void* d_temp_storage, @@ -342,6 +345,7 @@ struct DeviceReduce return Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide minimum using the less-than (``<``) operator. @@ -452,6 +456,7 @@ struct DeviceReduce stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Min(void* d_temp_storage, @@ -466,6 +471,7 @@ struct DeviceReduce return Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Finds the first device-wide minimum using the less-than (``<``) operator, also returning the index of that item. @@ -585,6 +591,7 @@ struct DeviceReduce d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMin(), initial_value, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMin( void* d_temp_storage, @@ -599,6 +606,7 @@ struct DeviceReduce return ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide maximum using the greater-than (``>``) operator. @@ -707,6 +715,7 @@ struct DeviceReduce stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Max(void* d_temp_storage, @@ -721,6 +730,7 @@ struct DeviceReduce return Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Finds the first device-wide maximum using the greater-than (``>``) @@ -844,6 +854,7 @@ struct DeviceReduce d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMax(), initial_value, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMax( void* d_temp_storage, @@ -858,6 +869,7 @@ struct DeviceReduce return ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! 
Fuses transform and reduce operations @@ -1183,6 +1195,7 @@ struct DeviceReduce stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template ( d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Enumerates the starting offsets and lengths of all non-trivial runs @@ -382,6 +384,7 @@ struct DeviceRunLengthEncode stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template ( d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index c8a36f0255..29f3cf6c1e 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -194,6 +194,7 @@ struct DeviceScan d_temp_storage, temp_storage_bytes, d_in, d_out, Sum(), detail::InputValue(init_value), num_items, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( void* d_temp_storage, @@ -209,6 +210,7 @@ struct DeviceScan return ExclusiveSum( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide exclusive prefix sum in-place. @@ -283,6 +285,7 @@ struct DeviceScan return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( void* d_temp_storage, @@ -296,6 +299,7 @@ struct DeviceScan return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide exclusive prefix scan using the specified @@ -426,6 +430,7 @@ struct DeviceScan stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, @@ -443,6 +448,7 @@ struct DeviceScan return ExclusiveScan( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide exclusive prefix scan using the specified @@ -550,6 +556,7 @@ struct DeviceScan return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, @@ -566,6 +573,7 @@ struct DeviceScan return ExclusiveScan( d_temp_storage, temp_storage_bytes, d_data, scan_op, init_value, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide exclusive prefix scan using the specified @@ -704,6 +712,7 @@ struct DeviceScan stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template ( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide exclusive prefix scan using the specified binary ``scan_op`` functor. 
@@ -835,6 +845,7 @@ struct DeviceScan return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, @@ -851,6 +862,7 @@ struct DeviceScan return ExclusiveScan( d_temp_storage, temp_storage_bytes, d_data, scan_op, init_value, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @} end member group //! @name Inclusive scans @@ -949,6 +961,7 @@ struct DeviceScan d_temp_storage, temp_storage_bytes, d_in, d_out, Sum(), NullType(), num_items, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( void* d_temp_storage, @@ -964,6 +977,7 @@ struct DeviceScan return InclusiveSum( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide inclusive prefix sum in-place. @@ -1037,6 +1051,7 @@ struct DeviceScan return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( void* d_temp_storage, @@ -1050,6 +1065,7 @@ struct DeviceScan return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide inclusive prefix scan using the specified binary ``scan_op`` functor. @@ -1266,6 +1282,7 @@ struct DeviceScan stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, @@ -1282,6 +1299,7 @@ struct DeviceScan return InclusiveScan( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide inclusive prefix scan using the specified binary ``scan_op`` functor. @@ -1379,6 +1397,7 @@ struct DeviceScan return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, num_items, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, @@ -1393,6 +1412,7 @@ struct DeviceScan return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, scan_op, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide exclusive prefix sum-by-key with key equality @@ -1530,6 +1550,7 @@ struct DeviceScan stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template ( d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items, equality_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide exclusive prefix scan-by-key using the @@ -1729,6 +1751,7 @@ struct DeviceScan stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template ( d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items, equality_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! 
Computes a device-wide inclusive prefix scan-by-key using the @@ -2081,6 +2107,7 @@ struct DeviceScan stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template ( d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, scan_op, num_items, equality_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @} end member group }; diff --git a/cub/cub/device/device_segmented_radix_sort.cuh b/cub/cub/device/device_segmented_radix_sort.cuh index cff9c22bce..eb6eecdcf3 100644 --- a/cub/cub/device/device_segmented_radix_sort.cuh +++ b/cub/cub/device/device_segmented_radix_sort.cuh @@ -264,6 +264,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, @@ -298,6 +299,7 @@ public: end_bit, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of key-value pairs into ascending order. (``~N`` auxiliary storage required) @@ -473,6 +475,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, @@ -503,6 +506,7 @@ public: end_bit, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of key-value pairs into descending order. (``~2N`` auxiliary storage required). @@ -678,6 +682,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, @@ -712,6 +717,7 @@ public: end_bit, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of key-value pairs into descending order. (``~N`` auxiliary storage required). @@ -891,6 +897,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, @@ -921,6 +928,7 @@ public: end_bit, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @} end member group //! @name Keys-only @@ -1083,6 +1091,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, @@ -1113,6 +1122,7 @@ public: end_bit, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of keys into ascending order. (``~N`` auxiliary storage required). @@ -1280,6 +1290,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, @@ -1308,6 +1319,7 @@ public: end_bit, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of keys into descending order. (``~2N`` auxiliary storage required). @@ -1466,6 +1478,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, @@ -1496,6 +1509,7 @@ public: end_bit, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of keys into descending order. (``~N`` auxiliary storage required). 
@@ -1661,6 +1675,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, @@ -1689,6 +1704,7 @@ public: end_bit, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @} end member group }; diff --git a/cub/cub/device/device_segmented_reduce.cuh b/cub/cub/device/device_segmented_reduce.cuh index 90a1729685..6a0875734e 100644 --- a/cub/cub/device/device_segmented_reduce.cuh +++ b/cub/cub/device/device_segmented_reduce.cuh @@ -272,6 +272,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Sum(void* d_temp_storage, @@ -436,6 +439,7 @@ public: return Sum( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide segmented minimum using the less-than (``<``) operator. @@ -558,6 +562,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Min(void* d_temp_storage, @@ -575,6 +580,7 @@ public: return Min( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Finds the first device-wide minimum in each segment using the @@ -726,6 +732,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMin( void* d_temp_storage, @@ -743,6 +750,7 @@ public: return ArgMin( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide segmented maximum using the greater-than (``>``) operator. @@ -859,6 +867,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Max(void* d_temp_storage, @@ -876,6 +885,7 @@ public: return Max( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! 
Finds the first device-wide maximum in each segment using the @@ -1030,6 +1040,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMax( void* d_temp_storage, @@ -1047,6 +1058,7 @@ public: return ArgMax( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END diff --git a/cub/cub/device/device_segmented_sort.cuh b/cub/cub/device/device_segmented_sort.cuh index 2aeb145c5d..67a22c5e54 100644 --- a/cub/cub/device/device_segmented_sort.cuh +++ b/cub/cub/device/device_segmented_sort.cuh @@ -306,6 +306,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, @@ -332,6 +333,7 @@ public: d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS private: // Internal version without NVTX range @@ -501,6 +503,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, @@ -527,6 +530,7 @@ public: d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS private: // Internal version without NVTX range @@ -698,6 +702,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, @@ -715,6 +720,7 @@ public: return SortKeys( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS private: // Internal version without NVTX range @@ -887,6 +893,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, @@ -904,6 +911,7 @@ public: return SortKeysDescending( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of keys into ascending order. Approximately @@ -1041,6 +1049,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys( void* d_temp_storage, @@ -1067,6 +1076,7 @@ public: d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of keys into descending order. @@ -1204,6 +1214,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeysDescending( void* d_temp_storage, @@ -1230,6 +1241,7 @@ public: d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of keys into ascending order. 
@@ -1369,6 +1381,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys( void* d_temp_storage, @@ -1386,6 +1399,7 @@ public: return StableSortKeys( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of keys into descending order. @@ -1524,6 +1538,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeysDescending( void* d_temp_storage, @@ -1541,6 +1556,7 @@ public: return StableSortKeysDescending( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS private: // Internal version without NVTX range @@ -1741,6 +1757,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, @@ -1771,6 +1788,7 @@ public: d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS private: // Internal version without NVTX range @@ -1967,6 +1985,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, @@ -1997,6 +2016,7 @@ public: d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS private: // Internal version without NVTX range @@ -2193,6 +2213,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, @@ -2219,6 +2240,7 @@ public: d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS private: // Internal version without NVTX range @@ -2414,6 +2436,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, @@ -2440,6 +2463,7 @@ public: d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of key-value pairs into ascending order. @@ -2599,6 +2623,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs( void* d_temp_storage, @@ -2629,6 +2654,7 @@ public: d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of key-value pairs into descending order. @@ -2788,6 +2814,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairsDescending( void* d_temp_storage, @@ -2818,6 +2845,7 @@ public: d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of key-value pairs into ascending order. 
@@ -2983,6 +3011,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs( void* d_temp_storage, @@ -3009,6 +3038,7 @@ public: d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of key-value pairs into descending order. @@ -3173,6 +3203,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairsDescending( void* d_temp_storage, @@ -3199,6 +3230,7 @@ public: d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @} end member group }; diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh index 3113d6ca82..703a912829 100644 --- a/cub/cub/device/device_select.cuh +++ b/cub/cub/device/device_select.cuh @@ -203,6 +203,7 @@ struct DeviceSelect stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged( void* d_temp_storage, @@ -220,6 +221,7 @@ struct DeviceSelect return Flagged( d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Uses the ``d_flags`` sequence to selectively compact the items in `d_data``. @@ -339,6 +341,7 @@ struct DeviceSelect stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged( void* d_temp_storage, @@ -355,6 +358,7 @@ struct DeviceSelect return Flagged( d_temp_storage, temp_storage_bytes, d_data, d_flags, d_num_selected_out, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Uses the ``select_op`` functor to selectively copy items from ``d_in`` into ``d_out``. @@ -494,6 +498,7 @@ struct DeviceSelect stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t If(void* d_temp_storage, @@ -511,6 +516,7 @@ struct DeviceSelect return If( d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Uses the ``select_op`` functor to selectively compact items in ``d_data``. @@ -642,6 +648,7 @@ struct DeviceSelect stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t If(void* d_temp_storage, @@ -658,6 +665,7 @@ struct DeviceSelect return If( d_temp_storage, temp_storage_bytes, d_data, d_num_selected_out, num_items, select_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Uses the ``select_op`` functor applied to ``d_flags`` to selectively copy the @@ -1003,6 +1011,7 @@ struct DeviceSelect stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Unique( void* d_temp_storage, @@ -1019,6 +1028,7 @@ struct DeviceSelect return Unique( d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! 
Given an input sequence ``d_keys_in`` and ``d_values_in`` with runs of key-value pairs with consecutive @@ -1320,6 +1330,7 @@ struct DeviceSelect stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template ::Dispatch(d_temp_storage, temp_storage_bytes, spmv_params, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t CsrMV( void* d_temp_storage, @@ -237,6 +238,7 @@ struct DeviceSpmv num_nonzeros, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @} end member group }; diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index dd161cf934..d4ae6ecddd 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -169,6 +169,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy , stream(stream) {} +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_DEPRECATED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchAdjacentDifference( void* d_temp_storage, @@ -189,6 +190,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } +#endif // DOXYGEN_SHOULD_SKIP_THIS /// Invocation template @@ -250,7 +252,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy init_grid_size, init_block_size, reinterpret_cast(stream)); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, init_block_size, 0, stream) .doit(DeviceAdjacentDifferenceInitKernel, @@ -280,7 +282,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, reinterpret_cast(stream)); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, 0, stream) @@ -354,6 +356,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Dispatch( void* d_temp_storage, @@ -369,6 +372,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy return Dispatch(d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index 05a82ec200..c623cda9a2 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -440,7 +440,7 @@ struct dispatch_histogram histogram_init_grid_dims, histogram_init_block_threads, (long long) stream); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke histogram_init_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -464,7 +464,7 @@ struct dispatch_histogram (long long) stream, pixels_per_thread, histogram_sweep_sm_occupancy); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke histogram_sweep_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(sweep_grid_dims, block_threads, 0, stream) @@ -657,9 +657,9 @@ public: ::cuda::std::is_same::value), // CommonT, // uint64_t> // -#else +#else // ^^^ CUB_IS_INT128_ENABLED ^^^ / vvv !CUB_IS_INT128_ENABLED vvv uint64_t -#endif +#endif // !CUB_IS_INT128_ENABLED 
>; // Alias template that excludes __[u]int128 from the integral types @@ -669,9 +669,9 @@ public: ::cuda::std::_If<::cuda::std::is_same::value&& ::cuda::std::is_same::value, ::cuda::std::false_type, ::cuda::std::is_integral>; -#else +#else // ^^^ CUB_IS_INT128_ENABLED ^^^ / vvv !CUB_IS_INT128_ENABLED vvv ::cuda::std::is_integral; -#endif +#endif // !CUB_IS_INT128_ENABLED union ScaleT { @@ -1036,6 +1036,7 @@ public: return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t DispatchRange( void* d_temp_storage, @@ -1066,6 +1067,7 @@ public: stream, is_byte_sample); } +#endif // DOXYGEN_SHOULD_SKIP_THIS /** * Dispatch routine for HistogramRange, specialized for 8-bit sample types @@ -1200,6 +1202,7 @@ public: return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t DispatchRange( void* d_temp_storage, @@ -1230,6 +1233,7 @@ public: stream, is_byte_sample); } +#endif // DOXYGEN_SHOULD_SKIP_THIS /** * Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit @@ -1416,6 +1420,7 @@ public: return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t DispatchEven( void* d_temp_storage, @@ -1448,6 +1453,7 @@ public: stream, is_byte_sample); } +#endif // DOXYGEN_SHOULD_SKIP_THIS /** * Dispatch routine for HistogramEven, specialized for 8-bit sample types @@ -1586,6 +1592,7 @@ public: return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t DispatchEven( void* d_temp_storage, size_t& temp_storage_bytes, @@ -1617,6 +1624,7 @@ public: stream, is_byte_sample); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index 262bcc2623..3b3c0c903e 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -609,6 +609,7 @@ struct DispatchReduce : SelectedPolicy , transform_op(transform_op) {} +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchReduce( void* d_temp_storage, @@ -633,6 +634,7 @@ struct DispatchReduce : SelectedPolicy { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } +#endif // DOXYGEN_SHOULD_SKIP_THIS //--------------------------------------------------------------------------- // Small-problem (single tile) invocation @@ -673,7 +675,7 @@ struct DispatchReduce : SelectedPolicy ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream, ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke single_reduce_sweep_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -795,7 +797,7 @@ struct DispatchReduce : SelectedPolicy (long long) stream, ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD, reduce_config.sm_occupancy); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke DeviceReduceKernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -823,7 +825,7 @@ struct DispatchReduce : SelectedPolicy ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream, ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke 
DeviceReduceSingleTileKernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -977,6 +979,7 @@ struct DispatchReduce : SelectedPolicy return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -993,6 +996,7 @@ struct DispatchReduce : SelectedPolicy return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, init, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; /** @@ -1151,6 +1155,7 @@ struct DispatchSegmentedReduce : SelectedPolicy , ptx_version(ptx_version) {} +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchSegmentedReduce( void* d_temp_storage, @@ -1179,6 +1184,7 @@ struct DispatchSegmentedReduce : SelectedPolicy { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } +#endif // DOXYGEN_SHOULD_SKIP_THIS //--------------------------------------------------------------------------- // Chained policy invocation @@ -1231,7 +1237,7 @@ struct DispatchSegmentedReduce : SelectedPolicy (long long) stream, ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD, segmented_reduce_config.sm_occupancy); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke DeviceReduceKernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -1379,6 +1385,7 @@ struct DispatchSegmentedReduce : SelectedPolicy return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -1407,6 +1414,7 @@ struct DispatchSegmentedReduce : SelectedPolicy init, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index 8d5c3fb699..07dd492a53 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -348,7 +348,7 @@ struct DispatchReduceByKey #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -405,7 +405,7 @@ struct DispatchReduceByKey (long long) stream, items_per_thread, reduce_by_key_sm_occupancy); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke reduce_by_key_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) @@ -550,6 +550,7 @@ struct DispatchReduceByKey return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -580,6 +581,7 @@ struct DispatchReduceByKey num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index 9c6c32a95c..917b5df37b 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -354,7 +354,7 @@ struct DeviceRleDispatch init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG 
// Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -415,7 +415,7 @@ struct DeviceRleDispatch (long long) stream, items_per_thread, device_rle_kernel_sm_occupancy); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke device_rle_sweep_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) @@ -543,6 +543,7 @@ struct DeviceRleDispatch return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, @@ -568,6 +569,7 @@ struct DeviceRleDispatch num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index ac82b5cd2b..56c2be9611 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -330,6 +330,7 @@ struct DispatchScan : SelectedPolicy , ptx_version(ptx_version) {} +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchScan( void* d_temp_storage, @@ -354,6 +355,7 @@ struct DispatchScan : SelectedPolicy { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } +#endif // DOXYGEN_SHOULD_SKIP_THIS template CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel) @@ -426,7 +428,7 @@ struct DispatchScan : SelectedPolicy #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -478,7 +480,7 @@ struct DispatchScan : SelectedPolicy (long long) stream, Policy::ITEMS_PER_THREAD, scan_sm_occupancy); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke scan_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream) @@ -591,6 +593,7 @@ struct DispatchScan : SelectedPolicy return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -607,6 +610,7 @@ struct DispatchScan : SelectedPolicy return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh index eac364d77e..032554773a 100644 --- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -339,6 +339,7 @@ struct DispatchScanByKey : SelectedPolicy , ptx_version(ptx_version) {} +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchScanByKey( void* d_temp_storage, @@ -367,6 +368,7 @@ struct DispatchScanByKey : SelectedPolicy { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } +#endif // DOXYGEN_SHOULD_SKIP_THIS template CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(InitKernel 
init_kernel, ScanKernel scan_kernel) @@ -436,7 +438,7 @@ struct DispatchScanByKey : SelectedPolicy int init_grid_size = cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -489,7 +491,7 @@ struct DispatchScanByKey : SelectedPolicy (long long) stream, Policy::ITEMS_PER_THREAD, scan_sm_occupancy); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke scan_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream) @@ -631,6 +633,7 @@ struct DispatchScanByKey : SelectedPolicy return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -659,6 +662,7 @@ struct DispatchScanByKey : SelectedPolicy num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index 84c81f34a9..702df00df3 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -548,7 +548,7 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN cudaError_t DeviceSegmentedSortCont static_cast(blocks_in_grid), LargeSegmentPolicyT::BLOCK_THREADS, (long long) stream); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( blocks_in_grid, LargeSegmentPolicyT::BLOCK_THREADS, 0, stream) @@ -596,7 +596,7 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN cudaError_t DeviceSegmentedSortCont static_cast(small_and_medium_blocks_in_grid), SmallAndMediumPolicyT::BLOCK_THREADS, (long long) stream); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( small_and_medium_blocks_in_grid, SmallAndMediumPolicyT::BLOCK_THREADS, 0, stream) @@ -1131,6 +1131,7 @@ struct DispatchSegmentedSort : SelectedPolicy , stream(stream) {} +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchSegmentedSort( void* d_temp_storage, @@ -1157,6 +1158,7 @@ struct DispatchSegmentedSort : SelectedPolicy { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } +#endif // DOXYGEN_SHOULD_SKIP_THIS template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() @@ -1438,6 +1440,7 @@ struct DispatchSegmentedSort : SelectedPolicy return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -1466,6 +1469,7 @@ struct DispatchSegmentedSort : SelectedPolicy is_overwrite_okay, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS private: CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE int GetNumPasses(int radix_bits) @@ -1651,7 +1655,7 @@ private: (long long) stream, LargeSegmentPolicyT::ITEMS_PER_THREAD, LargeSegmentPolicyT::RADIX_BITS); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke fallback kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(blocks_in_grid, threads_in_block, 0, stream) diff --git 
a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index 24c25b3679..50a2022184 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -467,7 +467,7 @@ struct DispatchSelectIf : SelectedPolicy #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog( "Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke scan_init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -530,7 +530,7 @@ struct DispatchSelectIf : SelectedPolicy items_per_thread, range_select_sm_occupancy); } -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke select_if_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) @@ -656,6 +656,7 @@ struct DispatchSelectIf : SelectedPolicy return CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -684,6 +685,7 @@ struct DispatchSelectIf : SelectedPolicy num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh index 95e51ebdba..0519dcc739 100644 --- a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh +++ b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -631,7 +631,7 @@ struct DispatchSpmv blocks_in_grid, threads_in_block, (long long) stream); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(blocks_in_grid, threads_in_block, 0, stream) .doit(spmv_empty_matrix_kernel, spmv_params); @@ -668,7 +668,7 @@ struct DispatchSpmv degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke spmv_search_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -795,7 +795,7 @@ struct DispatchSpmv search_grid_size, search_block_size, (long long) stream); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke spmv_search_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(search_grid_size, search_block_size, 0, stream) @@ -825,7 +825,7 @@ struct DispatchSpmv (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke spmv_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(spmv_grid_size, spmv_config.block_threads, 0, stream) @@ -863,7 +863,7 @@ struct DispatchSpmv (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke segment_fixup_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -893,6 +893,7 @@ struct DispatchSpmv return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template (stream)); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke three_way_partition_init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -345,7 +345,7 @@ struct DispatchThreeWayPartitionIf items_per_thread, range_select_sm_occupancy); } -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke 
select_if_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) @@ -452,6 +452,7 @@ struct DispatchThreeWayPartitionIf return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -482,6 +483,7 @@ struct DispatchThreeWayPartitionIf num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh index 1d097a93a0..c943034221 100644 --- a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh @@ -322,6 +322,7 @@ struct DispatchUniqueByKey : SelectedPolicy , stream(stream) {} +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchUniqueByKey( void* d_temp_storage, @@ -348,6 +349,7 @@ struct DispatchUniqueByKey : SelectedPolicy { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } +#endif // DOXYGEN_SHOULD_SKIP_THIS /****************************************************************************** * Dispatch entrypoints @@ -425,7 +427,7 @@ struct DispatchUniqueByKey : SelectedPolicy #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -488,7 +490,7 @@ struct DispatchUniqueByKey : SelectedPolicy items_per_thread, scan_sm_occupancy); } -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke select_if_kernel error = @@ -629,6 +631,7 @@ struct DispatchUniqueByKey : SelectedPolicy return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -657,6 +660,7 @@ struct DispatchUniqueByKey : SelectedPolicy num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END From a3a5f9c227c63c8f328ed2c181f0e935cc713eda Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Fri, 9 Aug 2024 10:46:24 -0700 Subject: [PATCH 15/33] CUDA `vector_add` sample project (#2160) --------- Co-authored-by: pciolkosz Co-authored-by: Michael Schellenberger Costa --- cudax/CMakeLists.txt | 13 +- cudax/cmake/cudaxBuildCompilerTargets.cmake | 2 +- cudax/cmake/cudaxBuildTargetList.cmake | 1 + .../cuda/experimental/__detail/utility.cuh | 19 ++- .../cuda/experimental/__launch/param_kind.cuh | 85 ++++++++++ cudax/include/cuda/experimental/launch.cuh | 3 + cudax/samples/CMakeLists.txt | 76 +++++++++ cudax/samples/cmake/CPM.cmake | 33 ++++ cudax/samples/vector_add/param_kind.cuh | 85 ++++++++++ cudax/samples/vector_add/vector.cuh | 151 ++++++++++++++++++ cudax/samples/vector_add/vector_add.cu | 127 +++++++++++++++ 11 files changed, 589 insertions(+), 6 deletions(-) create mode 100644 cudax/include/cuda/experimental/__launch/param_kind.cuh create mode 100755 cudax/samples/CMakeLists.txt create mode 100755 cudax/samples/cmake/CPM.cmake create mode 100644 cudax/samples/vector_add/param_kind.cuh create mode 100644 cudax/samples/vector_add/vector.cuh create mode 100644 cudax/samples/vector_add/vector_add.cu diff --git 
a/cudax/CMakeLists.txt b/cudax/CMakeLists.txt index 4886562aca..f875cf8ebf 100644 --- a/cudax/CMakeLists.txt +++ b/cudax/CMakeLists.txt @@ -11,7 +11,7 @@ if (cudax_TOPLEVEL_PROJECT) cmake_minimum_required(VERSION 3.21) endif() -project(cudax LANGUAGES CUDA) +project(cudax LANGUAGES CUDA CXX) option(cudax_ENABLE_INSTALL_RULES "Enable installation of CUDA Experimental." ${cudax_TOPLEVEL_PROJECT}) if (cudax_ENABLE_INSTALL_RULES) @@ -25,6 +25,7 @@ endif() option(cudax_ENABLE_HEADER_TESTING "Test that CUDA Experimental's public headers compile." ON) option(cudax_ENABLE_TESTING "Build CUDA Experimental's tests." ON) +option(cudax_ENABLE_SAMPLES "Build CUDA Experimental's samples." ON) include(cmake/cudaxBuildCompilerTargets.cmake) include(cmake/cudaxBuildTargetList.cmake) @@ -41,3 +42,13 @@ if (cudax_ENABLE_TESTING) enable_testing() # Must be in root directory add_subdirectory(test) endif() + +if (cudax_ENABLE_SAMPLES) + include(ExternalProject) + ExternalProject_Add(cudax_samples + PREFIX samples + SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/samples" + BUILD_ALWAYS ON + INSTALL_COMMAND cmake -E echo "Skipping install step.") + add_dependencies(cudax.all cudax_samples) +endif() diff --git a/cudax/cmake/cudaxBuildCompilerTargets.cmake b/cudax/cmake/cudaxBuildCompilerTargets.cmake index 73aa9e376e..53cf7b8af4 100644 --- a/cudax/cmake/cudaxBuildCompilerTargets.cmake +++ b/cudax/cmake/cudaxBuildCompilerTargets.cmake @@ -9,7 +9,7 @@ include("${cudax_SOURCE_DIR}/cmake/AppendOptionIfAvailable.cmake") function(cudax_build_compiler_targets) - set(cxx_compile_definitions) + set(cxx_compile_definitions LIBCUDACXX_ENABLE_EXCEPTIONS) set(cxx_compile_options) set(cuda_compile_options) diff --git a/cudax/cmake/cudaxBuildTargetList.cmake b/cudax/cmake/cudaxBuildTargetList.cmake index 63284dbe4a..2be17393dc 100644 --- a/cudax/cmake/cudaxBuildTargetList.cmake +++ b/cudax/cmake/cudaxBuildTargetList.cmake @@ -176,6 +176,7 @@ function(cudax_build_target_list) file(GLOB_RECURSE all_sources RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${cudax_SOURCE_DIR}/include/cuda/experimental/*.hpp" + "${cudax_SOURCE_DIR}/include/cuda/experimental/*.cuh" ) add_custom_target(cudax.all SOURCES ${all_sources}) diff --git a/cudax/include/cuda/experimental/__detail/utility.cuh b/cudax/include/cuda/experimental/__detail/utility.cuh index 738a5d6244..1263ea880f 100644 --- a/cudax/include/cuda/experimental/__detail/utility.cuh +++ b/cudax/include/cuda/experimental/__detail/utility.cuh @@ -25,12 +25,23 @@ namespace cuda::experimental { namespace detail { -struct __ignore +// This is a helper type that can be used to ignore function arguments. +struct [[maybe_unused]] __ignore { - template - _CCCL_HOST_DEVICE constexpr __ignore(Args&&...) noexcept + __ignore() = default; + + template + _CCCL_HOST_DEVICE constexpr __ignore(_Arg&&) noexcept {} }; + +// Classes can inherit from this type to become immovable. 
+struct __immovable +{ + __immovable() = default; + __immovable(__immovable&&) = delete; + __immovable& operator=(__immovable&&) = delete; +}; } // namespace detail struct uninit_t @@ -38,7 +49,7 @@ struct uninit_t explicit uninit_t() = default; }; -inline constexpr uninit_t uninit{}; +_CCCL_GLOBAL_CONSTANT uninit_t uninit{}; } // namespace cuda::experimental #endif // __CUDAX_DETAIL_UTILITY_H diff --git a/cudax/include/cuda/experimental/__launch/param_kind.cuh b/cudax/include/cuda/experimental/__launch/param_kind.cuh new file mode 100644 index 0000000000..d50ebe49d3 --- /dev/null +++ b/cudax/include/cuda/experimental/__launch/param_kind.cuh @@ -0,0 +1,85 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX__LAUNCH_PARAM_KIND +#define _CUDAX__LAUNCH_PARAM_KIND + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include + +namespace cuda::experimental +{ +namespace detail +{ +enum class __param_kind : unsigned +{ + _in = 1, + _out = 2, + _inout = 3 +}; + +_CCCL_NODISCARD _CCCL_HOST_DEVICE inline constexpr __param_kind operator&(__param_kind __a, __param_kind __b) noexcept +{ + return __param_kind(unsigned(__a) & unsigned(__b)); +} + +template +struct _CCCL_NODISCARD __box +{ + ::cuda::std::__maybe_const<_Kind == __param_kind::_in, _Ty>& __val; +}; + +struct __in_t +{ + template + __box<_Ty, __param_kind::_in> operator()(const _Ty& __v) const noexcept + { + return {__v}; + } +}; + +struct __out_t +{ + template + __box<_Ty, __param_kind::_out> operator()(_Ty& __v) const noexcept + { + return {__v}; + } +}; + +struct __inout_t +{ + template + __box<_Ty, __param_kind::_inout> operator()(_Ty& __v) const noexcept + { + return {__v}; + } +}; + +} // namespace detail + +_CCCL_GLOBAL_CONSTANT detail::__in_t in{}; +_CCCL_GLOBAL_CONSTANT detail::__out_t out{}; +_CCCL_GLOBAL_CONSTANT detail::__inout_t inout{}; + +} // namespace cuda::experimental + +#endif // _CUDAX__LAUNCH_PARAM_KIND diff --git a/cudax/include/cuda/experimental/launch.cuh b/cudax/include/cuda/experimental/launch.cuh index 69048248ef..0bac26aa01 100644 --- a/cudax/include/cuda/experimental/launch.cuh +++ b/cudax/include/cuda/experimental/launch.cuh @@ -11,6 +11,9 @@ #ifndef __CUDAX_LAUNCH___ #define __CUDAX_LAUNCH___ +#include #include +#include +#include #endif // __CUDAX_LAUNCH___ diff --git a/cudax/samples/CMakeLists.txt b/cudax/samples/CMakeLists.txt new file mode 100755 index 0000000000..df0985c1ad --- /dev/null +++ b/cudax/samples/CMakeLists.txt @@ -0,0 +1,76 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +cmake_minimum_required(VERSION 3.14 FATAL_ERROR) + +project(CUDAX_SAMPLES CUDA CXX) + +# This example uses the CMake Package Manager (CPM) to simplify fetching CCCL from GitHub +# For more information, see https://github.com/cpm-cmake/CPM.cmake +include(cmake/CPM.cmake) + +# We define these as variables so they can be overridden in CI to pull from a PR instead of CCCL `main` +# In your project, these variables are unnecessary and you can just use the values directly +set(CCCL_REPOSITORY "nvidia/cccl" CACHE STRING "GitHub repository to fetch CCCL from") +set(CCCL_TAG "main" CACHE STRING "Git tag/branch to fetch from CCCL repository") + +# This will automatically clone CCCL from GitHub and make the exported cmake targets available +CPMAddPackage( + NAME CCCL + GITHUB_REPOSITORY ${CCCL_REPOSITORY} + GIT_TAG ${CCCL_TAG} + GIT_SHALLOW ON + OPTIONS "CCCL_ENABLE_UNSTABLE ON" +) + +# Default to building for the GPU on the current system +if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES 86) +endif() + +# Creates a cmake executable target for the main program +add_executable(vector_add vector_add/vector_add.cu) + +# "Links" the CCCL::cudax CMake target to the `vector_add` executable. This +# configures everything needed to use CCCL's headers, including setting up +# include paths, compiler flags, etc. +target_link_libraries(vector_add + PUBLIC + CCCL::cudax + CCCL::CCCL + CCCL::Thrust + CCCL::libcudacxx + INTERFACE cudax.compiler_interface +) + +# TODO: These are temporary until the main branch catches up with the latest changes +target_compile_definitions(vector_add PUBLIC LIBCUDACXX_ENABLE_EXCEPTIONS) + +if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") + # mdspan on windows only works in C++20 mode + target_compile_features(vector_add PUBLIC cxx_std_20) + + # cudax requires dim3 to be usable from a constexpr context, and the CUDART headers require + # __cplusplus to be defined for this to work: + target_compile_options(vector_add PRIVATE + $<$:/Zc:__cplusplus /Zc:preprocessor> + $<$:-Xcompiler=/Zc:__cplusplus -Xcompiler=/Zc:preprocessor> + ) +endif() + +# This is only relevant for internal testing and not needed by end users. +include(CTest) +enable_testing() +add_test(NAME vector_add COMMAND vector_add) diff --git a/cudax/samples/cmake/CPM.cmake b/cudax/samples/cmake/CPM.cmake new file mode 100755 index 0000000000..a3086b791b --- /dev/null +++ b/cudax/samples/cmake/CPM.cmake @@ -0,0 +1,33 @@ +set(CPM_DOWNLOAD_VERSION 0.38.1) + +if(CPM_SOURCE_CACHE) + set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") +elseif(DEFINED ENV{CPM_SOURCE_CACHE}) + set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") +else() + set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake") +endif() + +# Expand relative path.
This is important if the provided path contains a tilde (~) +get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE) + +function(download_cpm) + message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}") + file(DOWNLOAD + https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake + ${CPM_DOWNLOAD_LOCATION} + ) +endfunction() + +if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION})) + download_cpm() +else() + # resume download if it previously failed + file(READ ${CPM_DOWNLOAD_LOCATION} check) + if("${check}" STREQUAL "") + download_cpm() + endif() + unset(check) +endif() + +include(${CPM_DOWNLOAD_LOCATION}) diff --git a/cudax/samples/vector_add/param_kind.cuh b/cudax/samples/vector_add/param_kind.cuh new file mode 100644 index 0000000000..d50ebe49d3 --- /dev/null +++ b/cudax/samples/vector_add/param_kind.cuh @@ -0,0 +1,85 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX__LAUNCH_PARAM_KIND +#define _CUDAX__LAUNCH_PARAM_KIND + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include + +namespace cuda::experimental +{ +namespace detail +{ +enum class __param_kind : unsigned +{ + _in = 1, + _out = 2, + _inout = 3 +}; + +_CCCL_NODISCARD _CCCL_HOST_DEVICE inline constexpr __param_kind operator&(__param_kind __a, __param_kind __b) noexcept +{ + return __param_kind(unsigned(__a) & unsigned(__b)); +} + +template +struct _CCCL_NODISCARD __box +{ + ::cuda::std::__maybe_const<_Kind == __param_kind::_in, _Ty>& __val; +}; + +struct __in_t +{ + template + __box<_Ty, __param_kind::_in> operator()(const _Ty& __v) const noexcept + { + return {__v}; + } +}; + +struct __out_t +{ + template + __box<_Ty, __param_kind::_out> operator()(_Ty& __v) const noexcept + { + return {__v}; + } +}; + +struct __inout_t +{ + template + __box<_Ty, __param_kind::_inout> operator()(_Ty& __v) const noexcept + { + return {__v}; + } +}; + +} // namespace detail + +_CCCL_GLOBAL_CONSTANT detail::__in_t in{}; +_CCCL_GLOBAL_CONSTANT detail::__out_t out{}; +_CCCL_GLOBAL_CONSTANT detail::__inout_t inout{}; + +} // namespace cuda::experimental + +#endif // _CUDAX__LAUNCH_PARAM_KIND diff --git a/cudax/samples/vector_add/vector.cuh b/cudax/samples/vector_add/vector.cuh new file mode 100644 index 0000000000..7eef87f038 --- /dev/null +++ b/cudax/samples/vector_add/vector.cuh @@ -0,0 +1,151 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX__CONTAINER_VECTOR +#define _CUDAX__CONTAINER_VECTOR + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include +#include +#include + +#include + +#include "param_kind.cuh" + +#if _CCCL_STD_VER >= 2017 +namespace cuda::experimental +{ +using ::cuda::std::span; +using ::thrust::device_vector; +using ::thrust::host_vector; + +template +class vector +{ +public: + vector() = default; + explicit vector(size_t __n) + : __h_(__n) + {} + + _Ty& operator[](size_t __i) noexcept + { + __dirty_ = true; + return __h_[__i]; + } + + const _Ty& operator[](size_t __i) const noexcept + { + return __h_[__i]; + } + +private: + void sync_host_to_device(stream_ref __str, detail::__param_kind __p) const + { + if (__dirty_) + { + if (__p == detail::__param_kind::_out) + { + // There's no need to copy the data from host to device if the data is + // only going to be written to. We can just allocate the device memory. + __d_.resize(__h_.size()); + } + else + { + // TODO: use a memcpy async here + __d_ = __h_; + } + __dirty_ = false; + } + } + + void sync_device_to_host(stream_ref __str, detail::__param_kind __p) const + { + if (__p != detail::__param_kind::_in) + { + // TODO: use a memcpy async here + __str.wait(); // wait for the kernel to finish executing + __h_ = __d_; + } + } + + template + class __action //: private detail::__immovable + { + using __cv_vector = ::cuda::std::__maybe_const<_Kind == detail::__param_kind::_in, vector>; + + public: + explicit __action(stream_ref __str, __cv_vector& __v) noexcept + : __str_(__str) + , __v_(__v) + { + __v_.sync_host_to_device(__str_, _Kind); + } + + __action(__action&&) = delete; + + ~__action() + { + __v_.sync_device_to_host(__str_, _Kind); + } + + using __as_kernel_arg = ::cuda::std::span<_Ty>; + + operator ::cuda::std::span<_Ty>() + { + return {__v_.__d_.data().get(), __v_.__d_.size()}; + } + + private: + stream_ref __str_; + __cv_vector& __v_; + }; + + _CCCL_NODISCARD_FRIEND __action + __cudax_launch_transform(stream_ref __str, vector& __v) noexcept + { + return __action{__str, __v}; + } + + _CCCL_NODISCARD_FRIEND __action + __cudax_launch_transform(stream_ref __str, const vector& __v) noexcept + { + return __action{__str, __v}; + } + + template + _CCCL_NODISCARD_FRIEND __action<_Kind> + __cudax_launch_transform(stream_ref __str, detail::__box __b) noexcept + { + return __action<_Kind>{__str, __b.__val}; + } + + mutable host_vector<_Ty> __h_; + mutable device_vector<_Ty> __d_{}; + mutable bool __dirty_ = true; +}; + +} // namespace cuda::experimental + +#endif +#endif diff --git a/cudax/samples/vector_add/vector_add.cu b/cudax/samples/vector_add/vector_add.cu new file mode 100644 index 0000000000..784997e23d --- /dev/null +++ b/cudax/samples/vector_add/vector_add.cu @@ -0,0 +1,127 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * Vector addition: C = A + B. + * + * This sample is a very basic sample that implements element by element + * vector addition. It is the same as the sample illustrating Chapter 2 + * of the programming guide with some additions like error checking. + */ + +#include + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +#include + +#include +#include + +#include "vector.cuh" + +namespace cudax = cuda::experimental; +using cudax::in; +using cudax::out; + +/** + * CUDA Kernel Device code + * + * Computes the vector addition of A and B into C. The 3 vectors have the same + * number of elements numElements. + */ +__global__ void vectorAdd(cudax::span A, cudax::span B, cudax::span C) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < A.size()) + { + C[i] = A[i] + B[i] + 0.0f; + } +} + +/** + * Host main routine + */ +int main(void) +try +{ + // A CUDA stream on which to execute the vector addition kernel + cudax::stream stream(cudax::devices[0]); + + // Print the vector length to be used, and compute its size + int numElements = 50000; + printf("[Vector addition of %d elements]\n", numElements); + + // Allocate the host vectors + cudax::vector A(numElements); // input + cudax::vector B(numElements); // input + cudax::vector C(numElements); // output + + // Initialize the host input vectors + for (int i = 0; i < numElements; ++i) + { + A[i] = rand() / (float) RAND_MAX; + B[i] = rand() / (float) RAND_MAX; + } + + // Define the kernel launch parameters + constexpr int threadsPerBlock = 256; + auto dims = cudax::distribute(numElements); + + // Launch the vectorAdd kernel + printf("CUDA kernel launch with %d blocks of %d threads\n", dims.count(cudax::block, cudax::grid), threadsPerBlock); + cudax::launch(stream, dims, vectorAdd, in(A), in(B), out(C)); + + printf("waiting for the stream to finish\n"); + stream.wait(); + + printf("verifying the results\n"); + // Verify that the result vector is correct + for (int i = 0; i < numElements; ++i) + { + if (fabs(A[i] + B[i] - C[i]) > 1e-5) + { + fprintf(stderr, "Result verification failed at element %d!\n", i); + exit(EXIT_FAILURE); + } + } + + printf("Test PASSED\n"); + + printf("Done\n"); + return 0; +} +catch (const std::exception& e) +{ + printf("caught an exception: \"%s\"\n", e.what()); +} +catch (...)
+{ + printf("caught an unknown exception\n"); +} From 6ee3415a8d0eea82d0d6f9915aa249a6ceb13e24 Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Mon, 12 Aug 2024 10:23:40 -0700 Subject: [PATCH 16/33] avoid constraint recursion in the `resource` concept (#2215) drive-by: avoid potential overload ambiguity in `__launch_transform` --- .../__launch/launch_transform.cuh | 34 +++++++++---------- .../cuda_managed_memory_resource.h | 6 ++-- .../cuda_pinned_memory_resource.h | 6 ++-- .../include/cuda/__memory_resource/resource.h | 24 +++++++++++-- .../equality.pass.cpp | 7 ++++ .../cuda_memory_resource/equality.pass.cpp | 7 ++++ .../equality.pass.cpp | 7 ++++ 7 files changed, 66 insertions(+), 25 deletions(-) diff --git a/cudax/include/cuda/experimental/__launch/launch_transform.cuh b/cudax/include/cuda/experimental/__launch/launch_transform.cuh index 4692cf9376..b131ccdfaa 100644 --- a/cudax/include/cuda/experimental/__launch/launch_transform.cuh +++ b/cudax/include/cuda/experimental/__launch/launch_transform.cuh @@ -32,17 +32,7 @@ namespace cuda::experimental namespace detail { // Types should define overloads of __cudax_launch_transform that are find-able -// by ADL in order to customize how cudax::launch handles that type. The -// overload below, which simply returns the argument unmodified, is the overload -// that gets chosen if no other overload matches. It takes __ignore as the first -// argument to make this overload less preferred than other overloads that take -// a stream_ref as the first argument. -template -_CCCL_NODISCARD constexpr _Arg&& __cudax_launch_transform(__ignore, _Arg&& __arg) noexcept -{ - return _CUDA_VSTD::forward<_Arg>(__arg); -} - +// by ADL in order to customize how cudax::launch handles that type. template using __launch_transform_direct_result_t = decltype(__cudax_launch_transform(::cuda::stream_ref{}, _CUDA_VSTD::declval<_Arg>())); @@ -50,25 +40,35 @@ using __launch_transform_direct_result_t = struct __fn { template - _CCCL_NODISCARD __launch_transform_direct_result_t<_Arg> operator()(::cuda::stream_ref __stream, _Arg&& __arg) const + _CCCL_NODISCARD decltype(auto) operator()(::cuda::stream_ref __stream, _Arg&& __arg) const { - // This call is unqualified to allow ADL - return __cudax_launch_transform(__stream, _CUDA_VSTD::forward<_Arg>(__arg)); + if constexpr (::cuda::std::_IsValidExpansion<__launch_transform_direct_result_t, _Arg>::value) + { + // This call is unqualified to allow ADL + return __cudax_launch_transform(__stream, _CUDA_VSTD::forward<_Arg>(__arg)); + } + else + { + return _CUDA_VSTD::forward<_Arg>(__arg); + } } }; +template +using __launch_transform_result_t = decltype(__fn{}(::cuda::stream_ref{}, _CUDA_VSTD::declval<_Arg>())); + template struct __as_kernel_arg { - using type = _CUDA_VSTD::decay_t<__launch_transform_direct_result_t<_Arg>>; + using type = _CUDA_VSTD::decay_t<__launch_transform_result_t<_Arg>>; }; template struct __as_kernel_arg< _Arg, - _CUDA_VSTD::void_t>::__as_kernel_arg>> + _CUDA_VSTD::void_t>::__as_kernel_arg>> { - using type = typename _CUDA_VSTD::decay_t<__launch_transform_direct_result_t<_Arg>>::__as_kernel_arg; + using type = typename _CUDA_VSTD::decay_t<__launch_transform_result_t<_Arg>>::__as_kernel_arg; }; _CCCL_GLOBAL_CONSTANT __fn __launch_transform{}; diff --git a/libcudacxx/include/cuda/__memory_resource/cuda_managed_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/cuda_managed_memory_resource.h index 75ba16bd05..a8a42841de 100644 --- 
a/libcudacxx/include/cuda/__memory_resource/cuda_managed_memory_resource.h +++ b/libcudacxx/include/cuda/__memory_resource/cuda_managed_memory_resource.h @@ -80,7 +80,7 @@ class cuda_managed_memory_resource { // We need to ensure that the provided alignment matches the minimal provided alignment _LIBCUDACXX_ASSERT(__is_valid_alignment(__alignment), - "Invalid alignment passed to cuda_memory_resource::deallocate."); + "Invalid alignment passed to cuda_managed_memory_resource::deallocate."); _CCCL_ASSERT_CUDA_API(::cudaFree, "cuda_managed_memory_resource::deallocate failed", __ptr); (void) __alignment; } @@ -102,8 +102,8 @@ class cuda_managed_memory_resource } # endif // _CCCL_STD_VER <= 2017 - //! @brief Equality comparison between a \c cuda_memory_resource and another resource - //! @param __lhs The \c cuda_memory_resource + //! @brief Equality comparison between a \c cuda_managed_memory_resource and another resource + //! @param __lhs The \c cuda_managed_memory_resource //! @param __rhs The resource to compare to //! @return If the underlying types are equality comparable, returns the result of equality comparison of both //! resources. Otherwise, returns false. diff --git a/libcudacxx/include/cuda/__memory_resource/cuda_pinned_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/cuda_pinned_memory_resource.h index ac7fd07b96..f8fc3a25ce 100644 --- a/libcudacxx/include/cuda/__memory_resource/cuda_pinned_memory_resource.h +++ b/libcudacxx/include/cuda/__memory_resource/cuda_pinned_memory_resource.h @@ -82,7 +82,7 @@ class cuda_pinned_memory_resource { // We need to ensure that the provided alignment matches the minimal provided alignment _LIBCUDACXX_ASSERT(__is_valid_alignment(__alignment), - "Invalid alignment passed to cuda_memory_resource::deallocate."); + "Invalid alignment passed to cuda_pinned_memory_resource::deallocate."); _CCCL_ASSERT_CUDA_API(::cudaFreeHost, "cuda_pinned_memory_resource::deallocate failed", __ptr); (void) __alignment; } @@ -104,8 +104,8 @@ class cuda_pinned_memory_resource } # endif // _CCCL_STD_VER <= 2017 - //! @brief Equality comparison between a \c cuda_memory_resource and another resource - //! @param __lhs The \c cuda_memory_resource + //! @brief Equality comparison between a \c cuda_pinned_memory_resource and another resource + //! @param __lhs The \c cuda_pinned_memory_resource //! @param __rhs The resource to compare to //! @return If the underlying types are equality comparable, returns the result of equality comparison of both //! resources. Otherwise, returns false. 
diff --git a/libcudacxx/include/cuda/__memory_resource/resource.h b/libcudacxx/include/cuda/__memory_resource/resource.h index 8328d9809c..0692269b80 100644 --- a/libcudacxx/include/cuda/__memory_resource/resource.h +++ b/libcudacxx/include/cuda/__memory_resource/resource.h @@ -25,6 +25,7 @@ # include # include +# include # include # include # include @@ -99,10 +100,29 @@ template _LIBCUDACXX_CONCEPT async_resource_with = async_resource<_Resource> && _CUDA_VSTD::__all_of...>; +template +struct __different_resource__ +{ + template + static constexpr bool __value(_OtherResource*) noexcept + { + return resource<_OtherResource>; + } +}; + +template <> +struct __different_resource__ +{ + static constexpr bool __value(void*) noexcept + { + return false; + } +}; + template _LIBCUDACXX_CONCEPT __different_resource = - (!_CUDA_VSTD::same_as<_CUDA_VSTD::decay_t<_Resource>, _CUDA_VSTD::decay_t<_OtherResource>>) - && resource<_OtherResource>; + __different_resource__<_CUDA_VSTD::convertible_to<_OtherResource const&, _Resource const&>>::__value( + static_cast<_OtherResource*>(nullptr)); _LIBCUDACXX_END_NAMESPACE_CUDA_MR diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_managed_memory_resource/equality.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_managed_memory_resource/equality.pass.cpp index 80fb2ab57e..f2e14578f7 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_managed_memory_resource/equality.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_managed_memory_resource/equality.pass.cpp @@ -56,6 +56,13 @@ struct async_resource : public resource static_assert(cuda::mr::async_resource>, ""); static_assert(cuda::mr::async_resource>, ""); +// test for cccl#2214: https://github.com/NVIDIA/cccl/issues/2214 +struct derived_managed_resource : cuda::mr::cuda_managed_memory_resource +{ + using cuda::mr::cuda_managed_memory_resource::cuda_managed_memory_resource; +}; +static_assert(cuda::mr::resource, ""); + void test() { cuda::mr::cuda_managed_memory_resource first{}; diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/equality.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/equality.pass.cpp index 94d659f90f..50fd7476ba 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/equality.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/equality.pass.cpp @@ -66,6 +66,13 @@ static_assert(cuda::mr::async_resource static_assert(cuda::mr::async_resource_with, cuda::mr::device_accessible>, ""); +// test for cccl#2214: https://github.com/NVIDIA/cccl/issues/2214 +struct derived_resource : cuda::mr::cuda_memory_resource +{ + using cuda::mr::cuda_memory_resource::cuda_memory_resource; +}; +static_assert(cuda::mr::resource, ""); + // Ensure that we can only void test() diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_pinned_memory_resource/equality.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_pinned_memory_resource/equality.pass.cpp index 7cab309a33..dd480cc9f7 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_pinned_memory_resource/equality.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_pinned_memory_resource/equality.pass.cpp @@ -56,6 +56,13 @@ struct async_resource : public resource static_assert(cuda::mr::async_resource>, ""); static_assert(cuda::mr::async_resource>, ""); +// test for cccl#2214: https://github.com/NVIDIA/cccl/issues/2214 +struct 
derived_pinned_resource : cuda::mr::cuda_pinned_memory_resource +{ + using cuda::mr::cuda_pinned_memory_resource::cuda_pinned_memory_resource; +}; +static_assert(cuda::mr::resource, ""); + void test() { cuda::mr::cuda_pinned_memory_resource first{}; From aaf134000d9d368b134dc27f1e881ff36a694fa1 Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Tue, 13 Aug 2024 03:41:26 -0700 Subject: [PATCH 17/33] fix `cuda_memory_resource` test for properly aligned memory (#2227) --- .../memory_resource/cuda_memory_resource/allocate.pass.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/allocate.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/allocate.pass.cpp index 2c88483e6c..073de36074 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/allocate.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/allocate.pass.cpp @@ -47,7 +47,8 @@ void test() ensure_device_ptr(ptr); // also check the alignment - const auto alignment = reinterpret_cast(ptr); + const auto address = reinterpret_cast(ptr); + const auto alignment = address & (~address + 1ULL); assert(alignment >= desired_alignment); res.deallocate(ptr, 42, desired_alignment); } From 098fb29af4555e601fe703234dddadcbb52b0713 Mon Sep 17 00:00:00 2001 From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com> Date: Tue, 13 Aug 2024 09:33:10 -0700 Subject: [PATCH 18/33] Fix including `` when bad CUDA bfloat/half macros are used. (#2226) * Add test for bad macros being defined * Fix failing upon inclusion when bad macros are defined * Rather use explicit specializations and some evil hackery to get the complex interop to work * Fix typos * Inline everything * Move workarounds together * Use conversion functions instead of explicit specializations * Drop unneeded conversions --------- Co-authored-by: Michael Schellenberger Costa --- .../include/cuda/std/__complex/nvbf16.h | 87 ++++++++++++++++--- .../include/cuda/std/__complex/nvfp16.h | 87 ++++++++++++++++--- .../half_bfloat/complex.bad_macros.pass.cpp | 51 +++++++++++ 3 files changed, 203 insertions(+), 22 deletions(-) create mode 100644 libcudacxx/test/libcudacxx/cuda/complex/half_bfloat/complex.bad_macros.pass.cpp diff --git a/libcudacxx/include/cuda/std/__complex/nvbf16.h b/libcudacxx/include/cuda/std/__complex/nvbf16.h index d90c30e221..612ebba335 100644 --- a/libcudacxx/include/cuda/std/__complex/nvbf16.h +++ b/libcudacxx/include/cuda/std/__complex/nvbf16.h @@ -63,6 +63,39 @@ struct __libcpp_complex_overload_traits<__nv_bfloat16, false, false> typedef complex<__nv_bfloat16> _ComplexType; }; +// This is a workaround against the user defining macros __CUDA_NO_BFLOAT16_CONVERSIONS__ __CUDA_NO_BFLOAT16_OPERATORS__ +template <> +struct __complex_can_implicitly_construct<__nv_bfloat16, float> : true_type +{}; + +template <> +struct __complex_can_implicitly_construct<__nv_bfloat16, double> : true_type +{}; + +template <> +struct __complex_can_implicitly_construct : true_type +{}; + +template <> +struct __complex_can_implicitly_construct : true_type +{}; + +template +inline _LIBCUDACXX_INLINE_VISIBILITY __nv_bfloat16 __convert_to_bfloat16(const _Tp& __value) noexcept +{ + return __value; +} + +inline _LIBCUDACXX_INLINE_VISIBILITY __nv_bfloat16 __convert_to_bfloat16(const float& __value) noexcept +{ + return __float2bfloat16(__value); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY __nv_bfloat16 __convert_to_bfloat16(const double& 
__value) noexcept +{ + return __double2bfloat16(__value); +} + template <> class _LIBCUDACXX_TEMPLATE_VIS _CCCL_ALIGNAS(alignof(__nv_bfloat162)) complex<__nv_bfloat16> { @@ -80,14 +113,14 @@ class _LIBCUDACXX_TEMPLATE_VIS _CCCL_ALIGNAS(alignof(__nv_bfloat162)) complex<__ template ::value, int> = 0> _LIBCUDACXX_INLINE_VISIBILITY complex(const complex<_Up>& __c) - : __repr_(static_cast(__c.real()), static_cast(__c.imag())) + : __repr_(__convert_to_bfloat16(__c.real()), __convert_to_bfloat16(__c.imag())) {} template ::value, int> = 0, __enable_if_t<_CCCL_TRAIT(is_constructible, value_type, _Up), int> = 0> _LIBCUDACXX_INLINE_VISIBILITY explicit complex(const complex<_Up>& __c) - : __repr_(static_cast(__c.real()), static_cast(__c.imag())) + : __repr_(__convert_to_bfloat16(__c.real()), __convert_to_bfloat16(__c.imag())) {} _LIBCUDACXX_INLINE_VISIBILITY complex& operator=(const value_type& __re) @@ -100,8 +133,8 @@ class _LIBCUDACXX_TEMPLATE_VIS _CCCL_ALIGNAS(alignof(__nv_bfloat162)) complex<__ template _LIBCUDACXX_INLINE_VISIBILITY complex& operator=(const complex<_Up>& __c) { - __repr_.x = __c.real(); - __repr_.y = __c.imag(); + __repr_.x = __convert_to_bfloat16(__c.real()); + __repr_.y = __convert_to_bfloat16(__c.imag()); return *this; } @@ -155,24 +188,24 @@ class _LIBCUDACXX_TEMPLATE_VIS _CCCL_ALIGNAS(alignof(__nv_bfloat162)) complex<__ _LIBCUDACXX_INLINE_VISIBILITY complex& operator+=(const value_type& __re) { - __repr_.x += __re; + __repr_.x = __hadd(__repr_.x, __re); return *this; } _LIBCUDACXX_INLINE_VISIBILITY complex& operator-=(const value_type& __re) { - __repr_.x -= __re; + __repr_.x = __hsub(__repr_.x, __re); return *this; } _LIBCUDACXX_INLINE_VISIBILITY complex& operator*=(const value_type& __re) { - __repr_.x *= __re; - __repr_.y *= __re; + __repr_.x = __hmul(__repr_.x, __re); + __repr_.y = __hmul(__repr_.y, __re); return *this; } _LIBCUDACXX_INLINE_VISIBILITY complex& operator/=(const value_type& __re) { - __repr_.x /= __re; - __repr_.y /= __re; + __repr_.x = __hdiv(__repr_.x, __re); + __repr_.y = __hdiv(__repr_.y, __re); return *this; } @@ -195,9 +228,41 @@ class _LIBCUDACXX_TEMPLATE_VIS _CCCL_ALIGNAS(alignof(__nv_bfloat162)) complex<__ } }; +template <> // complex +template <> // complex<__half> +inline _LIBCUDACXX_INLINE_VISIBILITY complex::complex(const complex<__nv_bfloat16>& __c) + : __re_(__bfloat162float(__c.real())) + , __im_(__bfloat162float(__c.imag())) +{} + +template <> // complex +template <> // complex<__half> +inline _LIBCUDACXX_INLINE_VISIBILITY complex::complex(const complex<__nv_bfloat16>& __c) + : __re_(__bfloat162float(__c.real())) + , __im_(__bfloat162float(__c.imag())) +{} + +template <> // complex +template <> // complex<__nv_bfloat16> +inline _LIBCUDACXX_INLINE_VISIBILITY complex& complex::operator=(const complex<__nv_bfloat16>& __c) +{ + __re_ = __bfloat162float(__c.real()); + __im_ = __bfloat162float(__c.imag()); + return *this; +} + +template <> // complex +template <> // complex<__nv_bfloat16> +inline _LIBCUDACXX_INLINE_VISIBILITY complex& complex::operator=(const complex<__nv_bfloat16>& __c) +{ + __re_ = __bfloat162float(__c.real()); + __im_ = __bfloat162float(__c.imag()); + return *this; +} + inline _LIBCUDACXX_INLINE_VISIBILITY __nv_bfloat16 arg(__nv_bfloat16 __re) { - return _CUDA_VSTD::atan2f(__nv_bfloat16(0), __re); + return _CUDA_VSTD::atan2(__int2bfloat16_rn(0), __re); } // We have performance issues with some trigonometric functions with __nv_bfloat16 diff --git a/libcudacxx/include/cuda/std/__complex/nvfp16.h 
b/libcudacxx/include/cuda/std/__complex/nvfp16.h index 7bd0ea0277..b3154a4b23 100644 --- a/libcudacxx/include/cuda/std/__complex/nvfp16.h +++ b/libcudacxx/include/cuda/std/__complex/nvfp16.h @@ -60,6 +60,39 @@ struct __libcpp_complex_overload_traits<__half, false, false> typedef complex<__half> _ComplexType; }; +// This is a workaround against the user defining macros __CUDA_NO_HALF_CONVERSIONS__ __CUDA_NO_HALF_OPERATORS__ +template <> +struct __complex_can_implicitly_construct<__half, float> : true_type +{}; + +template <> +struct __complex_can_implicitly_construct<__half, double> : true_type +{}; + +template <> +struct __complex_can_implicitly_construct : true_type +{}; + +template <> +struct __complex_can_implicitly_construct : true_type +{}; + +template +inline _LIBCUDACXX_INLINE_VISIBILITY __half __convert_to_half(const _Tp& __value) noexcept +{ + return __value; +} + +inline _LIBCUDACXX_INLINE_VISIBILITY __half __convert_to_half(const float& __value) noexcept +{ + return __float2half(__value); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY __half __convert_to_half(const double& __value) noexcept +{ + return __double2half(__value); +} + template <> class _LIBCUDACXX_TEMPLATE_VIS _CCCL_ALIGNAS(alignof(__half2)) complex<__half> { @@ -77,14 +110,14 @@ class _LIBCUDACXX_TEMPLATE_VIS _CCCL_ALIGNAS(alignof(__half2)) complex<__half> template ::value, int> = 0> _LIBCUDACXX_INLINE_VISIBILITY complex(const complex<_Up>& __c) - : __repr_(static_cast(__c.real()), static_cast(__c.imag())) + : __repr_(__convert_to_half(__c.real()), __convert_to_half(__c.imag())) {} template ::value, int> = 0, __enable_if_t<_CCCL_TRAIT(is_constructible, value_type, _Up), int> = 0> _LIBCUDACXX_INLINE_VISIBILITY explicit complex(const complex<_Up>& __c) - : __repr_(static_cast(__c.real()), static_cast(__c.imag())) + : __repr_(__convert_to_half(__c.real()), __convert_to_half(__c.imag())) {} _LIBCUDACXX_INLINE_VISIBILITY complex& operator=(const value_type& __re) @@ -97,8 +130,8 @@ class _LIBCUDACXX_TEMPLATE_VIS _CCCL_ALIGNAS(alignof(__half2)) complex<__half> template _LIBCUDACXX_INLINE_VISIBILITY complex& operator=(const complex<_Up>& __c) { - __repr_.x = __c.real(); - __repr_.y = __c.imag(); + __repr_.x = __convert_to_half(__c.real()); + __repr_.y = __convert_to_half(__c.imag()); return *this; } @@ -152,24 +185,24 @@ class _LIBCUDACXX_TEMPLATE_VIS _CCCL_ALIGNAS(alignof(__half2)) complex<__half> _LIBCUDACXX_INLINE_VISIBILITY complex& operator+=(const value_type& __re) { - __repr_.x += __re; + __repr_.x = __hadd(__repr_.x, __re); return *this; } _LIBCUDACXX_INLINE_VISIBILITY complex& operator-=(const value_type& __re) { - __repr_.x -= __re; + __repr_.x = __hsub(__repr_.x, __re); return *this; } _LIBCUDACXX_INLINE_VISIBILITY complex& operator*=(const value_type& __re) { - __repr_.x *= __re; - __repr_.y *= __re; + __repr_.x = __hmul(__repr_.x, __re); + __repr_.y = __hmul(__repr_.y, __re); return *this; } _LIBCUDACXX_INLINE_VISIBILITY complex& operator/=(const value_type& __re) { - __repr_.x /= __re; - __repr_.y /= __re; + __repr_.x = __hdiv(__repr_.x, __re); + __repr_.y = __hdiv(__repr_.y, __re); return *this; } @@ -192,9 +225,41 @@ class _LIBCUDACXX_TEMPLATE_VIS _CCCL_ALIGNAS(alignof(__half2)) complex<__half> } }; +template <> // complex +template <> // complex<__half> +inline _LIBCUDACXX_INLINE_VISIBILITY complex::complex(const complex<__half>& __c) + : __re_(__half2float(__c.real())) + , __im_(__half2float(__c.imag())) +{} + +template <> // complex +template <> // complex<__half> +inline _LIBCUDACXX_INLINE_VISIBILITY 
complex::complex(const complex<__half>& __c) + : __re_(__half2float(__c.real())) + , __im_(__half2float(__c.imag())) +{} + +template <> // complex +template <> // complex<__half> +inline _LIBCUDACXX_INLINE_VISIBILITY complex& complex::operator=(const complex<__half>& __c) +{ + __re_ = __half2float(__c.real()); + __im_ = __half2float(__c.imag()); + return *this; +} + +template <> // complex +template <> // complex<__half> +inline _LIBCUDACXX_INLINE_VISIBILITY complex& complex::operator=(const complex<__half>& __c) +{ + __re_ = __half2float(__c.real()); + __im_ = __half2float(__c.imag()); + return *this; +} + inline _LIBCUDACXX_INLINE_VISIBILITY __half arg(__half __re) { - return _CUDA_VSTD::atan2f(__half(0), __re); + return _CUDA_VSTD::atan2(__int2half_rn(0), __re); } // We have performance issues with some trigonometric functions with __half diff --git a/libcudacxx/test/libcudacxx/cuda/complex/half_bfloat/complex.bad_macros.pass.cpp b/libcudacxx/test/libcudacxx/cuda/complex/half_bfloat/complex.bad_macros.pass.cpp new file mode 100644 index 0000000000..0bd9da2fad --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/complex/half_bfloat/complex.bad_macros.pass.cpp @@ -0,0 +1,51 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#define __CUDA_NO_HALF_CONVERSIONS__ 1 +#define __CUDA_NO_HALF_OPERATORS__ 1 +#define __CUDA_NO_BFLOAT16_CONVERSIONS__ 1 +#define __CUDA_NO_BFLOAT16_OPERATORS__ 1 +#define __CUDA_NO_HALF2_OPERATORS__ 1 +#define __CUDA_NO_BFLOAT162_OPERATORS__ 1 + +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ void test_assignment(cuda::std::complex v = {}) +{ + cuda::std::complex converting(v); + + cuda::std::complex assigning{}; + assigning = v; +} + +__host__ __device__ void test() +{ +#ifdef _LIBCUDACXX_HAS_NVFP16 + test_assignment<__half, float>(); + test_assignment<__half, double>(); + test_assignment(); + test_assignment(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + test_assignment<__nv_bfloat16, float>(); + test_assignment<__nv_bfloat16, double>(); + test_assignment(); + test_assignment(); +#endif // _LIBCUDACXX_HAS_NVBF16 +} + +int main(int arg, char** argv) +{ + test(); + return 0; +} From d7c83fe654dd0e879b043c046ebe451614eb44eb Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 14 Aug 2024 00:49:20 +0800 Subject: [PATCH 19/33] add license & fix long_description (#2211) --- python/cuda/README.md | 8 ++++++++ python/cuda/setup.py | 10 +++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/python/cuda/README.md b/python/cuda/README.md index 58e2f908bb..e57f06e6b4 100644 --- a/python/cuda/README.md +++ b/python/cuda/README.md @@ -1,3 +1,11 @@ +# `cuda.cooperative`: Experimental CUDA Core Compute Library for Python + +## Documentation + +Please visit the documentation here: https://nvidia.github.io/cccl/python.html. 
+ +## Local development + ```bash pip3 install -e .[test] pytest -v ./tests/device/ diff --git a/python/cuda/setup.py b/python/cuda/setup.py index e76d585162..f7eff80bda 100644 --- a/python/cuda/setup.py +++ b/python/cuda/setup.py @@ -25,6 +25,10 @@ del __version__ +with open("README.md") as f: + long_description = f.read() + + class CustomBuildCommand(build_py): def run(self): self.run_command('package_cccl') @@ -62,6 +66,8 @@ def run(self): name="cuda-cooperative", version=ver, description="Experimental Core Library for CUDA Python", + long_description=long_description, + long_description_content_type="text/markdown", author="NVIDIA Corporation", classifiers=[ "Programming Language :: Python :: 3 :: Only", @@ -85,5 +91,7 @@ def run(self): 'build_py': CustomBuildCommand, 'bdist_wheel': CustomWheelBuild, }, - include_package_data=True + include_package_data=True, + license="Apache-2.0 with LLVM exception", + license_files = ('../../LICENSE',), ) From 64d28d1c3caaf22bd5a044db9317b2c8c6c70d7a Mon Sep 17 00:00:00 2001 From: Georgii Evtushenko Date: Tue, 13 Aug 2024 17:02:57 -0700 Subject: [PATCH 20/33] Extract reduction kernels into NVRTC-compilable header (#2231) --- cub/cub/agent/agent_reduce.cuh | 8 +- cub/cub/device/dispatch/dispatch_reduce.cuh | 228 +---------------- cub/cub/device/dispatch/kernels/reduce.cuh | 268 ++++++++++++++++++++ cub/test/catch2_test_nvrtc.cu | 1 + 4 files changed, 273 insertions(+), 232 deletions(-) create mode 100644 cub/cub/device/dispatch/kernels/reduce.cuh diff --git a/cub/cub/agent/agent_reduce.cuh b/cub/cub/agent/agent_reduce.cuh index 3492bd5f41..94b90774e5 100644 --- a/cub/cub/agent/agent_reduce.cuh +++ b/cub/cub/agent/agent_reduce.cuh @@ -53,8 +53,6 @@ #include -#include - _CCCL_SUPPRESS_DEPRECATED_PUSH #include _CCCL_SUPPRESS_DEPRECATED_POP @@ -147,7 +145,7 @@ struct AgentReduce // Wrap the native input pointer with CacheModifiedInputIterator // or directly use the supplied input iterator type using WrappedInputIteratorT = - ::cuda::std::_If::value, + ::cuda::std::_If<::cuda::std::is_pointer::value, CacheModifiedInputIterator, InputIteratorT>; @@ -160,8 +158,8 @@ struct AgentReduce // Can vectorize according to the policy if the input iterator is a native // pointer to a primitive type static constexpr bool ATTEMPT_VECTORIZATION = - (VECTOR_LOAD_LENGTH > 1) && (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) && (std::is_pointer::value) - && Traits::PRIMITIVE; + (VECTOR_LOAD_LENGTH > 1) && (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) + && (::cuda::std::is_pointer::value) && Traits::PRIMITIVE; static constexpr CacheLoadModifier LOAD_MODIFIER = AgentReducePolicy::LOAD_MODIFIER; diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index 3b3c0c903e..e3e3844a3f 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -45,6 +45,7 @@ #endif // no system header #include +#include #include #include #include @@ -66,233 +67,6 @@ _CCCL_SUPPRESS_DEPRECATED_POP CUB_NAMESPACE_BEGIN -namespace detail -{ -namespace reduce -{ - -/** - * All cub::DeviceReduce::* algorithms are using the same implementation. Some of them, however, - * should use initial value only for empty problems. If this struct is used as initial value with - * one of the `DeviceReduce` algorithms, the `init` value wrapped by this struct will only be used - * for empty problems; it will not be incorporated into the aggregate of non-empty problems. 
- */ -template -struct empty_problem_init_t -{ - T init; - - _CCCL_HOST_DEVICE operator T() const - { - return init; - } -}; - -/** - * @brief Applies initial value to the block aggregate and stores the result to the output iterator. - * - * @param d_out Iterator to the output aggregate - * @param reduction_op Binary reduction functor - * @param init Initial value - * @param block_aggregate Aggregate value computed by the block - */ -template -_CCCL_HOST_DEVICE void -finalize_and_store_aggregate(OutputIteratorT d_out, ReductionOpT reduction_op, InitT init, AccumT block_aggregate) -{ - *d_out = reduction_op(init, block_aggregate); -} - -/** - * @brief Ignores initial value and stores the block aggregate to the output iterator. - * - * @param d_out Iterator to the output aggregate - * @param block_aggregate Aggregate value computed by the block - */ -template -_CCCL_HOST_DEVICE void -finalize_and_store_aggregate(OutputIteratorT d_out, ReductionOpT, empty_problem_init_t, AccumT block_aggregate) -{ - *d_out = block_aggregate; -} -} // namespace reduce -} // namespace detail - -/****************************************************************************** - * Kernel entry points - *****************************************************************************/ - -/** - * @brief Reduce region kernel entry point (multi-block). Computes privatized - * reductions, one per thread block. - * - * @tparam ChainedPolicyT - * Chained tuning policy - * - * @tparam InputIteratorT - * Random-access input iterator type for reading input items @iterator - * - * @tparam OffsetT - * Signed integer type for global offsets - * - * @tparam ReductionOpT - * Binary reduction functor type having member - * `auto operator()(const T &a, const U &b)` - * - * @tparam InitT - * Initial value type - * - * @tparam AccumT - * Accumulator type - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output aggregate - * - * @param[in] num_items - * Total number of input data items - * - * @param[in] even_share - * Even-share descriptor for mapping an equal number of tiles onto each - * thread block - * - * @param[in] reduction_op - * Binary reduction functor - */ -template -CUB_DETAIL_KERNEL_ATTRIBUTES -__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) void DeviceReduceKernel( - InputIteratorT d_in, - AccumT* d_out, - OffsetT num_items, - GridEvenShare even_share, - ReductionOpT reduction_op, - TransformOpT transform_op) -{ - // Thread block type for reducing input tiles - using AgentReduceT = - AgentReduce; - - // Shared memory storage - __shared__ typename AgentReduceT::TempStorage temp_storage; - - // Consume input tiles - AccumT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op, transform_op).ConsumeTiles(even_share); - - // Output result - if (threadIdx.x == 0) - { - detail::uninitialized_copy_single(d_out + blockIdx.x, block_aggregate); - } -} - -/** - * @brief Reduce a single tile kernel entry point (single-block). Can be used - * to aggregate privatized thread block reductions from a previous - * multi-block reduction pass. 
- * - * @tparam ChainedPolicyT - * Chained tuning policy - * - * @tparam InputIteratorT - * Random-access input iterator type for reading input items @iterator - * - * @tparam OutputIteratorT - * Output iterator type for recording the reduced aggregate @iterator - * - * @tparam OffsetT - * Signed integer type for global offsets - * - * @tparam ReductionOpT - * Binary reduction functor type having member - * `T operator()(const T &a, const U &b)` - * - * @tparam InitT - * Initial value type - * - * @tparam AccumT - * Accumulator type - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output aggregate - * - * @param[in] num_items - * Total number of input data items - * - * @param[in] reduction_op - * Binary reduction functor - * - * @param[in] init - * The initial value of the reduction - */ -template -CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__( - int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), - 1) void DeviceReduceSingleTileKernel(InputIteratorT d_in, - OutputIteratorT d_out, - OffsetT num_items, - ReductionOpT reduction_op, - InitT init, - TransformOpT transform_op) -{ - // Thread block type for reducing input tiles - using AgentReduceT = - AgentReduce; - - // Shared memory storage - __shared__ typename AgentReduceT::TempStorage temp_storage; - - // Check if empty problem - if (num_items == 0) - { - if (threadIdx.x == 0) - { - *d_out = init; - } - - return; - } - - // Consume input tiles - AccumT block_aggregate = - AgentReduceT(temp_storage, d_in, reduction_op, transform_op).ConsumeRange(OffsetT(0), num_items); - - // Output result - if (threadIdx.x == 0) - { - detail::reduce::finalize_and_store_aggregate(d_out, reduction_op, init, block_aggregate); - } -} - /// Normalize input iterator to segment offset template _CCCL_DEVICE _CCCL_FORCEINLINE void NormalizeReductionOutput(T& /*val*/, OffsetT /*base_offset*/, IteratorT /*itr*/) diff --git a/cub/cub/device/dispatch/kernels/reduce.cuh b/cub/cub/device/dispatch/kernels/reduce.cuh new file mode 100644 index 0000000000..174b262c39 --- /dev/null +++ b/cub/cub/device/dispatch/kernels/reduce.cuh @@ -0,0 +1,268 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +CUB_NAMESPACE_BEGIN + +namespace detail +{ +namespace reduce +{ + +/** + * All cub::DeviceReduce::* algorithms are using the same implementation. Some of them, however, + * should use initial value only for empty problems. If this struct is used as initial value with + * one of the `DeviceReduce` algorithms, the `init` value wrapped by this struct will only be used + * for empty problems; it will not be incorporated into the aggregate of non-empty problems. + */ +template +struct empty_problem_init_t +{ + T init; + + _CCCL_HOST_DEVICE operator T() const + { + return init; + } +}; + +/** + * @brief Applies initial value to the block aggregate and stores the result to the output iterator. + * + * @param d_out Iterator to the output aggregate + * @param reduction_op Binary reduction functor + * @param init Initial value + * @param block_aggregate Aggregate value computed by the block + */ +template +_CCCL_HOST_DEVICE void +finalize_and_store_aggregate(OutputIteratorT d_out, ReductionOpT reduction_op, InitT init, AccumT block_aggregate) +{ + *d_out = reduction_op(init, block_aggregate); +} + +/** + * @brief Ignores initial value and stores the block aggregate to the output iterator. + * + * @param d_out Iterator to the output aggregate + * @param block_aggregate Aggregate value computed by the block + */ +template +_CCCL_HOST_DEVICE void +finalize_and_store_aggregate(OutputIteratorT d_out, ReductionOpT, empty_problem_init_t, AccumT block_aggregate) +{ + *d_out = block_aggregate; +} +} // namespace reduce +} // namespace detail + +/** + * @brief Reduce region kernel entry point (multi-block). Computes privatized + * reductions, one per thread block. 
+ * + * @tparam ChainedPolicyT + * Chained tuning policy + * + * @tparam InputIteratorT + * Random-access input iterator type for reading input items @iterator + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @tparam ReductionOpT + * Binary reduction functor type having member + * `auto operator()(const T &a, const U &b)` + * + * @tparam InitT + * Initial value type + * + * @tparam AccumT + * Accumulator type + * + * @param[in] d_in + * Pointer to the input sequence of data items + * + * @param[out] d_out + * Pointer to the output aggregate + * + * @param[in] num_items + * Total number of input data items + * + * @param[in] even_share + * Even-share descriptor for mapping an equal number of tiles onto each + * thread block + * + * @param[in] reduction_op + * Binary reduction functor + */ +template +CUB_DETAIL_KERNEL_ATTRIBUTES +__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) void DeviceReduceKernel( + InputIteratorT d_in, + AccumT* d_out, + OffsetT num_items, + GridEvenShare even_share, + ReductionOpT reduction_op, + TransformOpT transform_op) +{ + // Thread block type for reducing input tiles + using AgentReduceT = + AgentReduce; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + // Consume input tiles + AccumT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op, transform_op).ConsumeTiles(even_share); + + // Output result + if (threadIdx.x == 0) + { + detail::uninitialized_copy_single(d_out + blockIdx.x, block_aggregate); + } +} + +/** + * @brief Reduce a single tile kernel entry point (single-block). Can be used + * to aggregate privatized thread block reductions from a previous + * multi-block reduction pass. + * + * @tparam ChainedPolicyT + * Chained tuning policy + * + * @tparam InputIteratorT + * Random-access input iterator type for reading input items @iterator + * + * @tparam OutputIteratorT + * Output iterator type for recording the reduced aggregate @iterator + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @tparam ReductionOpT + * Binary reduction functor type having member + * `T operator()(const T &a, const U &b)` + * + * @tparam InitT + * Initial value type + * + * @tparam AccumT + * Accumulator type + * + * @param[in] d_in + * Pointer to the input sequence of data items + * + * @param[out] d_out + * Pointer to the output aggregate + * + * @param[in] num_items + * Total number of input data items + * + * @param[in] reduction_op + * Binary reduction functor + * + * @param[in] init + * The initial value of the reduction + */ +template +CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__( + int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), + 1) void DeviceReduceSingleTileKernel(InputIteratorT d_in, + OutputIteratorT d_out, + OffsetT num_items, + ReductionOpT reduction_op, + InitT init, + TransformOpT transform_op) +{ + // Thread block type for reducing input tiles + using AgentReduceT = + AgentReduce; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + // Check if empty problem + if (num_items == 0) + { + if (threadIdx.x == 0) + { + *d_out = init; + } + + return; + } + + // Consume input tiles + AccumT block_aggregate = + AgentReduceT(temp_storage, d_in, reduction_op, transform_op).ConsumeRange(OffsetT(0), num_items); + + // Output result + if (threadIdx.x == 0) + { + detail::reduce::finalize_and_store_aggregate(d_out, reduction_op, init, block_aggregate); + } +} + 
+CUB_NAMESPACE_END diff --git a/cub/test/catch2_test_nvrtc.cu b/cub/test/catch2_test_nvrtc.cu index 0e1b232ff6..466c3fa978 100644 --- a/cub/test/catch2_test_nvrtc.cu +++ b/cub/test/catch2_test_nvrtc.cu @@ -54,6 +54,7 @@ TEST_CASE("Test nvrtc", "[test][nvrtc]") "#include \n" "#include \n" "#include \n" + "#include \n" " \n" "extern \"C\" __global__ void kernel(int *ptr, int *errors) \n" "{ \n" From 6213a5e68a7158799834e70cf4865842107d4e5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Dominiak?= Date: Wed, 14 Aug 2024 09:45:14 +0200 Subject: [PATCH 21/33] Implement `` (#1496) --- .../standard_api/utility_library.rst | 4 + .../standard_api/utility_library/bitset.rst | 14 + .../include/cuda/std/__algorithm/copy.h | 52 +- .../include/cuda/std/__algorithm/copy_n.h | 2 +- .../__bit_reference => __bit/reference.h} | 863 ++++++------ libcudacxx/include/cuda/std/__cccl/compiler.h | 3 + .../include/cuda/std/__cccl/diagnostic.h | 20 +- libcudacxx/include/cuda/std/bitset | 1071 ++++++++++++++ .../cuda/std/detail/libcxx/include/__string | 1246 +++++++++++++++++ .../cuda/std/detail/libcxx/include/bitset | 1027 -------------- .../cuda/std/detail/libcxx/include/cstddef | 2 +- .../bitset.cons/char_ptr_ctor.pass.cpp | 159 +++ .../bitset.cons/default.pass.cpp | 64 + .../bitset.cons/string_ctor.pass.cpp | 196 +++ .../bitset.cons/string_view_ctor.pass.cpp | 201 +++ .../bitset.cons/ull_ctor.pass.cpp | 72 + .../bitset.members/all.pass.cpp | 55 + .../bitset.members/any.pass.cpp | 58 + .../bitset.members/count.pass.cpp | 69 + .../bitset.members/flip_all.pass.cpp | 65 + .../flip_one.out_of_range.pass.cpp | 52 + .../bitset.members/flip_one.pass.cpp | 65 + .../bitset.members/index.pass.cpp | 81 ++ .../bitset.members/index_const.pass.cpp | 72 + .../bitset.members/left_shift.pass.cpp | 71 + .../bitset.members/left_shift_eq.pass.cpp | 89 ++ .../bitset.members/none.pass.cpp | 58 + .../bitset.members/not_all.pass.cpp | 64 + .../bitset.members/op_and_eq.pass.cpp | 71 + .../bitset.members/op_eq_eq.pass.cpp | 62 + .../bitset.members/op_or_eq.pass.cpp | 78 ++ .../bitset.members/op_xor_eq.pass.cpp | 77 + .../bitset.members/reset_all.pass.cpp | 59 + .../reset_one.out_of_range.pass.cpp | 52 + .../bitset.members/reset_one.pass.cpp | 75 + .../bitset.members/right_shift.pass.cpp | 69 + .../bitset.members/right_shift_eq.pass.cpp | 91 ++ .../bitset.members/set_all.pass.cpp | 58 + .../set_one.out_of_range.pass.cpp | 52 + .../bitset.members/set_one.pass.cpp | 60 + .../bitset.members/size.pass.cpp | 46 + .../bitset.members/test.out_of_range.pass.cpp | 52 + .../bitset.members/test.pass.cpp | 60 + .../bitset.members/to_string.pass.cpp | 185 +++ .../bitset.members/to_ullong.pass.cpp | 75 + .../bitset.members/to_ulong.pass.cpp | 74 + .../bitset.operators/op_and.pass.cpp | 60 + .../bitset.operators/op_not.pass.cpp | 60 + .../bitset.operators/op_or.pass.cpp | 60 + .../bitset.operators/stream_in.pass.cpp | 100 ++ .../bitset.operators/stream_out.pass.cpp | 42 + .../template.bitset/bitset_test_cases.h | 163 +++ .../template.bitset/includes.pass.cpp | 35 + libcudacxx/test/support/test_macros.h | 1 + 54 files changed, 6112 insertions(+), 1500 deletions(-) create mode 100644 docs/libcudacxx/standard_api/utility_library/bitset.rst rename libcudacxx/include/cuda/std/{detail/libcxx/include/__bit_reference => __bit/reference.h} (56%) create mode 100644 libcudacxx/include/cuda/std/bitset create mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/__string delete mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/bitset create 
mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/char_ptr_ctor.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/default.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/string_ctor.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/string_view_ctor.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/ull_ctor.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/all.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/any.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/count.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/flip_all.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/flip_one.out_of_range.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/flip_one.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/index.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/index_const.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/left_shift.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/left_shift_eq.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/none.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/not_all.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_and_eq.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_eq_eq.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_or_eq.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_xor_eq.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/reset_all.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/reset_one.out_of_range.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/reset_one.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/right_shift.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/right_shift_eq.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/set_all.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/set_one.out_of_range.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/set_one.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/size.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/test.out_of_range.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/test.pass.cpp create mode 100644 
libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_string.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ullong.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ulong.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/op_and.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/op_not.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/op_or.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/stream_in.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/stream_out.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset_test_cases.h create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/includes.pass.cpp diff --git a/docs/libcudacxx/standard_api/utility_library.rst b/docs/libcudacxx/standard_api/utility_library.rst index 12582dc146..4df28701a3 100644 --- a/docs/libcudacxx/standard_api/utility_library.rst +++ b/docs/libcudacxx/standard_api/utility_library.rst @@ -7,6 +7,7 @@ Utility Library :hidden: :maxdepth: 1 + utility_library/bitset utility_library/expected utility_library/functional utility_library/optional @@ -26,6 +27,9 @@ the information about the individual features for details. * - Header - Content - Availability + * - :ref:`libcudacxx-standard-api-utility-bitset` + - Fixed-size sequence of bits + - CCCL 2.8.0 * - :ref:`libcudacxx-standard-api-utility-expected` - Optional value with error channel - CCCL 2.3.0 / CUDA 12.4 diff --git a/docs/libcudacxx/standard_api/utility_library/bitset.rst b/docs/libcudacxx/standard_api/utility_library/bitset.rst new file mode 100644 index 0000000000..a621cb01ab --- /dev/null +++ b/docs/libcudacxx/standard_api/utility_library/bitset.rst @@ -0,0 +1,14 @@ +.. _libcudacxx-standard-api-utility-bitset: + +```` +====================== + +Extensions +---------- + +- All features of ```` are made constexpr in C++14 onwards + +Restrictions +------------ + +- On device no exceptions are thrown in case of a bad access. diff --git a/libcudacxx/include/cuda/std/__algorithm/copy.h b/libcudacxx/include/cuda/std/__algorithm/copy.h index 03e10fe98c..883cbc4632 100644 --- a/libcudacxx/include/cuda/std/__algorithm/copy.h +++ b/libcudacxx/include/cuda/std/__algorithm/copy.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -48,6 +49,13 @@ template inline _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool __dispatch_memmove(_Up* __result, _Tp* __first, const size_t __n) { + // This is a pessimisation, but there's no way to do the code path detection correctly before GCC 9.0. + // __builtin_memmove is also illegal in constexpr there, so... just always assume we are constant evaluated, + // and let the optimizer *maybe* recover some of the perf. 
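+  // Returning false here simply routes GCC-pre-9 builds onto the element-wise copy loops in the
+  // caller, which are valid in constexpr; only the memmove fast path is lost.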
+#if defined(_CCCL_COMPILER_GCC) && _GNUC_VER < 900 + return false; +#endif + if (__libcpp_is_constant_evaluated()) { return false; @@ -66,6 +74,35 @@ __dispatch_memmove(_Up* __result, _Tp* __first, const size_t __n) } } +template +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool +__constexpr_tail_overlap_fallback(_Tp* __first, _Up* __needle, _Tp* __last) +{ + while (__first != __last) + { + if (__first == __needle) + { + return true; + } + ++__first; + } + return false; +} + +template +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool +__constexpr_tail_overlap(_Tp* __first, _Up* __needle, _Tp* __last) +{ + _LIBCUDACXX_UNUSED_VAR(__last); +#if __has_builtin(__builtin_constant_p) || defined(_CCCL_COMPILER_GCC) + NV_IF_ELSE_TARGET(NV_IS_HOST, + (return __builtin_constant_p(__first < __needle) && __first < __needle;), + (return __constexpr_tail_overlap_fallback(__first, __needle, __last);)) +#else + return __constexpr_tail_overlap_fallback(__first, __needle, __last); +#endif +} + template 0; --__i) + { + *(__result + __i - 1) = *(__first + __i - 1); + } + } + else { - *(__result + __i) = *(__first + __i); + for (ptrdiff_t __i = 0; __i < __n; ++__i) + { + *(__result + __i) = *(__first + __i); + } } } return {__last, __result + __n}; diff --git a/libcudacxx/include/cuda/std/__algorithm/copy_n.h b/libcudacxx/include/cuda/std/__algorithm/copy_n.h index a6c62e920d..eb9e28873d 100644 --- a/libcudacxx/include/cuda/std/__algorithm/copy_n.h +++ b/libcudacxx/include/cuda/std/__algorithm/copy_n.h @@ -55,7 +55,7 @@ template ::value, int> = 0> -inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX20 _OutputIterator +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 _OutputIterator copy_n(_InputIterator __first, _Size __orig_n, _OutputIterator __result) { using _IntegralSize = decltype(__convert_to_integral(__orig_n)); diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__bit_reference b/libcudacxx/include/cuda/std/__bit/reference.h similarity index 56% rename from libcudacxx/include/cuda/std/detail/libcxx/include/__bit_reference rename to libcudacxx/include/cuda/std/__bit/reference.h index 9c64111069..29482f7e25 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__bit_reference +++ b/libcudacxx/include/cuda/std/__bit/reference.h @@ -10,9 +10,7 @@ #ifndef _LIBCUDACXX___BIT_REFERENCE #define _LIBCUDACXX___BIT_REFERENCE -##include -#include -#include +#include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header @@ -22,27 +20,36 @@ # pragma system_header #endif // no system header - _LIBCUDACXX_PUSH_MACROS -#include <__undef_macros> +#include +#include +#include +// TODO: modularize bit a bit +#include +// #include +// #include +// #include +#include +#include +#include +#include +#include +#include + +_CCCL_PUSH_MACROS + +_LIBCUDACXX_BEGIN_NAMESPACE_STD - _LIBCUDACXX_BEGIN_NAMESPACE_STD - -template -class __bit_iterator; template class __bit_const_reference; -template -struct __has_storage_type -{ - static const bool value = false; -}; +template +class __bit_iterator; -template ::value> +template class __bit_reference { - typedef typename _Cp::__storage_type __storage_type; - typedef typename _Cp::__storage_pointer __storage_pointer; + using __storage_type = typename _Cp::__storage_type; + using __storage_pointer = typename _Cp::__storage_pointer; __storage_pointer __seg_; __storage_type __mask_; @@ -53,18 +60,21 @@ class 
__bit_reference friend class __bit_iterator<_Cp, false>; public: - _LIBCUDACXX_INLINE_VISIBILITY __bit_reference(const __bit_reference&) = default; + using __container = typename _Cp::__self; + + _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __bit_reference(const __bit_reference&) = default; - _LIBCUDACXX_INLINE_VISIBILITY operator bool() const noexcept + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 operator bool() const noexcept { return static_cast(*__seg_ & __mask_); } - _LIBCUDACXX_INLINE_VISIBILITY bool operator~() const noexcept + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool operator~() const noexcept { return !static_cast(*this); } - _LIBCUDACXX_INLINE_VISIBILITY __bit_reference& operator=(bool __x) noexcept + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_reference& + operator=(bool __x) noexcept { if (__x) { @@ -77,68 +87,84 @@ class __bit_reference return *this; } - _LIBCUDACXX_INLINE_VISIBILITY __bit_reference& operator=(const __bit_reference& __x) noexcept +#if _CCCL_STD_VER >= 2023 + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr const __bit_reference& + operator=(bool __x) const noexcept + { + if (__x) + { + *__seg_ |= __mask_; + } + else + { + *__seg_ &= ~__mask_; + } + return *this; + } +#endif // C++23+ + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_reference& + operator=(const __bit_reference& __x) noexcept { return operator=(static_cast(__x)); } - _LIBCUDACXX_INLINE_VISIBILITY void flip() noexcept + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void flip() noexcept { *__seg_ ^= __mask_; } - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> operator&() const noexcept + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_iterator<_Cp, false> + operator&() const noexcept + { + return __bit_iterator<_Cp, false>(__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__mask_))); + } + + friend inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void + swap(__bit_reference<_Cp> __x, __bit_reference<_Cp> __y) noexcept + { + bool __t = __x; + __x = __y; + __y = __t; + } + + template + friend inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void + swap(__bit_reference<_Cp> __x, __bit_reference<_Dp> __y) noexcept + { + bool __t = __x; + __x = __y; + __y = __t; + } + + friend inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void + swap(__bit_reference<_Cp> __x, bool& __y) noexcept + { + bool __t = __x; + __x = __y; + __y = __t; + } + + friend inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void + swap(bool& __x, __bit_reference<_Cp> __y) noexcept { - return __bit_iterator<_Cp, false>(__seg_, static_cast(__libcpp_ctz(__mask_))); + bool __t = __x; + __x = __y; + __y = __t; } private: - _LIBCUDACXX_INLINE_VISIBILITY __bit_reference(__storage_pointer __s, __storage_type __m) noexcept + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY + _CCCL_CONSTEXPR_CXX14 explicit __bit_reference(__storage_pointer __s, __storage_type __m) noexcept : __seg_(__s) , __mask_(__m) {} }; -template -class __bit_reference<_Cp, false> -{}; - -template -inline _LIBCUDACXX_INLINE_VISIBILITY void swap(__bit_reference<_Cp> __x, __bit_reference<_Cp> __y) noexcept -{ - bool __t = __x; - __x = __y; - __y = __t; -} - -template -inline 
_LIBCUDACXX_INLINE_VISIBILITY void swap(__bit_reference<_Cp> __x, __bit_reference<_Dp> __y) noexcept -{ - bool __t = __x; - __x = __y; - __y = __t; -} - -template -inline _LIBCUDACXX_INLINE_VISIBILITY void swap(__bit_reference<_Cp> __x, bool& __y) noexcept -{ - bool __t = __x; - __x = __y; - __y = __t; -} - -template -inline _LIBCUDACXX_INLINE_VISIBILITY void swap(bool& __x, __bit_reference<_Cp> __y) noexcept -{ - bool __t = __x; - __x = __y; - __y = __t; -} - template class __bit_const_reference { - typedef typename _Cp::__storage_type __storage_type; - typedef typename _Cp::__const_storage_pointer __storage_pointer; + using __storage_type = typename _Cp::__storage_type; + using __storage_pointer = typename _Cp::__const_storage_pointer; __storage_pointer __seg_; __storage_type __mask_; @@ -147,25 +173,30 @@ class __bit_const_reference friend class __bit_iterator<_Cp, true>; public: - _LIBCUDACXX_INLINE_VISIBILITY __bit_const_reference(const __bit_const_reference&) = default; + using __container = typename _Cp::__self; + + _LIBCUDACXX_HIDE_FROM_ABI __bit_const_reference(const __bit_const_reference&) = default; - _LIBCUDACXX_INLINE_VISIBILITY __bit_const_reference(const __bit_reference<_Cp>& __x) noexcept + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 + __bit_const_reference(const __bit_reference<_Cp>& __x) noexcept : __seg_(__x.__seg_) , __mask_(__x.__mask_) {} - _LIBCUDACXX_INLINE_VISIBILITY constexpr operator bool() const noexcept + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr operator bool() const noexcept { return static_cast(*__seg_ & __mask_); } - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, true> operator&() const noexcept + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_iterator<_Cp, true> + operator&() const noexcept { - return __bit_iterator<_Cp, true>(__seg_, static_cast(__libcpp_ctz(__mask_))); + return __bit_iterator<_Cp, true>(__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__mask_))); } private: - _LIBCUDACXX_INLINE_VISIBILITY constexpr __bit_const_reference(__storage_pointer __s, __storage_type __m) noexcept + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr explicit __bit_const_reference( + __storage_pointer __s, __storage_type __m) noexcept : __seg_(__s) , __mask_(__m) {} @@ -173,262 +204,66 @@ class __bit_const_reference __bit_const_reference& operator=(const __bit_const_reference&) = delete; }; -// find +// fill_n -template -__bit_iterator<_Cp, _IsConst> __find_bool_true(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n) +template +inline _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY void +__fill_n_impl(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) { - typedef __bit_iterator<_Cp, _IsConst> _It; - typedef typename _It::__storage_type __storage_type; - static const int __bits_per_word = _It::__bits_per_word; - // do first partial word - if (__first.__ctz_ != 0) - { - __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); - __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - __storage_type __b = *__first.__seg_ & __m; - if (__b) - { - return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); - } - if (__n == __dn) - { - return __first + __n; - } - __n -= __dn; - ++__first.__seg_; - } - // do middle whole words - for (; __n >= __bits_per_word; 
++__first.__seg_, __n -= __bits_per_word) - { - if (*__first.__seg_) - { - return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(*__first.__seg_))); - } - } - // do last partial word - if (__n > 0) - { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __storage_type __b = *__first.__seg_ & __m; - if (__b) - { - return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); - } - } - return _It(__first.__seg_, static_cast(__n)); -} + using _It = __bit_iterator<_Cp, false>; + using __storage_type = typename _It::__storage_type; -template -__bit_iterator<_Cp, _IsConst> __find_bool_false(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n) -{ - typedef __bit_iterator<_Cp, _IsConst> _It; - typedef typename _It::__storage_type __storage_type; const int __bits_per_word = _It::__bits_per_word; // do first partial word if (__first.__ctz_ != 0) { __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); - __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); + __storage_type __dn = (_CUDA_VSTD::min)(__clz_f, static_cast<__storage_type>(__n)); __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - __storage_type __b = ~*__first.__seg_ & __m; - if (__b) + if (_FillVal) { - return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); + *__first.__seg_ |= __m; } - if (__n == __dn) + else { - return __first + __n; + *__first.__seg_ &= ~__m; } - __n -= __dn; + __n -= __dn.__data; ++__first.__seg_; } // do middle whole words - for (; __n >= __bits_per_word; ++__first.__seg_, __n -= __bits_per_word) - { - __storage_type __b = ~*__first.__seg_; - if (__b) - { - return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); - } - } + __storage_type __nw = __n / __bits_per_word; + _CUDA_VSTD::fill_n(_CUDA_VSTD::__to_address(__first.__seg_), __nw, _FillVal ? 
~static_cast<__storage_type>(0) : 0); + __n -= (__nw * __bits_per_word).__data; // do last partial word if (__n > 0) { + __first.__seg_ += __nw.__data; __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __storage_type __b = ~*__first.__seg_ & __m; - if (__b) + if (_FillVal) { - return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); + *__first.__seg_ |= __m; + } + else + { + *__first.__seg_ &= ~__m; } - } - return _It(__first.__seg_, static_cast(__n)); -} - -template -inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, _IsConst> -find(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, const _Tp& __value_) -{ - if (static_cast(__value_)) - { - return __find_bool_true(__first, static_cast(__last - __first)); - } - return __find_bool_false(__first, static_cast(__last - __first)); -} - -// count - -template -typename __bit_iterator<_Cp, _IsConst>::difference_type -__count_bool_true(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n) -{ - typedef __bit_iterator<_Cp, _IsConst> _It; - typedef typename _It::__storage_type __storage_type; - typedef typename _It::difference_type difference_type; - const int __bits_per_word = _It::__bits_per_word; - difference_type __r = 0; - // do first partial word - if (__first.__ctz_ != 0) - { - __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); - __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - __r = _CUDA_VSTD::__libcpp_popcount(*__first.__seg_ & __m); - __n -= __dn; - ++__first.__seg_; - } - // do middle whole words - for (; __n >= __bits_per_word; ++__first.__seg_, __n -= __bits_per_word) - { - __r += _CUDA_VSTD::__libcpp_popcount(*__first.__seg_); - } - // do last partial word - if (__n > 0) - { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __r += _CUDA_VSTD::__libcpp_popcount(*__first.__seg_ & __m); - } - return __r; -} - -template -typename __bit_iterator<_Cp, _IsConst>::difference_type -__count_bool_false(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n) -{ - typedef __bit_iterator<_Cp, _IsConst> _It; - typedef typename _It::__storage_type __storage_type; - typedef typename _It::difference_type difference_type; - const int __bits_per_word = _It::__bits_per_word; - difference_type __r = 0; - // do first partial word - if (__first.__ctz_ != 0) - { - __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); - __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - __r = _CUDA_VSTD::__libcpp_popcount(~*__first.__seg_ & __m); - __n -= __dn; - ++__first.__seg_; - } - // do middle whole words - for (; __n >= __bits_per_word; ++__first.__seg_, __n -= __bits_per_word) - { - __r += _CUDA_VSTD::__libcpp_popcount(~*__first.__seg_); - } - // do last partial word - if (__n > 0) - { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __r += _CUDA_VSTD::__libcpp_popcount(~*__first.__seg_ & __m); - } - return __r; -} - -template -inline _LIBCUDACXX_INLINE_VISIBILITY typename __bit_iterator<_Cp, _IsConst>::difference_type -count(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, const _Tp& __value_) -{ - if (static_cast(__value_)) - { - return __count_bool_true(__first, static_cast(__last - __first)); - } - return 
__count_bool_false(__first, static_cast(__last - __first)); -} - -// fill_n - -template -void __fill_n_false(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) -{ - typedef __bit_iterator<_Cp, false> _It; - typedef typename _It::__storage_type __storage_type; - const int __bits_per_word = _It::__bits_per_word; - // do first partial word - if (__first.__ctz_ != 0) - { - __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); - __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - *__first.__seg_ &= ~__m; - __n -= __dn; - ++__first.__seg_; - } - // do middle whole words - __storage_type __nw = __n / __bits_per_word; - _CUDA_VSTD::memset(_CUDA_VSTD::__to_raw_pointer(__first.__seg_), 0, __nw * sizeof(__storage_type)); - __n -= __nw * __bits_per_word; - // do last partial word - if (__n > 0) - { - __first.__seg_ += __nw; - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - *__first.__seg_ &= ~__m; - } -} - -template -void __fill_n_true(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) -{ - typedef __bit_iterator<_Cp, false> _It; - typedef typename _It::__storage_type __storage_type; - const int __bits_per_word = _It::__bits_per_word; - // do first partial word - if (__first.__ctz_ != 0) - { - __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); - __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - *__first.__seg_ |= __m; - __n -= __dn; - ++__first.__seg_; - } - // do middle whole words - __storage_type __nw = __n / __bits_per_word; - _CUDA_VSTD::memset(_CUDA_VSTD::__to_raw_pointer(__first.__seg_), -1, __nw * sizeof(__storage_type)); - __n -= __nw * __bits_per_word; - // do last partial word - if (__n > 0) - { - __first.__seg_ += __nw; - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - *__first.__seg_ |= __m; } } template -inline _LIBCUDACXX_INLINE_VISIBILITY void -fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n, bool __value_) +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void +fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n, bool __value) { if (__n > 0) { - if (__value_) + if (__value) { - __fill_n_true(__first, __n); + _CUDA_VSTD::__fill_n_impl(__first, __n); } else { - __fill_n_false(__first, __n); + _CUDA_VSTD::__fill_n_impl(__first, __n); } } } @@ -436,21 +271,24 @@ fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n, bool __v // fill template -inline _LIBCUDACXX_INLINE_VISIBILITY void -fill(__bit_iterator<_Cp, false> __first, __bit_iterator<_Cp, false> __last, bool __value_) +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void +fill(__bit_iterator<_Cp, false> __first, __bit_iterator<_Cp, false> __last, bool __value) { - _CUDA_VSTD::fill_n(__first, static_cast(__last - __first), __value_); + _CUDA_VSTD::fill_n(__first, static_cast(__last - __first), __value); } // copy template -__bit_iterator<_Cp, false> __copy_aligned( - __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) +inline _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> +__copy_aligned(__bit_iterator<_Cp, _IsConst> __first, + 
__bit_iterator<_Cp, _IsConst> __last, + __bit_iterator<_Cp, false> __result) { - typedef __bit_iterator<_Cp, _IsConst> _In; - typedef typename _In::difference_type difference_type; - typedef typename _In::__storage_type __storage_type; + using _In = __bit_iterator<_Cp, _IsConst>; + using difference_type = typename _In::difference_type; + using __storage_type = typename _In::__storage_type; + const int __bits_per_word = _In::__bits_per_word; difference_type __n = __last - __first; if (__n > 0) @@ -473,15 +311,13 @@ __bit_iterator<_Cp, false> __copy_aligned( // __first.__ctz_ == 0; // do middle words __storage_type __nw = __n / __bits_per_word; - _CUDA_VSTD::memmove(_CUDA_VSTD::__to_raw_pointer(__result.__seg_), - _CUDA_VSTD::__to_raw_pointer(__first.__seg_), - __nw * sizeof(__storage_type)); - __n -= __nw * __bits_per_word; - __result.__seg_ += __nw; + _CUDA_VSTD::copy_n(_CUDA_VSTD::__to_address(__first.__seg_), __nw.__data, _CUDA_VSTD::__to_address(__result.__seg_)); + __result.__seg_ += __nw.__data; + __n -= (__nw * __bits_per_word).__data; // do last word if (__n > 0) { - __first.__seg_ += __nw; + __first.__seg_ += __nw.__data; __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); __storage_type __b = *__first.__seg_ & __m; *__result.__seg_ &= ~__m; @@ -493,14 +329,16 @@ __bit_iterator<_Cp, false> __copy_aligned( } template -__bit_iterator<_Cp, false> __copy_unaligned( +inline _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> +__copy_unaligned( __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - typedef __bit_iterator<_Cp, _IsConst> _In; - typedef typename _In::difference_type difference_type; - typedef typename _In::__storage_type __storage_type; - static const int __bits_per_word = _In::__bits_per_word; - difference_type __n = __last - __first; + using _In = __bit_iterator<_Cp, _IsConst>; + using difference_type = typename _In::difference_type; + using __storage_type = typename _In::__storage_type; + + const int __bits_per_word = _In::__bits_per_word; + difference_type __n = __last - __first; if (__n > 0) { // do first word @@ -523,9 +361,9 @@ __bit_iterator<_Cp, false> __copy_unaligned( { *__result.__seg_ |= __b >> (__first.__ctz_ - __result.__ctz_); } - __result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word; - __result.__ctz_ = static_cast((__ddn + __result.__ctz_) % __bits_per_word); - __dn -= __ddn; + __result.__seg_ += ((__ddn + __result.__ctz_) / __bits_per_word).__data; + __result.__ctz_ = static_cast(((__ddn + __result.__ctz_) % __bits_per_word).__data); + __dn -= __ddn.__data; if (__dn > 0) { __m = ~__storage_type(0) >> (__bits_per_word - __dn); @@ -558,9 +396,9 @@ __bit_iterator<_Cp, false> __copy_unaligned( __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn)); *__result.__seg_ &= ~__m; *__result.__seg_ |= __b << __result.__ctz_; - __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word; - __result.__ctz_ = static_cast((__dn + __result.__ctz_) % __bits_per_word); - __n -= __dn; + __result.__seg_ += ((__dn + __result.__ctz_) / __bits_per_word).__data; + __result.__ctz_ = static_cast(((__dn + __result.__ctz_) % __bits_per_word).__data); + __n -= __dn.__data; if (__n > 0) { __m = ~__storage_type(0) >> (__bits_per_word - __n); @@ -574,25 +412,27 @@ __bit_iterator<_Cp, false> __copy_unaligned( } template -inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> +inline 
_LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_iterator<_Cp, false> copy(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { if (__first.__ctz_ == __result.__ctz_) { - return __copy_aligned(__first, __last, __result); + return _CUDA_VSTD::__copy_aligned(__first, __last, __result); } - return __copy_unaligned(__first, __last, __result); + return _CUDA_VSTD::__copy_unaligned(__first, __last, __result); } // copy_backward template -__bit_iterator<_Cp, false> __copy_backward_aligned( +inline _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> +__copy_backward_aligned( __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - typedef __bit_iterator<_Cp, _IsConst> _In; - typedef typename _In::difference_type difference_type; - typedef typename _In::__storage_type __storage_type; + using _In = __bit_iterator<_Cp, _IsConst>; + using difference_type = typename _In::difference_type; + using __storage_type = typename _In::__storage_type; + const int __bits_per_word = _In::__bits_per_word; difference_type __n = __last - __first; if (__n > 0) @@ -614,18 +454,44 @@ __bit_iterator<_Cp, false> __copy_backward_aligned( // __result.__ctz_ == 0 || __n == 0 // do middle words __storage_type __nw = __n / __bits_per_word; - __result.__seg_ -= __nw; - __last.__seg_ -= __nw; - _CUDA_VSTD::memmove(_CUDA_VSTD::__to_raw_pointer(__result.__seg_), - _CUDA_VSTD::__to_raw_pointer(__last.__seg_), - __nw * sizeof(__storage_type)); - __n -= __nw * __bits_per_word; + __result.__seg_ -= __nw.__data; + __last.__seg_ -= __nw.__data; + _CUDA_VSTD::copy_n(_CUDA_VSTD::__to_address(__last.__seg_), __nw.__data, _CUDA_VSTD::__to_address(__result.__seg_)); + __n -= (__nw * __bits_per_word).__data; // do last word if (__n > 0) { __storage_type __m = ~__storage_type(0) << (__bits_per_word - __n); - __storage_type __b = *--__last.__seg_ & __m; - *--__result.__seg_ &= ~__m; +#if defined(_CCCL_COMPILER_GCC) && _GNUC_VER < 900 + // workaround for GCC pre-9 being really bad at tracking one-past-the-end pointers at constexpr + // can't check for is-constant-evaluated, because GCC pre-9 also lacks _that_. + if (__last.__seg_ == __first.__seg_ + 1) + { + __last.__seg_ = __first.__seg_; + } + else + { + --__last.__seg_; + } +#else // ^^ GCC < 9 ^^ | vv !GCC || GCC >= 9 vv + --__last.__seg_; +#endif // !GCC || GCC >= 9 + __storage_type __b = *__last.__seg_ & __m; +#if defined(_CCCL_COMPILER_GCC) && _GNUC_VER < 900 + // workaround for GCC pre-9 being really bad at tracking one-past-the-end pointers at constexpr + // can't check for is-constant-evaluated, because GCC pre-9 also lacks _that_. 
+ if (__result.__seg_ == __first.__seg_ + 1) + { + __result.__seg_ = __first.__seg_; + } + else + { + --__result.__seg_; + } +#else // ^^ GCC < 9 ^^ | vv !GCC || GCC >= 9 vv + --__result.__seg_; +#endif // !GCC || GCC >= 9 + *__result.__seg_ &= ~__m; *__result.__seg_ |= __b; __result.__ctz_ = static_cast(-__n & (__bits_per_word - 1)); } @@ -634,12 +500,14 @@ __bit_iterator<_Cp, false> __copy_backward_aligned( } template -__bit_iterator<_Cp, false> __copy_backward_unaligned( +inline _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> +__copy_backward_unaligned( __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - typedef __bit_iterator<_Cp, _IsConst> _In; - typedef typename _In::difference_type difference_type; - typedef typename _In::__storage_type __storage_type; + using _In = __bit_iterator<_Cp, _IsConst>; + using difference_type = typename _In::difference_type; + using __storage_type = typename _In::__storage_type; + const int __bits_per_word = _In::__bits_per_word; difference_type __n = __last - __first; if (__n > 0) @@ -666,17 +534,37 @@ __bit_iterator<_Cp, false> __copy_backward_unaligned( { *__result.__seg_ |= __b >> (__last.__ctz_ - __result.__ctz_); } - __result.__ctz_ = static_cast(((-__ddn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word); - __dn -= __ddn; + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_MSVC(4146) // unary minus applied to unsigned type + __result.__ctz_ = + static_cast((((-__ddn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word).__data); + _CCCL_DIAG_POP + __dn -= __ddn.__data; } if (__dn > 0) { // __result.__ctz_ == 0 +#if defined(_CCCL_COMPILER_GCC) && _GNUC_VER < 900 + // workaround for GCC pre-9 being really bad at tracking one-past-the-end pointers at constexpr + // can't check for is-constant-evaluated, because GCC pre-9 also lacks _that_. 
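+    // When __result.__seg_ is exactly one past __first.__seg_, assigning __first.__seg_ directly
+    // yields the same pointer as --__result.__seg_, while avoiding a decrement of what GCC 8's
+    // constexpr evaluator may be tracking as a one-past-the-end pointer.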
+ if (__result.__seg_ == __first.__seg_ + 1) + { + __result.__seg_ = __first.__seg_; + } + else + { + --__result.__seg_; + } +#else // ^^ GCC < 9 ^^ | vv !GCC || GCC >= 9 vv --__result.__seg_; +#endif // !GCC || GCC >= 9 + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_MSVC(4146) // unary minus applied to unsigned type __result.__ctz_ = static_cast(-__dn & (__bits_per_word - 1)); - __m = ~__storage_type(0) << __result.__ctz_; + _CCCL_DIAG_POP + __m = ~__storage_type(0) << __result.__ctz_; *__result.__seg_ &= ~__m; - __last.__ctz_ -= __dn + __ddn; + __last.__ctz_ -= (__dn + __ddn).__data; *__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_); } // __last.__ctz_ = 0 @@ -704,8 +592,12 @@ __bit_iterator<_Cp, false> __copy_backward_unaligned( __m = (~__storage_type(0) << (__result.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_r); *__result.__seg_ &= ~__m; *__result.__seg_ |= __b >> (__bits_per_word - __result.__ctz_); - __result.__ctz_ = static_cast(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word); - __n -= __dn; + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_MSVC(4146) // unary minus applied to unsigned type + __result.__ctz_ = + static_cast((((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word).__data); + _CCCL_DIAG_POP + __n -= __dn.__data; if (__n > 0) { // __result.__ctz_ == 0 @@ -721,20 +613,22 @@ __bit_iterator<_Cp, false> __copy_backward_unaligned( } template -inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> copy_backward( - __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_iterator<_Cp, false> +copy_backward(__bit_iterator<_Cp, _IsConst> __first, + __bit_iterator<_Cp, _IsConst> __last, + __bit_iterator<_Cp, false> __result) { if (__last.__ctz_ == __result.__ctz_) { - return __copy_backward_aligned(__first, __last, __result); + return _CUDA_VSTD::__copy_backward_aligned(__first, __last, __result); } - return __copy_backward_unaligned(__first, __last, __result); + return _CUDA_VSTD::__copy_backward_unaligned(__first, __last, __result); } // move template -inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> move(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { return _CUDA_VSTD::copy(__first, __last, __result); @@ -743,7 +637,7 @@ move(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last // move_backward template -inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> move_backward( +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> move_backward( __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { return _CUDA_VSTD::copy_backward(__first, __last, __result); @@ -751,13 +645,14 @@ inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> move_backward( // swap_ranges -template -__bit_iterator<__C2, false> __swap_ranges_aligned( - __bit_iterator<__C1, false> __first, __bit_iterator<__C1, false> __last, __bit_iterator<__C2, false> __result) +template +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cr, false> __swap_ranges_aligned( + __bit_iterator<_Cl, false> __first, __bit_iterator<_Cl, false> __last, __bit_iterator<_Cr, false> __result) { - typedef 
__bit_iterator<__C1, false> _I1; - typedef typename _I1::difference_type difference_type; - typedef typename _I1::__storage_type __storage_type; + using _I1 = __bit_iterator<_Cl, false>; + using difference_type = typename _I1::difference_type; + using __storage_type = typename _I1::__storage_type; + const int __bits_per_word = _I1::__bits_per_word; difference_type __n = __last - __first; if (__n > 0) @@ -802,13 +697,14 @@ __bit_iterator<__C2, false> __swap_ranges_aligned( return __result; } -template -__bit_iterator<__C2, false> __swap_ranges_unaligned( - __bit_iterator<__C1, false> __first, __bit_iterator<__C1, false> __last, __bit_iterator<__C2, false> __result) +template +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cr, false> __swap_ranges_unaligned( + __bit_iterator<_Cl, false> __first, __bit_iterator<_Cl, false> __last, __bit_iterator<_Cr, false> __result) { - typedef __bit_iterator<__C1, false> _I1; - typedef typename _I1::difference_type difference_type; - typedef typename _I1::__storage_type __storage_type; + using _I1 = __bit_iterator<_Cl, false>; + using difference_type = typename _I1::difference_type; + using __storage_type = typename _I1::__storage_type; + const int __bits_per_word = _I1::__bits_per_word; difference_type __n = __last - __first; if (__n > 0) @@ -901,15 +797,15 @@ __bit_iterator<__C2, false> __swap_ranges_unaligned( return __result; } -template -inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<__C2, false> swap_ranges( - __bit_iterator<__C1, false> __first1, __bit_iterator<__C1, false> __last1, __bit_iterator<__C2, false> __first2) +template +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cr, false> swap_ranges( + __bit_iterator<_Cl, false> __first1, __bit_iterator<_Cl, false> __last1, __bit_iterator<_Cr, false> __first2) { if (__first1.__ctz_ == __first2.__ctz_) { - return __swap_ranges_aligned(__first1, __last1, __first2); + return _CUDA_VSTD::__swap_ranges_aligned(__first1, __last1, __first2); } - return __swap_ranges_unaligned(__first1, __last1, __first2); + return _CUDA_VSTD::__swap_ranges_unaligned(__first1, __last1, __first2); } // rotate @@ -917,28 +813,38 @@ inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<__C2, false> swap_ranges( template struct __bit_array { - typedef typename _Cp::difference_type difference_type; - typedef typename _Cp::__storage_type __storage_type; - typedef typename _Cp::__storage_pointer __storage_pointer; - typedef typename _Cp::iterator iterator; + using difference_type = typename _Cp::difference_type; + using __storage_type = typename _Cp::__storage_type; + using __storage_pointer = typename _Cp::__storage_pointer; + using iterator = typename _Cp::iterator; + static const unsigned __bits_per_word = _Cp::__bits_per_word; static const unsigned _Np = 4; difference_type __size_; __storage_type __word_[_Np]; - _LIBCUDACXX_INLINE_VISIBILITY static difference_type capacity() + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 static difference_type capacity() { return static_cast(_Np * __bits_per_word); } - _LIBCUDACXX_INLINE_VISIBILITY explicit __bit_array(difference_type __s) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY + _CCCL_CONSTEXPR_CXX14 explicit __bit_array(difference_type __s) : __size_(__s) - {} - _LIBCUDACXX_INLINE_VISIBILITY iterator begin() + { + if (__libcpp_is_constant_evaluated()) + { + for (size_t __i = 0; __i != __bit_array<_Cp>::_Np; ++__i) + { + _CUDA_VSTD::__construct_at(__word_ + __i, 0); + } + } + 
} + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 iterator begin() { return iterator(pointer_traits<__storage_pointer>::pointer_to(__word_[0]), 0); } - _LIBCUDACXX_INLINE_VISIBILITY iterator end() + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 iterator end() { return iterator(pointer_traits<__storage_pointer>::pointer_to(__word_[0]) + __size_ / __bits_per_word, static_cast(__size_ % __bits_per_word)); @@ -946,11 +852,12 @@ struct __bit_array }; template -__bit_iterator<_Cp, false> +inline _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> rotate(__bit_iterator<_Cp, false> __first, __bit_iterator<_Cp, false> __middle, __bit_iterator<_Cp, false> __last) { - typedef __bit_iterator<_Cp, false> _I1; - typedef typename _I1::difference_type difference_type; + using _I1 = __bit_iterator<_Cp, false>; + using difference_type = typename _I1::difference_type; + difference_type __d1 = __middle - __first; difference_type __d2 = __last - __middle; _I1 __r = __first + __d2; @@ -997,14 +904,15 @@ rotate(__bit_iterator<_Cp, false> __first, __bit_iterator<_Cp, false> __middle, // equal template -bool __equal_unaligned( +inline _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY bool __equal_unaligned( __bit_iterator<_Cp, _IC1> __first1, __bit_iterator<_Cp, _IC1> __last1, __bit_iterator<_Cp, _IC2> __first2) { - typedef __bit_iterator<_Cp, _IC1> _It; - typedef typename _It::difference_type difference_type; - typedef typename _It::__storage_type __storage_type; - static const int __bits_per_word = _It::__bits_per_word; - difference_type __n = __last1 - __first1; + using _It = __bit_iterator<_Cp, _IC1>; + using difference_type = typename _It::difference_type; + using __storage_type = typename _It::__storage_type; + + const int __bits_per_word = _It::__bits_per_word; + difference_type __n = __last1 - __first1; if (__n > 0) { // do first word @@ -1032,9 +940,9 @@ bool __equal_unaligned( return false; } } - __first2.__seg_ += (__ddn + __first2.__ctz_) / __bits_per_word; - __first2.__ctz_ = static_cast((__ddn + __first2.__ctz_) % __bits_per_word); - __dn -= __ddn; + __first2.__seg_ += ((__ddn + __first2.__ctz_) / __bits_per_word).__data; + __first2.__ctz_ = static_cast(((__ddn + __first2.__ctz_) % __bits_per_word).__data); + __dn -= __ddn.__data; if (__dn > 0) { __m = ~__storage_type(0) >> (__bits_per_word - __dn); @@ -1075,9 +983,9 @@ bool __equal_unaligned( { return false; } - __first2.__seg_ += (__dn + __first2.__ctz_) / __bits_per_word; - __first2.__ctz_ = static_cast((__dn + __first2.__ctz_) % __bits_per_word); - __n -= __dn; + __first2.__seg_ += ((__dn + __first2.__ctz_) / __bits_per_word).__data; + __first2.__ctz_ = static_cast(((__dn + __first2.__ctz_) % __bits_per_word).__data); + __n -= __dn.__data; if (__n > 0) { __m = ~__storage_type(0) >> (__bits_per_word - __n); @@ -1092,14 +1000,15 @@ bool __equal_unaligned( } template -bool __equal_aligned( +inline _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY bool __equal_aligned( __bit_iterator<_Cp, _IC1> __first1, __bit_iterator<_Cp, _IC1> __last1, __bit_iterator<_Cp, _IC2> __first2) { - typedef __bit_iterator<_Cp, _IC1> _It; - typedef typename _It::difference_type difference_type; - typedef typename _It::__storage_type __storage_type; - static const int __bits_per_word = _It::__bits_per_word; - difference_type __n = __last1 - __first1; + using _It = __bit_iterator<_Cp, _IC1>; + using 
difference_type = typename _It::difference_type; + using __storage_type = typename _It::__storage_type; + + const int __bits_per_word = _It::__bits_per_word; + difference_type __n = __last1 - __first1; if (__n > 0) { // do first word @@ -1142,56 +1051,57 @@ bool __equal_aligned( } template -inline _LIBCUDACXX_INLINE_VISIBILITY bool +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool equal(__bit_iterator<_Cp, _IC1> __first1, __bit_iterator<_Cp, _IC1> __last1, __bit_iterator<_Cp, _IC2> __first2) { if (__first1.__ctz_ == __first2.__ctz_) { - return __equal_aligned(__first1, __last1, __first2); + return _CUDA_VSTD::__equal_aligned(__first1, __last1, __first2); } - return __equal_unaligned(__first1, __last1, __first2); + return _CUDA_VSTD::__equal_unaligned(__first1, __last1, __first2); } -template +template class __bit_iterator { public: - typedef typename _Cp::difference_type difference_type; - typedef bool value_type; - typedef __bit_iterator pointer; - typedef typename conditional<_IsConst, __bit_const_reference<_Cp>, __bit_reference<_Cp>>::type reference; - typedef random_access_iterator_tag iterator_category; + using difference_type = typename _Cp::difference_type; + using value_type = bool; + using pointer = __bit_iterator; + using reference = __conditional_t<_IsConst, __bit_const_reference<_Cp>, __bit_reference<_Cp>>; + using iterator_category = random_access_iterator_tag; private: - typedef typename _Cp::__storage_type __storage_type; - typedef typename conditional<_IsConst, typename _Cp::__const_storage_pointer, typename _Cp::__storage_pointer>::type - __storage_pointer; + using __storage_type = typename _Cp::__storage_type; + using __storage_pointer = + __conditional_t<_IsConst, typename _Cp::__const_storage_pointer, typename _Cp::__storage_pointer>; + static const unsigned __bits_per_word = _Cp::__bits_per_word; __storage_pointer __seg_; unsigned __ctz_; public: - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator() noexcept -#if _CCCL_STD_VER > 2011 + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_iterator() noexcept : __seg_(nullptr) , __ctz_(0) -#endif {} - // avoid re-declaring a copy constructor for the non-const version. 
- using __type_for_copy_to_const = _If<_IsConst, __bit_iterator<_Cp, false>, struct __private_nat>; - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator(const __type_for_copy_to_const& __it) noexcept + _CCCL_CONSTEXPR_CXX14 __bit_iterator(const __bit_iterator<_Cp, _IsConst>& __it) = default; + + template > + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 + __bit_iterator(const __bit_iterator<_Cp, _OtherIsConst>& __it) noexcept : __seg_(__it.__seg_) , __ctz_(__it.__ctz_) {} - _LIBCUDACXX_INLINE_VISIBILITY reference operator*() const noexcept + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 reference operator*() const noexcept { return reference(__seg_, __storage_type(1) << __ctz_); } - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator& operator++() + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_iterator& operator++() { if (__ctz_ != __bits_per_word - 1) { @@ -1205,14 +1115,14 @@ class __bit_iterator return *this; } - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator operator++(int) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_iterator operator++(int) { __bit_iterator __tmp = *this; ++(*this); return __tmp; } - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator& operator--() + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_iterator& operator--() { if (__ctz_ != 0) { @@ -1226,14 +1136,15 @@ class __bit_iterator return *this; } - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator operator--(int) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_iterator operator--(int) { __bit_iterator __tmp = *this; --(*this); return __tmp; } - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator& operator+=(difference_type __n) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_iterator& + operator+=(difference_type __n) { if (__n >= 0) { @@ -1249,72 +1160,91 @@ class __bit_iterator return *this; } - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator& operator-=(difference_type __n) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_iterator& + operator-=(difference_type __n) { return *this += -__n; } - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator operator+(difference_type __n) const + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_iterator + operator+(difference_type __n) const { __bit_iterator __t(*this); __t += __n; return __t; } - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator operator-(difference_type __n) const + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_iterator + operator-(difference_type __n) const { __bit_iterator __t(*this); __t -= __n; return __t; } - _LIBCUDACXX_INLINE_VISIBILITY friend __bit_iterator operator+(difference_type __n, const __bit_iterator& __it) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 friend __bit_iterator + operator+(difference_type __n, const __bit_iterator& __it) { return __it + __n; } - _LIBCUDACXX_INLINE_VISIBILITY friend difference_type operator-(const __bit_iterator& __x, const __bit_iterator& __y) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 friend difference_type + operator-(const __bit_iterator& __x, const __bit_iterator& __y) { +#if defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 800 && _GNUC_VER < 900 + if (__y.__seg_ && __y.__seg_ != __x.__seg_) + { + return (__x.__seg_ == __y.__seg_ + 
1 ? 1 : __x.__seg_ - __y.__seg_) * __bits_per_word + __x.__ctz_ - __y.__ctz_; + } +#endif // GCC [8, 9) return (__x.__seg_ - __y.__seg_) * __bits_per_word + __x.__ctz_ - __y.__ctz_; } - _LIBCUDACXX_INLINE_VISIBILITY reference operator[](difference_type __n) const + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 reference + operator[](difference_type __n) const { return *(*this + __n); } - _LIBCUDACXX_INLINE_VISIBILITY friend bool operator==(const __bit_iterator& __x, const __bit_iterator& __y) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 friend bool + operator==(const __bit_iterator& __x, const __bit_iterator& __y) { return __x.__seg_ == __y.__seg_ && __x.__ctz_ == __y.__ctz_; } - _LIBCUDACXX_INLINE_VISIBILITY friend bool operator!=(const __bit_iterator& __x, const __bit_iterator& __y) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 friend bool + operator!=(const __bit_iterator& __x, const __bit_iterator& __y) { return !(__x == __y); } - _LIBCUDACXX_INLINE_VISIBILITY friend bool operator<(const __bit_iterator& __x, const __bit_iterator& __y) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 friend bool + operator<(const __bit_iterator& __x, const __bit_iterator& __y) { return __x.__seg_ < __y.__seg_ || (__x.__seg_ == __y.__seg_ && __x.__ctz_ < __y.__ctz_); } - _LIBCUDACXX_INLINE_VISIBILITY friend bool operator>(const __bit_iterator& __x, const __bit_iterator& __y) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 friend bool + operator>(const __bit_iterator& __x, const __bit_iterator& __y) { return __y < __x; } - _LIBCUDACXX_INLINE_VISIBILITY friend bool operator<=(const __bit_iterator& __x, const __bit_iterator& __y) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 friend bool + operator<=(const __bit_iterator& __x, const __bit_iterator& __y) { return !(__y < __x); } - _LIBCUDACXX_INLINE_VISIBILITY friend bool operator>=(const __bit_iterator& __x, const __bit_iterator& __y) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 friend bool + operator>=(const __bit_iterator& __x, const __bit_iterator& __y) { return !(__x < __y); } private: - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator(__storage_pointer __s, unsigned __ctz) noexcept + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY + _CCCL_CONSTEXPR_CXX14 explicit __bit_iterator(__storage_pointer __s, unsigned __ctz) noexcept : __seg_(__s) , __ctz_(__ctz) {} @@ -1326,56 +1256,55 @@ class __bit_iterator friend class __bit_iterator<_Cp, true>; template friend struct __bit_array; - template - friend void __fill_n_false(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n); - template - friend void __fill_n_true(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n); + template + _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY friend void + __fill_n_impl(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n); + template - friend __bit_iterator<_Dp, false> __copy_aligned( + _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY friend __bit_iterator<_Dp, false> __copy_aligned( __bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); template - friend __bit_iterator<_Dp, false> __copy_unaligned( + _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY friend __bit_iterator<_Dp, false> __copy_unaligned( __bit_iterator<_Dp, _IC> __first, 
__bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); template - friend __bit_iterator<_Dp, false> + _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY friend __bit_iterator<_Dp, false> copy(__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); template - friend __bit_iterator<_Dp, false> __copy_backward_aligned( + _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY friend __bit_iterator<_Dp, false> __copy_backward_aligned( __bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); template - friend __bit_iterator<_Dp, false> __copy_backward_unaligned( + _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY friend __bit_iterator<_Dp, false> __copy_backward_unaligned( __bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); template - friend __bit_iterator<_Dp, false> + _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY friend __bit_iterator<_Dp, false> copy_backward(__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); - template - friend __bit_iterator<__C2, false> - __swap_ranges_aligned(__bit_iterator<__C1, false>, __bit_iterator<__C1, false>, __bit_iterator<__C2, false>); - template - friend __bit_iterator<__C2, false> - __swap_ranges_unaligned(__bit_iterator<__C1, false>, __bit_iterator<__C1, false>, __bit_iterator<__C2, false>); - template - friend __bit_iterator<__C2, false> - swap_ranges(__bit_iterator<__C1, false>, __bit_iterator<__C1, false>, __bit_iterator<__C2, false>); + template + _LIBCUDACXX_INLINE_VISIBILITY friend __bit_iterator<_Cr, false> + __swap_ranges_aligned(__bit_iterator<_Cl, false>, __bit_iterator<_Cl, false>, __bit_iterator<_Cr, false>); + template + _LIBCUDACXX_INLINE_VISIBILITY friend __bit_iterator<_Cr, false> + __swap_ranges_unaligned(__bit_iterator<_Cl, false>, __bit_iterator<_Cl, false>, __bit_iterator<_Cr, false>); + template + _LIBCUDACXX_INLINE_VISIBILITY friend __bit_iterator<_Cr, false> + swap_ranges(__bit_iterator<_Cl, false>, __bit_iterator<_Cl, false>, __bit_iterator<_Cr, false>); template - friend __bit_iterator<_Dp, false> + _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY friend __bit_iterator<_Dp, false> rotate(__bit_iterator<_Dp, false>, __bit_iterator<_Dp, false>, __bit_iterator<_Dp, false>); template - friend bool __equal_aligned(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>); + _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY friend bool + __equal_aligned(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>); template - friend bool __equal_unaligned(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>); + _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY friend bool + __equal_unaligned(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>); template - friend bool equal(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>); - template - friend __bit_iterator<_Dp, _IC> __find_bool_true(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); - template - friend __bit_iterator<_Dp, _IC> __find_bool_false(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); - template - friend typename __bit_iterator<_Dp, _IC>::difference_type - __count_bool_true(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); - template - friend typename __bit_iterator<_Dp, _IC>::difference_type - 
__count_bool_false(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); + _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY friend bool + equal(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>); + template + _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY friend __bit_iterator<_Dp, _IC> + __find_bool(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); + template + _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY friend + typename __bit_iterator<_Dp, _IC>::difference_type __count_bool(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); }; _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__cccl/compiler.h b/libcudacxx/include/cuda/std/__cccl/compiler.h index d4d68a86f2..c3b09f1872 100644 --- a/libcudacxx/include/cuda/std/__cccl/compiler.h +++ b/libcudacxx/include/cuda/std/__cccl/compiler.h @@ -78,6 +78,9 @@ #if !defined(_CCCL_CUDA_COMPILER) || (defined(_CCCL_CUDACC) && _CCCL_CUDACC_VER < 1103000) # define _CCCL_CUDACC_BELOW_11_3 #endif // defined(_CCCL_CUDACC) && _CCCL_CUDACC_VER < 1103000 +#if !defined(_CCCL_CUDA_COMPILER) || (defined(_CCCL_CUDACC) && _CCCL_CUDACC_VER < 1104000) +# define _CCCL_CUDACC_BELOW_11_4 +#endif // defined(_CCCL_CUDACC) && _CCCL_CUDACC_VER < 1104000 #if !defined(_CCCL_CUDA_COMPILER) || (defined(_CCCL_CUDACC) && _CCCL_CUDACC_VER < 1108000) # define _CCCL_CUDACC_BELOW_11_8 #endif // defined(_CCCL_CUDACC) && _CCCL_CUDACC_VER < 1108000 diff --git a/libcudacxx/include/cuda/std/__cccl/diagnostic.h b/libcudacxx/include/cuda/std/__cccl/diagnostic.h index 3ffdefd173..64f27049fe 100644 --- a/libcudacxx/include/cuda/std/__cccl/diagnostic.h +++ b/libcudacxx/include/cuda/std/__cccl/diagnostic.h @@ -32,13 +32,23 @@ # define _CCCL_DIAG_SUPPRESS_GCC(str) # define _CCCL_DIAG_SUPPRESS_NVHPC(str) # define _CCCL_DIAG_SUPPRESS_MSVC(str) -#elif defined(_CCCL_COMPILER_GCC) || defined(_CCCL_COMPILER_ICC) +# define _CCCL_DIAG_SUPPRESS_ICC(str) +#elif defined(_CCCL_COMPILER_GCC) +# define _CCCL_DIAG_PUSH _Pragma("GCC diagnostic push") +# define _CCCL_DIAG_POP _Pragma("GCC diagnostic pop") +# define _CCCL_DIAG_SUPPRESS_CLANG(str) +# define _CCCL_DIAG_SUPPRESS_GCC(str) _Pragma(_CCCL_TOSTRING(GCC diagnostic ignored str)) +# define _CCCL_DIAG_SUPPRESS_NVHPC(str) +# define _CCCL_DIAG_SUPPRESS_MSVC(str) +# define _CCCL_DIAG_SUPPRESS_ICC(str) +#elif defined(_CCCL_COMPILER_ICC) # define _CCCL_DIAG_PUSH _Pragma("GCC diagnostic push") # define _CCCL_DIAG_POP _Pragma("GCC diagnostic pop") # define _CCCL_DIAG_SUPPRESS_CLANG(str) # define _CCCL_DIAG_SUPPRESS_GCC(str) _Pragma(_CCCL_TOSTRING(GCC diagnostic ignored str)) # define _CCCL_DIAG_SUPPRESS_NVHPC(str) # define _CCCL_DIAG_SUPPRESS_MSVC(str) +# define _CCCL_DIAG_SUPPRESS_ICC(str) _Pragma(_CCCL_TOSTRING(warning disable str)) #elif defined(_CCCL_COMPILER_NVHPC) # define _CCCL_DIAG_PUSH _Pragma("diagnostic push") # define _CCCL_DIAG_POP _Pragma("diagnostic pop") @@ -46,6 +56,7 @@ # define _CCCL_DIAG_SUPPRESS_GCC(str) # define _CCCL_DIAG_SUPPRESS_NVHPC(str) _Pragma(_CCCL_TOSTRING(diag_suppress str)) # define _CCCL_DIAG_SUPPRESS_MSVC(str) +# define _CCCL_DIAG_SUPPRESS_ICC(str) #elif defined(_CCCL_COMPILER_MSVC) # define _CCCL_DIAG_PUSH __pragma(warning(push)) # define _CCCL_DIAG_POP __pragma(warning(pop)) @@ -53,6 +64,7 @@ # define _CCCL_DIAG_SUPPRESS_GCC(str) # define _CCCL_DIAG_SUPPRESS_NVHPC(str) # define _CCCL_DIAG_SUPPRESS_MSVC(str) __pragma(warning(disable : str)) +# define _CCCL_DIAG_SUPPRESS_ICC(str) #else # define _CCCL_DIAG_PUSH # define _CCCL_DIAG_POP @@ -60,6 +72,7 @@ # define 
_CCCL_DIAG_SUPPRESS_GCC(str)
# define _CCCL_DIAG_SUPPRESS_NVHPC(str)
# define _CCCL_DIAG_SUPPRESS_MSVC(str)
+# define _CCCL_DIAG_SUPPRESS_ICC(str)
#endif
// Convenient shortcuts to silence common warnings
@@ -94,7 +107,10 @@
# if defined(_CCCL_COMPILER_MSVC)
# define _CCCL_NV_DIAG_SUPPRESS(_WARNING) __pragma(_CCCL_TOSTRING(nv_diag_suppress _WARNING))
# define _CCCL_NV_DIAG_DEFAULT(_WARNING) __pragma(_CCCL_TOSTRING(nv_diag_default _WARNING))
-# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv
+# elif defined(_CCCL_COMPILER_ICC) // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv _CCCL_COMPILER_ICC vvv
+# define _CCCL_NV_DIAG_SUPPRESS(_WARNING) _Pragma(_CCCL_TOSTRING(nv_diag_suppress _WARNING))
+# define _CCCL_NV_DIAG_DEFAULT(_WARNING) _Pragma(_CCCL_TOSTRING(nv_diag_default _WARNING))
+# else // ^^^ _CCCL_COMPILER_ICC ^^^ / vvv !_CCCL_COMPILER_{MSVC,ICC} vvv
# define _CCCL_NV_DIAG_SUPPRESS(_WARNING) \
_Pragma(_CCCL_TOSTRING(nv_diagnostic push)) _Pragma(_CCCL_TOSTRING(nv_diag_suppress _WARNING))
# define _CCCL_NV_DIAG_DEFAULT(_WARNING) _Pragma(_CCCL_TOSTRING(nv_diagnostic pop))
diff --git a/libcudacxx/include/cuda/std/bitset b/libcudacxx/include/cuda/std/bitset
new file mode 100644
index 0000000000..78cd67857a
--- /dev/null
+++ b/libcudacxx/include/cuda/std/bitset
@@ -0,0 +1,1071 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD_BITSET
+#define _CUDA_STD_BITSET
+
+#include
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+# pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+# pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+# pragma system_header
+#endif // no system header
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include // all public C++ headers provide the assertion handler
+#include
+#include
+#if defined(_LIBCUDACXX_HAS_STRING_VIEW)
+# include
+#endif // _LIBCUDACXX_HAS_STRING_VIEW
+#include
+
+// standard-mandated includes
+
+// [bitset.syn]
+#include
+#if defined(_LIBCUDACXX_HAS_STRING)
+# include
+#endif // _LIBCUDACXX_HAS_STRING
+
+_CCCL_PUSH_MACROS
+
+_LIBCUDACXX_BEGIN_NAMESPACE_STD
+
+template
+struct __avoid_promotions
+{
+  using __base = __conditional_t<(sizeof(_Int) >= sizeof(int)),
+                                 _Int,
+                                 __conditional_t::value, unsigned int, signed int>>;
+
+  constexpr __avoid_promotions() = default;
+
+  template >
+  _CCCL_HOST_DEVICE constexpr __avoid_promotions(_Tp __i)
+      : __data(static_cast<_Int>(__i))
+  {}
+
+  _CCCL_HOST_DEVICE constexpr explicit operator bool() const
+  {
+    return static_cast(__data);
+  }
+
+  // helper for fill_n
+  _CCCL_HOST_DEVICE constexpr friend _Int __convert_to_integral(__avoid_promotions __self)
+  {
+    return __self.__data;
+  }
+
+#define _DEFINE_UNARY(__op) \
+  _CCCL_HOST_DEVICE constexpr friend __avoid_promotions operator __op(__avoid_promotions __operand) \
+  { \
+    return __avoid_promotions(static_cast<_Int>(__op static_cast<__base>(__operand.__data))); \
+  }
+
+  _DEFINE_UNARY(~)
+  _CCCL_DIAG_PUSH
+  _CCCL_DIAG_SUPPRESS_MSVC(4146) // unary minus applied to an unsigned type
+ _DEFINE_UNARY(-) + _CCCL_DIAG_POP +#undef _DEFINE_UNARY + +#define _DEFINE_SHIFT(__op) \ + template \ + _CCCL_HOST_DEVICE constexpr friend __avoid_promotions operator __op(__avoid_promotions __operand, _Tp __n) \ + { \ + return __avoid_promotions(static_cast<_Int>(static_cast<__base>(__operand.__data) __op static_cast<__base>(__n))); \ + } \ + template \ + _CCCL_HOST_DEVICE constexpr friend __avoid_promotions operator __op( \ + __avoid_promotions __operand, __avoid_promotions<_Tp> __n) \ + { \ + return __avoid_promotions( \ + static_cast<_Int>(static_cast<__base>(__operand.__data) __op static_cast<__base>(__n.__data))); \ + } + + _DEFINE_SHIFT(<<) + _DEFINE_SHIFT(>>) +#undef _DEFINE_SHIFT + +#define _DEFINE_SHIFT_ASSIGNMENT(__op) \ + template \ + _CCCL_HOST_DEVICE _CCCL_CONSTEXPR_CXX14 __avoid_promotions& operator __op##=(_Tp __n) \ + { \ + if (__n >= sizeof(_Int) * CHAR_BIT) \ + { \ + __data = 0; \ + } \ + else \ + { \ + __data = static_cast<_Int>(static_cast<__base>(__data) __op static_cast<__base>(__n)); \ + } \ + return *this; \ + } + + _DEFINE_SHIFT_ASSIGNMENT(<<) + _DEFINE_SHIFT_ASSIGNMENT(>>) +#undef _DEFINE_SHIFT_ASSIGNMENT + +#define _DEFINE_BINARY(__op) \ + _CCCL_HOST_DEVICE constexpr friend __avoid_promotions operator __op( \ + __avoid_promotions __lhs, __avoid_promotions __rhs) \ + { \ + return __avoid_promotions( \ + static_cast<_Int>(static_cast<__base>(__lhs.__data) __op static_cast<__base>(__rhs.__data))); \ + } + + _DEFINE_BINARY(+) + _DEFINE_BINARY(-) + _DEFINE_BINARY(*) + _DEFINE_BINARY(/) + _DEFINE_BINARY(%) + _DEFINE_BINARY(&) + _DEFINE_BINARY(|) + _DEFINE_BINARY(^) +#undef _DEFINE_BINARY + +#define _DEFINE_ASSIGNMENT(__op) \ + _CCCL_HOST_DEVICE _CCCL_CONSTEXPR_CXX14 __avoid_promotions& operator __op##=(__avoid_promotions __rhs) \ + { \ + __data = static_cast<_Int>(static_cast<__base>(__data) __op static_cast<__base>(__rhs.__data)); \ + return *this; \ + } + + _DEFINE_ASSIGNMENT(+) + _DEFINE_ASSIGNMENT(-) + _DEFINE_ASSIGNMENT(*) + _DEFINE_ASSIGNMENT(/) + _DEFINE_ASSIGNMENT(%) + _DEFINE_ASSIGNMENT(&) + _DEFINE_ASSIGNMENT(|) + _DEFINE_ASSIGNMENT(^) +#undef _DEFINE_ASSIGNMENT + +#define _DEFINE_COMPARISON(__op) \ + _CCCL_HOST_DEVICE constexpr friend bool operator __op(__avoid_promotions __lhs, __avoid_promotions __rhs) \ + { \ + return static_cast<_Int>(static_cast<__base>(__lhs.__data) __op static_cast<__base>(__rhs.__data)); \ + } + + _DEFINE_COMPARISON(<) + _DEFINE_COMPARISON(>) + _DEFINE_COMPARISON(==) +#if _CCCL_STD_VER <= 2017 + _DEFINE_COMPARISON(!=) +#endif +#undef _DEFINE_COMPARISON + + _Int __data; +}; + +static_assert(sizeof(__avoid_promotions) == sizeof(uint_least8_t), ""); +static_assert(sizeof(__avoid_promotions) == sizeof(uint_least16_t), ""); +static_assert(sizeof(__avoid_promotions) == sizeof(uint_least32_t), ""); + +template +class __bitset +{ +public: + typedef ptrdiff_t difference_type; + typedef size_t size_type; + typedef __avoid_promotions __storage_type; + +protected: + typedef __bitset __self; + typedef __storage_type* __storage_pointer; + typedef const __storage_type* __const_storage_pointer; + static const unsigned __bits_per_word = static_cast(sizeof(__storage_type) * CHAR_BIT); + + friend class __bit_reference<__bitset>; + friend class __bit_const_reference<__bitset>; + friend class __bit_iterator<__bitset, false>; + friend class __bit_iterator<__bitset, true>; + friend struct __bit_array<__bitset>; + + __storage_type __first_[_N_words]; + + typedef __bit_reference<__bitset> reference; + typedef __bit_const_reference<__bitset> 
const_reference; + typedef __bit_iterator<__bitset, false> iterator; + typedef __bit_iterator<__bitset, true> const_iterator; + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY static constexpr __storage_type + __clip_top_word_to_size(unsigned long long __v) + { + return _Size >= 2 * __bits_per_word + ? static_cast<__storage_type>(__v >> __bits_per_word) + : static_cast<__storage_type>( + (__v >> __bits_per_word) & ((__storage_type(1) << (_Size - __bits_per_word)) - 1)); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr __bitset() noexcept + : __first_{0} + {} + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY explicit constexpr __bitset(unsigned long long __v) noexcept + : __first_{static_cast<__storage_type>(__v), __clip_top_word_to_size(__v)} + {} + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 reference + __make_ref(size_t __pos) noexcept + { + return reference(__first_ + __pos / __bits_per_word, __storage_type(1) << __pos % __bits_per_word); + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference + __make_ref(size_t __pos) const noexcept + { + return const_reference(__first_ + __pos / __bits_per_word, __storage_type(1) << __pos % __bits_per_word); + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 iterator + __make_iter(size_t __pos) noexcept + { + return iterator(__first_ + __pos / __bits_per_word, __pos % __bits_per_word); + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 const_iterator + __make_iter(size_t __pos) const noexcept + { + return const_iterator(__first_ + __pos / __bits_per_word, __pos % __bits_per_word); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void + operator&=(const __bitset& __v) noexcept + { + for (size_type __i = 0; __i < _N_words; ++__i) + { + __first_[__i] &= __v.__first_[__i]; + } + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void + operator|=(const __bitset& __v) noexcept + { + for (size_type __i = 0; __i < _N_words; ++__i) + { + __first_[__i] |= __v.__first_[__i]; + } + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void + operator^=(const __bitset& __v) noexcept + { + for (size_type __i = 0; __i < _N_words; ++__i) + { + __first_[__i] ^= __v.__first_[__i]; + } + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bitset& + operator<<=(size_t __pos) noexcept + { + __pos = _CUDA_VSTD::min(__pos, _Size); + _CUDA_VSTD::copy_backward(__make_iter(0), __make_iter(_Size - __pos), __make_iter(_Size)); + _CUDA_VSTD::fill_n(__make_iter(0), __pos, false); + return *this; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bitset& + operator>>=(size_t __pos) noexcept + { + __pos = _CUDA_VSTD::min(__pos, _Size); + _CUDA_VSTD::copy(__make_iter(__pos), __make_iter(_Size), __make_iter(0)); + _CUDA_VSTD::fill_n(__make_iter(_Size - __pos), __pos, false); + return *this; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void flip() noexcept + { + // do middle whole words + size_type __n = _Size; + __storage_pointer __p = __first_; + for (; __n >= __bits_per_word; ++__p, __n -= __bits_per_word) + { + *__p = ~*__p; + } + // do last partial word + if (__n > 0) + { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + __storage_type __b = *__p & __m; + *__p &= ~__m; + 
*__p |= ~__b & __m; + } + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long to_ulong() const + { + return to_ulong(integral_constant()); + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long long to_ullong() const + { + return to_ullong(integral_constant()); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool all() const noexcept + { + // do middle whole words + size_type __n = _Size; + __const_storage_pointer __p = __first_; + for (; __n >= __bits_per_word; ++__p, __n -= __bits_per_word) + { + if (~*__p) + { + return false; + } + } + // do last partial word + if (__n > 0) + { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + if (~*__p & __m) + { + return false; + } + } + return true; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool any() const noexcept + { + // do middle whole words + size_type __n = _Size; + __const_storage_pointer __p = __first_; + for (; __n >= __bits_per_word; ++__p, __n -= __bits_per_word) + { + if (*__p) + { + return true; + } + } + // do last partial word + if (__n > 0) + { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + if (*__p & __m) + { + return true; + } + } + return false; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY size_t __hash_code() const noexcept + { + size_t __h = 0; + for (size_type __i = 0; __i < _N_words; ++__i) + { + __h ^= __first_[__i]; + } + return __h; + } + +private: + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long to_ulong(false_type) const + { + const_iterator __e = __make_iter(_Size); + const_iterator __i = _CUDA_VSTD::find(__make_iter(sizeof(unsigned long) * CHAR_BIT), __e, true); + if (__i != __e) + { + _CUDA_VSTD::__throw_overflow_error("bitset to_ulong overflow error"); + } + + return to_ulong(true_type()); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long to_ulong(true_type) const + { + return to_ulong(true_type(), integral_constant()); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long + to_ulong(true_type, false_type) const + { + return __first_[0].__data; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long + to_ulong(true_type, true_type) const + { + unsigned long __r = __first_[0].__data; + for (size_t __i = 1; __i < sizeof(unsigned long) / sizeof(__storage_type); ++__i) + { + __r |= static_cast(__first_[__i].__data) << (__i * sizeof(__storage_type) * CHAR_BIT); + } + return __r; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long long + to_ullong(false_type) const + { + const_iterator __e = __make_iter(_Size); + const_iterator __i = _CUDA_VSTD::find(__make_iter(sizeof(unsigned long long) * CHAR_BIT), __e, true); + if (__i != __e) + { + _CUDA_VSTD::__throw_overflow_error("bitset to_ullong overflow error"); + } + + return to_ullong(true_type()); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long long + to_ullong(true_type) const + { + return to_ullong(true_type(), integral_constant()); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long long + to_ullong(true_type, false_type) const + { + return __first_[0].__data; + } + + _LIBCUDACXX_HIDE_FROM_ABI 
_LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long long + to_ullong(true_type, true_type) const + { + unsigned long long __r = __first_[0].__data; + for (size_t __i = 1; __i < sizeof(unsigned long long) / sizeof(__storage_type); ++__i) + { + __r |= static_cast(__first_[__i].__data) << (__i * sizeof(__storage_type) * CHAR_BIT); + } + return __r; + } +}; + +template +class __bitset<1, _Size> +{ +public: + typedef ptrdiff_t difference_type; + typedef size_t size_type; + typedef __avoid_promotions<__conditional_t<_Size <= 8, uint8_t, __conditional_t<_Size <= 16, uint16_t, uint32_t>>> + __storage_type; + +protected: + typedef __bitset __self; + typedef __storage_type* __storage_pointer; + typedef const __storage_type* __const_storage_pointer; + static const unsigned __bits_per_word = static_cast(sizeof(__storage_type) * CHAR_BIT); + + friend class __bit_reference<__bitset>; + friend class __bit_const_reference<__bitset>; + friend class __bit_iterator<__bitset, false>; + friend class __bit_iterator<__bitset, true>; + friend struct __bit_array<__bitset>; + + __storage_type __first_; + + typedef __bit_reference<__bitset> reference; + typedef __bit_const_reference<__bitset> const_reference; + typedef __bit_iterator<__bitset, false> iterator; + typedef __bit_iterator<__bitset, true> const_iterator; + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr __bitset() noexcept + : __first_(0) + {} + + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_MSVC(4293) // shift count negative or too big + // MSVC is slightly overeager with diagnosing that here + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY explicit constexpr __bitset(unsigned long long __v) noexcept + : __first_(_Size == __bits_per_word + ? static_cast<__storage_type>(__v) + : static_cast<__storage_type>(__v) & ((__storage_type(1) << _Size) - __storage_type(1))) + {} + _CCCL_DIAG_POP + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 reference + __make_ref(size_t __pos) noexcept + { + return reference(&__first_, __storage_type(1) << __pos); + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference + __make_ref(size_t __pos) const noexcept + { + return const_reference(&__first_, __storage_type(1) << __pos); + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 iterator + __make_iter(size_t __pos) noexcept + { + return iterator(&__first_ + __pos / __bits_per_word, __pos % __bits_per_word); + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 const_iterator + __make_iter(size_t __pos) const noexcept + { + return const_iterator(&__first_ + __pos / __bits_per_word, __pos % __bits_per_word); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void + operator&=(const __bitset& __v) noexcept + { + __first_ &= __v.__first_; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void + operator|=(const __bitset& __v) noexcept + { + __first_ |= __v.__first_; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void + operator^=(const __bitset& __v) noexcept + { + __first_ ^= __v.__first_; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void operator<<=(size_t __rhs) noexcept + { + __first_ <<= __rhs; + __first_ &= ~__storage_type(0) >> (__bits_per_word - _Size); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void operator>>=(size_t 
__rhs) noexcept + { + __first_ >>= __rhs; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void flip() noexcept + { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); + __first_ = ~__first_; + __first_ &= __m; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long to_ulong() const + { +#ifdef _CCCL_COMPILER_MSVC + if (static_cast(__first_.__data) != __first_.__data) + { + _CUDA_VSTD::__throw_overflow_error("bitset to_ulong overflow error"); + } + return static_cast(__first_.__data); +#else // ^^ MSVC ^^ | vv !MSVC vv + return __first_.__data; +#endif // !MSVC + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long long to_ullong() const + { + return __first_.__data; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool all() const noexcept + { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); + return !static_cast(~__first_ & __m); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool any() const noexcept + { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); + return static_cast(__first_ & __m); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY size_t __hash_code() const noexcept + { + return __first_; + } +}; + +template <> +class __bitset<0, 0> +{ +public: + typedef ptrdiff_t difference_type; + typedef size_t size_type; + typedef __avoid_promotions __storage_type; + +protected: + typedef __bitset __self; + typedef __storage_type* __storage_pointer; + typedef const __storage_type* __const_storage_pointer; + static const unsigned __bits_per_word = static_cast(sizeof(__storage_type) * CHAR_BIT); + + friend class __bit_reference<__bitset>; + friend class __bit_const_reference<__bitset>; + friend class __bit_iterator<__bitset, false>; + friend class __bit_iterator<__bitset, true>; + friend struct __bit_array<__bitset>; + + typedef __bit_reference<__bitset> reference; + typedef __bit_const_reference<__bitset> const_reference; + typedef __bit_iterator<__bitset, false> iterator; + typedef __bit_iterator<__bitset, true> const_iterator; + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr __bitset() noexcept {} + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY explicit constexpr __bitset(unsigned long long) noexcept {} + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 reference __make_ref(size_t) noexcept + { + return reference(nullptr, 1); + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference __make_ref(size_t) const noexcept + { + return const_reference(nullptr, 1); + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 iterator __make_iter(size_t) noexcept + { + return iterator(nullptr, 0); + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 const_iterator + __make_iter(size_t) const noexcept + { + return const_iterator(nullptr, 0); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void operator&=(const __bitset&) noexcept + {} + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void operator|=(const __bitset&) noexcept + {} + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void operator^=(const __bitset&) noexcept + {} + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY 
_CCCL_CONSTEXPR_CXX14 void operator<<=(size_t) noexcept {} + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void operator>>=(size_t) noexcept {} + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void flip() noexcept {} + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long to_ulong() const + { + return 0; + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long long to_ullong() const + { + return 0; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool all() const noexcept + { + return true; + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool any() const noexcept + { + return false; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY size_t __hash_code() const noexcept + { + return 0; + } +}; + +template +class _LIBCUDACXX_TEMPLATE_VIS bitset; +template +struct hash>; + +template +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void +__throw_if_out_of_range(size_t __pos, const char* __msg) +{ + if (__pos >= _Size) + { + _CUDA_VSTD::__throw_out_of_range(__msg); + } +} + +template <> +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY void +__throw_if_out_of_range<0>(size_t __pos, const char* __msg) +{ + _CUDA_VSTD::__throw_out_of_range(__msg); +} + +template +class _LIBCUDACXX_TEMPLATE_VIS bitset : private __bitset<_Size == 0 ? 0 : (_Size - 1) / 32 + 1, _Size> +{ +public: + static const unsigned __n_words = _Size == 0 ? 0 : (_Size - 1) / 32 + 1; + typedef __bitset<__n_words, _Size> base; + +public: + typedef typename base::reference reference; + typedef typename base::const_reference const_reference; + + // 23.3.5.1 constructors: + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr bitset() noexcept {} + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr bitset(unsigned long long __v) noexcept + : base(__v) + {} + template ::value>> + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 explicit bitset( + const _CharT* __str, size_t __n = static_cast(-1), _CharT __zero = _CharT('0'), _CharT __one = _CharT('1')) + { + size_t __rlen = _CUDA_VSTD::min(__n, char_traits<_CharT>::length(__str)); +#if defined(_LIBCUDACXX_HAS_STRING_VIEW) + __init_from_string_view(basic_string_view<_CharT>(__str, __rlen), __zero, __one); +#else + __init_from_cstr(__str, __rlen, __zero, __one); +#endif + } +#if defined(_LIBCUDACXX_HAS_STRING_VIEW) + template + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr explicit bitset( + basic_string_view<_CharT, _Traits> __str, + typename basic_string_view<_CharT, _Traits>::size_type __pos = 0, + typename basic_string_view<_CharT, _Traits>::size_type __n = basic_string_view<_CharT, _Traits>::npos, + _CharT __zero = _CharT('0'), + _CharT __one = _CharT('1')) + { + if (__pos > __str.size()) + { + _CUDA_VSTD::__throw_out_of_range("bitset string pos out of range"); + } + + size_t __rlen = _CUDA_VSTD::min(__n, __str.size() - __pos); + __init_from_string_view(basic_string_view<_CharT, _Traits>(__str.data() + __pos, __rlen), __zero, __one); + } +#endif // defined(_LIBCUDACXX_HAS_STRING_VIEW) +#if defined(_LIBCUDACXX_HAS_STRING) + template + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 explicit bitset( + const basic_string<_CharT, _Traits, _Allocator>& __str, + typename basic_string<_CharT, _Traits, 
_Allocator>::size_type __pos = 0, + typename basic_string<_CharT, _Traits, _Allocator>::size_type __n = basic_string<_CharT, _Traits, _Allocator>::npos, + _CharT __zero = _CharT('0'), + _CharT __one = _CharT('1')) + { + if (__pos > __str.size()) + { + _CUDA_VSTD::__throw_out_of_range("bitset string pos out of range"); + } + + size_t __rlen = _CUDA_VSTD::min(__n, __str.size() - __pos); + __init_from_string_view(basic_string_view<_CharT, _Traits>(__str.data() + __pos, __rlen), __zero, __one); + } +#endif // defined(_LIBCUDACXX_HAS_STRING) + + // 23.3.5.2 bitset operations: + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset& + operator&=(const bitset& __rhs) noexcept + { + base::operator&=(__rhs); + return *this; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset& + operator|=(const bitset& __rhs) noexcept + { + base::operator|=(__rhs); + return *this; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset& + operator^=(const bitset& __rhs) noexcept + { + base::operator^=(__rhs); + return *this; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset& + operator<<=(size_t __rhs) noexcept + { + base::operator<<=(__rhs); + return *this; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset& + operator>>=(size_t __rhs) noexcept + { + base::operator>>=(__rhs); + return *this; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset& set() noexcept + { + _CUDA_VSTD::fill_n(base::__make_iter(0), _Size, true); + return *this; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset& + set(size_t __pos, bool __val = true) + { + _CUDA_VSTD::__throw_if_out_of_range<_Size>(__pos, "bitset set argument out of range"); + + (*this)[__pos] = __val; + return *this; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset& reset() noexcept + { + _CUDA_VSTD::fill_n(base::__make_iter(0), _Size, false); + return *this; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset& reset(size_t __pos) + { + _CUDA_VSTD::__throw_if_out_of_range<_Size>(__pos, "bitset reset argument out of range"); + + (*this)[__pos] = false; + return *this; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset operator~() const noexcept + { + bitset __x(*this); + __x.flip(); + return __x; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset& flip() noexcept + { + base::flip(); + return *this; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset& flip(size_t __pos) + { + _CUDA_VSTD::__throw_if_out_of_range<_Size>(__pos, "bitset flip argument out of range"); + + reference __r = base::__make_ref(__pos); + __r = ~__r; + return *this; + } + + // element access: +#ifdef _LIBCUDACXX_ABI_BITSET_VECTOR_BOOL_CONST_SUBSCRIPT_RETURN_BOOL + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr bool operator[](size_t __p) const + { + return base::__make_ref(__p); + } +#else + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference operator[](size_t __p) const + { + return base::__make_ref(__p); + } +#endif + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 reference operator[](size_t __p) + { + return base::__make_ref(__p); + } + 
_LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long to_ulong() const + { + return base::to_ulong(); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long long to_ullong() const + { + return base::to_ullong(); + } + +#if defined(_LIBCUDACXX_HAS_STRING) + template + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY + _CCCL_CONSTEXPR_CXX14 basic_string<_CharT, _Traits, _Allocator> + to_string(_CharT __zero = _CharT('0'), _CharT __one = _CharT('1')) const + { + basic_string<_CharT, _Traits, _Allocator> __r(_Size, __zero); + for (size_t __i = 0; __i != _Size; ++__i) + { + if ((*this)[__i]) + { + __r[_Size - 1 - __i] = __one; + } + } + return __r; + } + + template + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY + _CCCL_CONSTEXPR_CXX14 basic_string<_CharT, _Traits, allocator<_CharT>> + to_string(_CharT __zero = _CharT('0'), _CharT __one = _CharT('1')) const + { + return to_string<_CharT, _Traits, allocator<_CharT>>(__zero, __one); + } + + template + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY + _CCCL_CONSTEXPR_CXX14 basic_string<_CharT, char_traits<_CharT>, allocator<_CharT>> + to_string(_CharT __zero = _CharT('0'), _CharT __one = _CharT('1')) const + { + return to_string<_CharT, char_traits<_CharT>, allocator<_CharT>>(__zero, __one); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY + _CCCL_CONSTEXPR_CXX14 basic_string, allocator> + to_string(char __zero = '0', char __one = '1') const + { + return to_string, allocator>(__zero, __one); + } +#endif // defined(_LIBCUDACXX_HAS_STRING) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 size_t count() const noexcept + { + return static_cast(_CUDA_VSTD::count(base::__make_iter(0), base::__make_iter(_Size), true)); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr size_t size() const noexcept + { + return _Size; + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool + operator==(const bitset& __rhs) const noexcept + { + return _CUDA_VSTD::equal(base::__make_iter(0), base::__make_iter(_Size), __rhs.__make_iter(0)); + } + +#if _CCCL_STD_VER <= 2017 + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool + operator!=(const bitset& __rhs) const noexcept + { + return !(*this == __rhs); + } +#endif // C++ <= 17 + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool test(size_t __pos) const + { + _CUDA_VSTD::__throw_if_out_of_range<_Size>(__pos, "bitset test argument out of range"); + + return (*this)[__pos]; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool all() const noexcept + { + return base::all(); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool any() const noexcept + { + return base::any(); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool none() const noexcept + { + return !any(); + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset + operator<<(size_t __pos) const noexcept + { + bitset __r = *this; + __r <<= __pos; + return __r; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset + operator>>(size_t __pos) const noexcept + { + bitset __r = *this; + __r >>= __pos; + return __r; + } + +private: +#if defined(_LIBCUDACXX_HAS_STRING_VIEW) + template + _LIBCUDACXX_HIDE_FROM_ABI 
_LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void + __init_from_string_view(basic_string_view<_CharT, _Traits> __str, _CharT __zero, _CharT __one) + { + for (size_t __i = 0; __i < __str.size(); ++__i) + { + if (!_Traits::eq(__str[__i], __zero) && !_Traits::eq(__str[__i], __one)) + { + _CUDA_VSTD::__throw_invalid_argument("bitset string ctor has invalid argument"); + } + } + + size_t __mp = _CUDA_VSTD::min(__str.size(), _Size); + size_t __i = 0; + for (; __i < __mp; ++__i) + { + _CharT __c = __str[__mp - 1 - __i]; + (*this)[__i] = _Traits::eq(__c, __one); + } + _CUDA_VSTD::fill(base::__make_iter(__i), base::__make_iter(_Size), false); + } +#else // ^^ _LIBCUDACXX_HAS_STRING_VIEW ^^ | vv !_LIBCUDACXX_HAS_STRING_VIEW vv + template > + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void + __init_from_cstr(const _CharT* __str, size_t __size, _CharT __zero, _CharT __one) + { + for (size_t __i = 0; __i < __size; ++__i) + { + if (!_Traits::eq(__str[__i], __zero) && !_Traits::eq(__str[__i], __one)) + { + _CUDA_VSTD::__throw_invalid_argument("bitset string ctor has invalid argument"); + } + } + + size_t __mp = _CUDA_VSTD::min(__size, _Size); + size_t __i = 0; + for (; __i < __mp; ++__i) + { + _CharT __c = __str[__mp - 1 - __i]; + (*this)[__i] = _Traits::eq(__c, __one); + } + _CUDA_VSTD::fill(base::__make_iter(__i), base::__make_iter(_Size), false); + } +#endif // !_LIBCUDACXX_HAS_STRING_VIEW + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY size_t __hash_code() const noexcept + { + return base::__hash_code(); + } + + friend struct hash; +}; + +template +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset<_Size> +operator&(const bitset<_Size>& __x, const bitset<_Size>& __y) noexcept +{ + bitset<_Size> __r = __x; + __r &= __y; + return __r; +} + +template +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset<_Size> +operator|(const bitset<_Size>& __x, const bitset<_Size>& __y) noexcept +{ + bitset<_Size> __r = __x; + __r |= __y; + return __r; +} + +template +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset<_Size> +operator^(const bitset<_Size>& __x, const bitset<_Size>& __y) noexcept +{ + bitset<_Size> __r = __x; + __r ^= __y; + return __r; +} + +template +struct _LIBCUDACXX_TEMPLATE_VIS hash> : public __unary_function, size_t> +{ + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY size_t operator()(const bitset<_Size>& __bs) const noexcept + { + return __bs.__hash_code(); + } +}; + +template +_LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY basic_istream<_CharT, _Traits>& +operator>>(basic_istream<_CharT, _Traits>& __is, bitset<_Size>& __x); + +template +_LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY basic_ostream<_CharT, _Traits>& +operator<<(basic_ostream<_CharT, _Traits>& __os, const bitset<_Size>& __x); + +_LIBCUDACXX_END_NAMESPACE_STD + +_CCCL_POP_MACROS + +#endif // _CUDA_STD_BITSET diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__string b/libcudacxx/include/cuda/std/detail/libcxx/include/__string new file mode 100644 index 0000000000..1f0517b99d --- /dev/null +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__string @@ -0,0 +1,1246 @@ +// -*- C++ -*- +//===-------------------------- __string ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___STRING +#define _LIBCUDACXX___STRING + +/* + string synopsis + +namespace std +{ + +template +struct char_traits +{ + typedef charT char_type; + typedef ... int_type; + typedef streamoff off_type; + typedef streampos pos_type; + typedef mbstate_t state_type; + + static constexpr void assign(char_type& c1, const char_type& c2) noexcept; + static constexpr bool eq(char_type c1, char_type c2) noexcept; + static constexpr bool lt(char_type c1, char_type c2) noexcept; + + static constexpr int compare(const char_type* s1, const char_type* s2, size_t n); + static constexpr size_t length(const char_type* s); + static constexpr const char_type* + find(const char_type* s, size_t n, const char_type& a); + static char_type* move(char_type* s1, const char_type* s2, size_t n); + static char_type* copy(char_type* s1, const char_type* s2, size_t n); + static char_type* assign(char_type* s, size_t n, char_type a); + + static constexpr int_type not_eof(int_type c) noexcept; + static constexpr char_type to_char_type(int_type c) noexcept; + static constexpr int_type to_int_type(char_type c) noexcept; + static constexpr bool eq_int_type(int_type c1, int_type c2) noexcept; + static constexpr int_type eof() noexcept; +}; + +template <> struct char_traits; +template <> struct char_traits; +template <> struct char_traits; // c++20 + +} // std + +*/ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include // all public C++ headers provide the assertion handler +#include +#include + +_CCCL_PUSH_MACROS + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// char_traits + +template +struct _LIBCUDACXX_TEMPLATE_VIS char_traits +{ + typedef _CharT char_type; + typedef int int_type; + typedef streamoff off_type; +#ifndef _LIBCUDACXX_HAS_NO_WCHAR_H + typedef streampos pos_type; + typedef mbstate_t state_type; +#endif // !_LIBCUDACXX_HAS_NO_WCHAR_H + + _LIBCUDACXX_INLINE_VISIBILITY static inline void _CCCL_CONSTEXPR_CXX14 + assign(char_type& __c1, const char_type& __c2) noexcept + { + __c1 = __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool eq(char_type __c1, char_type __c2) noexcept + { + return __c1 == __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool lt(char_type __c1, char_type __c2) noexcept + { + return __c1 < __c2; + } + + _LIBCUDACXX_INLINE_VISIBILITY static _CCCL_CONSTEXPR_CXX14 int + compare(const char_type* __s1, const char_type* __s2, size_t __n); + _LIBCUDACXX_INLINE_VISIBILITY static _CCCL_CONSTEXPR_CXX14 size_t length(const char_type* __s); + _LIBCUDACXX_INLINE_VISIBILITY static _CCCL_CONSTEXPR_CXX14 const char_type* + find(const char_type* __s, size_t __n, const char_type& __a); + _LIBCUDACXX_INLINE_VISIBILITY static char_type* move(char_type* __s1, const char_type* __s2, size_t __n); + _LIBCUDACXX_INLINE_VISIBILITY static char_type* copy(char_type* __s1, const char_type* __s2, size_t __n); + _LIBCUDACXX_INLINE_VISIBILITY static char_type* assign(char_type* __s, size_t __n, char_type __a); + +#ifndef __cuda_std__ + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type not_eof(int_type __c) noexcept + { + return 
eq_int_type(__c, eof()) ? ~eof() : __c; + } +#endif // !__cuda_std__ + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr char_type to_char_type(int_type __c) noexcept + { + return char_type(__c); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type to_int_type(char_type __c) noexcept + { + return int_type(__c); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool eq_int_type(int_type __c1, int_type __c2) noexcept + { + return __c1 == __c2; + } +#ifndef __cuda_std__ + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type eof() noexcept + { + return int_type(EOF); + } +#endif // !__cuda_std__ +}; + +template +_CCCL_CONSTEXPR_CXX14 int char_traits<_CharT>::compare(const char_type* __s1, const char_type* __s2, size_t __n) +{ + for (; __n; --__n, ++__s1, ++__s2) + { + if (lt(*__s1, *__s2)) + { + return -1; + } + if (lt(*__s2, *__s1)) + { + return 1; + } + } + return 0; +} + +template +inline _CCCL_CONSTEXPR_CXX14 size_t char_traits<_CharT>::length(const char_type* __s) +{ + size_t __len = 0; + for (; !eq(*__s, char_type(0)); ++__s) + { + ++__len; + } + return __len; +} + +template +inline _CCCL_CONSTEXPR_CXX14 const _CharT* +char_traits<_CharT>::find(const char_type* __s, size_t __n, const char_type& __a) +{ + for (; __n; --__n) + { + if (eq(*__s, __a)) + { + return __s; + } + ++__s; + } + return 0; +} + +template +inline _LIBCUDACXX_INLINE_VISIBILITY _CharT* +char_traits<_CharT>::move(char_type* __s1, const char_type* __s2, size_t __n) +{ + char_type* __r = __s1; + if (__s1 < __s2) + { + for (; __n; --__n, ++__s1, ++__s2) + { + assign(*__s1, *__s2); + } + } + else if (__s2 < __s1) + { + __s1 += __n; + __s2 += __n; + for (; __n; --__n) + { + assign(*--__s1, *--__s2); + } + } + return __r; +} + +template +inline _LIBCUDACXX_INLINE_VISIBILITY _CharT* +char_traits<_CharT>::copy(char_type* __s1, const char_type* __s2, size_t __n) +{ + _LIBCUDACXX_ASSERT(__s2 < __s1 || __s2 >= __s1 + __n, "char_traits::copy overlapped range"); + char_type* __r = __s1; + for (; __n; --__n, ++__s1, ++__s2) + { + assign(*__s1, *__s2); + } + return __r; +} + +template +inline _LIBCUDACXX_INLINE_VISIBILITY _CharT* char_traits<_CharT>::assign(char_type* __s, size_t __n, char_type __a) +{ + char_type* __r = __s; + for (; __n; --__n, ++__s) + { + assign(*__s, __a); + } + return __r; +} + +// char_traits + +// GCC's builtin_strlen isn't reliable at constexpr time +// MSVC does not expose builtin_strlen before C++17 +#if defined(_CCCL_COMPILER_GCC) || (defined(_CCCL_COMPILER_MSVC) && _CCCL_STD_VER < 2017) +# define _CCCL_HAS_NO_BUILTIN_STRLEN +#endif + +template <> +struct _LIBCUDACXX_TEMPLATE_VIS char_traits +{ + typedef char char_type; + typedef int int_type; + typedef streamoff off_type; +#ifndef _LIBCUDACXX_HAS_NO_WCHAR_H + typedef streampos pos_type; + typedef mbstate_t state_type; +#endif // !_LIBCUDACXX_HAS_NO_WCHAR_H + + _LIBCUDACXX_INLINE_VISIBILITY static inline _CCCL_CONSTEXPR_CXX14 void + assign(char_type& __c1, const char_type& __c2) noexcept + { + __c1 = __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool eq(char_type __c1, char_type __c2) noexcept + { + return __c1 == __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool lt(char_type __c1, char_type __c2) noexcept + { + return (unsigned char) __c1 < (unsigned char) __c2; + } + + _LIBCUDACXX_INLINE_VISIBILITY static _CCCL_CONSTEXPR_CXX14 int + compare(const char_type* __s1, const char_type* __s2, size_t __n) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY static inline size_t 
_CCCL_CONSTEXPR_CXX14 length(const char_type* __s) noexcept + { +#ifdef _CCCL_HAS_NO_BUILTIN_STRLEN +# ifdef _LIBCUDACXX_IS_CONSTANT_EVALUATED // is_constant_evaluated only exists since GCC 9 + if (__libcpp_is_constant_evaluated()) +# endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) + { + size_t __len = 0; + for (; !eq(*__s, char(0)); ++__s) + { + ++__len; + } + return __len; + } +#endif // defined(_CCCL_HAS_NO_BUILTIN_STRLEN) +#if !defined(_CCCL_HAS_NO_BUILTIN_STRLEN) || defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) + NV_IF_ELSE_TARGET(NV_IS_DEVICE, + (size_t __len = 0; for (; !eq(*__s, char(0)); ++__s)++ __len; return __len;), + (return __builtin_strlen(__s);)) +#endif // !defined(_CCCL_HAS_NO_BUILTIN_STRLEN) || defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) + } + _LIBCUDACXX_INLINE_VISIBILITY static _CCCL_CONSTEXPR_CXX14 const char_type* + find(const char_type* __s, size_t __n, const char_type& __a) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY static inline char_type* + move(char_type* __s1, const char_type* __s2, size_t __n) noexcept + { + return __n == 0 ? __s1 : (char_type*) __copy<_ClassicAlgPolicy>(__s2, __s2 + __n, __s1).first - __n; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline char_type* + copy(char_type* __s1, const char_type* __s2, size_t __n) noexcept + { + _LIBCUDACXX_ASSERT(__s2 < __s1 || __s2 >= __s1 + __n, "char_traits::copy overlapped range"); + return __n == 0 ? __s1 : (char_type*) memcpy(__s1, __s2, __n); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline char_type* assign(char_type* __s, size_t __n, char_type __a) noexcept + { + return __n == 0 ? __s : (char_type*) memset(__s, to_int_type(__a), __n); + } + +#ifndef __cuda_std__ + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type not_eof(int_type __c) noexcept + { + return eq_int_type(__c, eof()) ? 
~eof() : __c; + } +#endif // !__cuda_std__ + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr char_type to_char_type(int_type __c) noexcept + { + return char_type(__c); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type to_int_type(char_type __c) noexcept + { + return int_type((unsigned char) __c); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool eq_int_type(int_type __c1, int_type __c2) noexcept + { + return __c1 == __c2; + } +#ifndef __cuda_std__ + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type eof() noexcept + { + return int_type(EOF); + } +#endif // !__cuda_std__ +}; + +inline _CCCL_CONSTEXPR_CXX14 int +char_traits::compare(const char_type* __s1, const char_type* __s2, size_t __n) noexcept +{ + if (__n == 0) + { + return 0; + } +#if __has_feature(cxx_constexpr_string_builtins) + return __builtin_memcmp(__s1, __s2, __n); +#else + for (; __n; --__n, ++__s1, ++__s2) + { + if (lt(*__s1, *__s2)) + { + return -1; + } + if (lt(*__s2, *__s1)) + { + return 1; + } + } + return 0; +#endif // !has_feature(constexpr_string_builtins) +} + +inline _CCCL_CONSTEXPR_CXX14 const char* +char_traits::find(const char_type* __s, size_t __n, const char_type& __a) noexcept +{ + if (__n == 0) + { + return nullptr; + } +#if __has_feature(cxx_constexpr_string_builtins) + return __builtin_char_memchr(__s, to_int_type(__a), __n); +#else + for (; __n; --__n) + { + if (eq(*__s, __a)) + { + return __s; + } + ++__s; + } + return nullptr; +#endif // !has_feature(constexpr_string_builtins) +} + +// char_traits + +#ifndef _LIBCUDACXX_HAS_NO_WCHAR_H +template <> +struct _LIBCUDACXX_TEMPLATE_VIS char_traits +{ + typedef wchar_t char_type; + typedef wint_t int_type; + typedef streamoff off_type; + typedef streampos pos_type; + typedef mbstate_t state_type; + + _LIBCUDACXX_INLINE_VISIBILITY static inline _CCCL_CONSTEXPR_CXX14 void + assign(char_type& __c1, const char_type& __c2) noexcept + { + __c1 = __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool eq(char_type __c1, char_type __c2) noexcept + { + return __c1 == __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool lt(char_type __c1, char_type __c2) noexcept + { + return __c1 < __c2; + } + + static _CCCL_CONSTEXPR_CXX14 int compare(const char_type* __s1, const char_type* __s2, size_t __n) noexcept; + static _CCCL_CONSTEXPR_CXX14 size_t length(const char_type* __s) noexcept; + static _CCCL_CONSTEXPR_CXX14 const char_type* find(const char_type* __s, size_t __n, const char_type& __a) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY static inline char_type* + move(char_type* __s1, const char_type* __s2, size_t __n) noexcept + { + return __n == 0 ? __s1 : (char_type*) wmemmove(__s1, __s2, __n); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline char_type* + copy(char_type* __s1, const char_type* __s2, size_t __n) noexcept + { + _LIBCUDACXX_ASSERT(__s2 < __s1 || __s2 >= __s1 + __n, "char_traits::copy overlapped range"); + return __n == 0 ? __s1 : (char_type*) wmemcpy(__s1, __s2, __n); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline char_type* assign(char_type* __s, size_t __n, char_type __a) noexcept + { + return __n == 0 ? __s : (char_type*) wmemset(__s, __a, __n); + } + +# ifndef __cuda_std__ + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type not_eof(int_type __c) noexcept + { + return eq_int_type(__c, eof()) ? 
~eof() : __c; + } +# endif // !__cuda_std__ + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr char_type to_char_type(int_type __c) noexcept + { + return char_type(__c); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type to_int_type(char_type __c) noexcept + { + return int_type(__c); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool eq_int_type(int_type __c1, int_type __c2) noexcept + { + return __c1 == __c2; + } +# ifndef __cuda_std__ + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type eof() noexcept + { + return int_type(WEOF); + } +# endif // !__cuda_std__ +}; + +inline _CCCL_CONSTEXPR_CXX14 int +char_traits::compare(const char_type* __s1, const char_type* __s2, size_t __n) noexcept +{ + if (__n == 0) + { + return 0; + } +# if __has_feature(cxx_constexpr_string_builtins) + return __builtin_wmemcmp(__s1, __s2, __n); +# elif _CCCL_STD_VER <= 2014 + return wmemcmp(__s1, __s2, __n); +# else + for (; __n; --__n, ++__s1, ++__s2) + { + if (lt(*__s1, *__s2)) + { + return -1; + } + if (lt(*__s2, *__s1)) + { + return 1; + } + } + return 0; +# endif +} +#endif // !_LIBCUDACXX_HAS_NO_WCHAR_H + +template +_LIBCUDACXX_INLINE_VISIBILITY inline constexpr size_t +__char_traits_length_checked(const typename _Traits::char_type* __s) noexcept +{ +#if _LIBCUDACXX_DEBUG_LEVEL >= 1 + return __s + ? _Traits::length(__s) + : (_CUDA_VSTD::__libcpp_debug_function(_CUDA_VSTD::__libcpp_debug_info( + __FILE__, __LINE__, "p == nullptr", "null pointer pass to non-null argument of char_traits<...>::length")), + 0); +#else + return _Traits::length(__s); +#endif +} + +#ifndef _LIBCUDACXX_HAS_NO_WCHAR_H +inline _CCCL_CONSTEXPR_CXX14 size_t char_traits::length(const char_type* __s) noexcept +{ +# if __has_feature(cxx_constexpr_string_builtins) + return __builtin_wcslen(__s); +# elif _CCCL_STD_VER <= 2014 + return wcslen(__s); +# else + size_t __len = 0; + for (; !eq(*__s, char_type(0)); ++__s) + { + ++__len; + } + return __len; +# endif +} + +inline _CCCL_CONSTEXPR_CXX14 const wchar_t* +char_traits::find(const char_type* __s, size_t __n, const char_type& __a) noexcept +{ + if (__n == 0) + { + return nullptr; + } +# if __has_feature(cxx_constexpr_string_builtins) + return __builtin_wmemchr(__s, __a, __n); +# else + for (; __n; --__n) + { + if (eq(*__s, __a)) + { + return __s; + } + ++__s; + } + return nullptr; +# endif +} +#endif // !_LIBCUDACXX_HAS_NO_WCHAR_H + +#ifndef _LIBCUDACXX_NO_HAS_CHAR8_T + +template <> +struct _LIBCUDACXX_TEMPLATE_VIS char_traits +{ + typedef char8_t char_type; + typedef unsigned int int_type; + typedef streamoff off_type; +# ifndef _LIBCUDACXX_HAS_NO_WCHAR_H + typedef u8streampos pos_type; + typedef mbstate_t state_type; +# endif // !_LIBCUDACXX_HAS_NO_WCHAR_H + + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr void assign(char_type& __c1, const char_type& __c2) noexcept + { + __c1 = __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool eq(char_type __c1, char_type __c2) noexcept + { + return __c1 == __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool lt(char_type __c1, char_type __c2) noexcept + { + return __c1 < __c2; + } + + _LIBCUDACXX_INLINE_VISIBILITY static constexpr int + compare(const char_type* __s1, const char_type* __s2, size_t __n) noexcept; + + _LIBCUDACXX_INLINE_VISIBILITY static constexpr size_t length(const char_type* __s) noexcept; + + _LIBCUDACXX_INLINE_VISIBILITY static constexpr const char_type* + find(const char_type* __s, size_t __n, const char_type& __a) noexcept; + + 
_LIBCUDACXX_INLINE_VISIBILITY static char_type* move(char_type* __s1, const char_type* __s2, size_t __n) noexcept + { + return __n == 0 ? __s1 : (char_type*) __copy<_ClassicAlgPolicy>(__s2, __s2 + __n, __s1).first - __n; + } + + _LIBCUDACXX_INLINE_VISIBILITY static char_type* copy(char_type* __s1, const char_type* __s2, size_t __n) noexcept + { + _LIBCUDACXX_ASSERT(__s2 < __s1 || __s2 >= __s1 + __n, "char_traits::copy overlapped range"); + return __n == 0 ? __s1 : (char_type*) memcpy(__s1, __s2, __n); + } + + _LIBCUDACXX_INLINE_VISIBILITY static char_type* assign(char_type* __s, size_t __n, char_type __a) noexcept + { + return __n == 0 ? __s : (char_type*) memset(__s, to_int_type(__a), __n); + } + +# ifndef __cuda_std__ + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type not_eof(int_type __c) noexcept + { + return eq_int_type(__c, eof()) ? ~eof() : __c; + } +# endif // !__cuda_std__ + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr char_type to_char_type(int_type __c) noexcept + { + return char_type(__c); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type to_int_type(char_type __c) noexcept + { + return int_type(__c); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool eq_int_type(int_type __c1, int_type __c2) noexcept + { + return __c1 == __c2; + } +# ifndef __cuda_std__ + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type eof() noexcept + { + return int_type(EOF); + } +# endif // !__cuda_std__ +}; + +// TODO use '__builtin_strlen' if it ever supports char8_t ?? +inline constexpr size_t char_traits::length(const char_type* __s) noexcept +{ + size_t __len = 0; + for (; !eq(*__s, char_type(0)); ++__s) + { + ++__len; + } + return __len; +} + +inline constexpr int char_traits::compare(const char_type* __s1, const char_type* __s2, size_t __n) noexcept +{ +# if __has_feature(cxx_constexpr_string_builtins) + return __builtin_memcmp(__s1, __s2, __n); +# else + for (; __n; --__n, ++__s1, ++__s2) + { + if (lt(*__s1, *__s2)) + { + return -1; + } + if (lt(*__s2, *__s1)) + { + return 1; + } + } + return 0; +# endif +} + +// TODO use '__builtin_char_memchr' if it ever supports char8_t ?? 
+inline constexpr const char8_t* +char_traits::find(const char_type* __s, size_t __n, const char_type& __a) noexcept +{ + for (; __n; --__n) + { + if (eq(*__s, __a)) + { + return __s; + } + ++__s; + } + return 0; +} + +#endif // #_LIBCUDACXX_NO_HAS_CHAR8_T + +#ifndef _LIBCUDACXX_HAS_NO_UNICODE_CHARS + +template <> +struct _LIBCUDACXX_TEMPLATE_VIS char_traits +{ + typedef char16_t char_type; + typedef uint_least16_t int_type; + typedef streamoff off_type; +# ifndef _LIBCUDACXX_HAS_NO_WCHAR_H + typedef u16streampos pos_type; + typedef mbstate_t state_type; +# endif // !_LIBCUDACXX_HAS_NO_WCHAR_H + + _LIBCUDACXX_INLINE_VISIBILITY static inline _CCCL_CONSTEXPR_CXX14 void + assign(char_type& __c1, const char_type& __c2) noexcept + { + __c1 = __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool eq(char_type __c1, char_type __c2) noexcept + { + return __c1 == __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool lt(char_type __c1, char_type __c2) noexcept + { + return __c1 < __c2; + } + + _LIBCUDACXX_INLINE_VISIBILITY static _CCCL_CONSTEXPR_CXX14 int + compare(const char_type* __s1, const char_type* __s2, size_t __n) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY static _CCCL_CONSTEXPR_CXX14 size_t length(const char_type* __s) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY static _CCCL_CONSTEXPR_CXX14 const char_type* + find(const char_type* __s, size_t __n, const char_type& __a) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY static char_type* move(char_type* __s1, const char_type* __s2, size_t __n) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY static char_type* copy(char_type* __s1, const char_type* __s2, size_t __n) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY static char_type* assign(char_type* __s, size_t __n, char_type __a) noexcept; + + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type not_eof(int_type __c) noexcept + { + return eq_int_type(__c, eof()) ? 
~eof() : __c; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr char_type to_char_type(int_type __c) noexcept + { + return char_type(__c); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type to_int_type(char_type __c) noexcept + { + return int_type(__c); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool eq_int_type(int_type __c1, int_type __c2) noexcept + { + return __c1 == __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type eof() noexcept + { + return int_type(0xFFFF); + } +}; + +inline _CCCL_CONSTEXPR_CXX14 int +char_traits::compare(const char_type* __s1, const char_type* __s2, size_t __n) noexcept +{ + for (; __n; --__n, ++__s1, ++__s2) + { + if (lt(*__s1, *__s2)) + { + return -1; + } + if (lt(*__s2, *__s1)) + { + return 1; + } + } + return 0; +} + +inline _CCCL_CONSTEXPR_CXX14 size_t char_traits::length(const char_type* __s) noexcept +{ + size_t __len = 0; + for (; !eq(*__s, char_type(0)); ++__s) + { + ++__len; + } + return __len; +} + +inline _CCCL_CONSTEXPR_CXX14 const char16_t* +char_traits::find(const char_type* __s, size_t __n, const char_type& __a) noexcept +{ + for (; __n; --__n) + { + if (eq(*__s, __a)) + { + return __s; + } + ++__s; + } + return 0; +} + +inline _LIBCUDACXX_INLINE_VISIBILITY char16_t* +char_traits::move(char_type* __s1, const char_type* __s2, size_t __n) noexcept +{ + char_type* __r = __s1; + if (__s1 < __s2) + { + for (; __n; --__n, ++__s1, ++__s2) + { + assign(*__s1, *__s2); + } + } + else if (__s2 < __s1) + { + __s1 += __n; + __s2 += __n; + for (; __n; --__n) + { + assign(*--__s1, *--__s2); + } + } + return __r; +} + +inline _LIBCUDACXX_INLINE_VISIBILITY char16_t* +char_traits::copy(char_type* __s1, const char_type* __s2, size_t __n) noexcept +{ + _LIBCUDACXX_ASSERT(__s2 < __s1 || __s2 >= __s1 + __n, "char_traits::copy overlapped range"); + char_type* __r = __s1; + for (; __n; --__n, ++__s1, ++__s2) + { + assign(*__s1, *__s2); + } + return __r; +} + +inline _LIBCUDACXX_INLINE_VISIBILITY char16_t* +char_traits::assign(char_type* __s, size_t __n, char_type __a) noexcept +{ + char_type* __r = __s; + for (; __n; --__n, ++__s) + { + assign(*__s, __a); + } + return __r; +} + +template <> +struct _LIBCUDACXX_TEMPLATE_VIS char_traits +{ + typedef char32_t char_type; + typedef uint_least32_t int_type; + typedef streamoff off_type; +# ifndef _LIBCUDACXX_HAS_NO_WCHAR_H + typedef u32streampos pos_type; + typedef mbstate_t state_type; +# endif // !_LIBCUDACXX_HAS_NO_WCHAR_H + + _LIBCUDACXX_INLINE_VISIBILITY static inline _CCCL_CONSTEXPR_CXX14 void + assign(char_type& __c1, const char_type& __c2) noexcept + { + __c1 = __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool eq(char_type __c1, char_type __c2) noexcept + { + return __c1 == __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool lt(char_type __c1, char_type __c2) noexcept + { + return __c1 < __c2; + } + + _LIBCUDACXX_INLINE_VISIBILITY static _CCCL_CONSTEXPR_CXX14 int + compare(const char_type* __s1, const char_type* __s2, size_t __n) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY static _CCCL_CONSTEXPR_CXX14 size_t length(const char_type* __s) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY static _CCCL_CONSTEXPR_CXX14 const char_type* + find(const char_type* __s, size_t __n, const char_type& __a) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY static char_type* move(char_type* __s1, const char_type* __s2, size_t __n) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY static char_type* copy(char_type* __s1, const char_type* __s2, size_t __n) 
noexcept; + _LIBCUDACXX_INLINE_VISIBILITY static char_type* assign(char_type* __s, size_t __n, char_type __a) noexcept; + + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type not_eof(int_type __c) noexcept + { + return eq_int_type(__c, eof()) ? ~eof() : __c; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr char_type to_char_type(int_type __c) noexcept + { + return char_type(__c); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type to_int_type(char_type __c) noexcept + { + return int_type(__c); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool eq_int_type(int_type __c1, int_type __c2) noexcept + { + return __c1 == __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type eof() noexcept + { + return int_type(0xFFFFFFFF); + } +}; + +inline _CCCL_CONSTEXPR_CXX14 int +char_traits::compare(const char_type* __s1, const char_type* __s2, size_t __n) noexcept +{ + for (; __n; --__n, ++__s1, ++__s2) + { + if (lt(*__s1, *__s2)) + { + return -1; + } + if (lt(*__s2, *__s1)) + { + return 1; + } + } + return 0; +} + +inline _CCCL_CONSTEXPR_CXX14 size_t char_traits::length(const char_type* __s) noexcept +{ + size_t __len = 0; + for (; !eq(*__s, char_type(0)); ++__s) + { + ++__len; + } + return __len; +} + +inline _CCCL_CONSTEXPR_CXX14 const char32_t* +char_traits::find(const char_type* __s, size_t __n, const char_type& __a) noexcept +{ + for (; __n; --__n) + { + if (eq(*__s, __a)) + { + return __s; + } + ++__s; + } + return 0; +} + +inline _LIBCUDACXX_INLINE_VISIBILITY char32_t* +char_traits::move(char_type* __s1, const char_type* __s2, size_t __n) noexcept +{ + char_type* __r = __s1; + if (__s1 < __s2) + { + for (; __n; --__n, ++__s1, ++__s2) + { + assign(*__s1, *__s2); + } + } + else if (__s2 < __s1) + { + __s1 += __n; + __s2 += __n; + for (; __n; --__n) + { + assign(*--__s1, *--__s2); + } + } + return __r; +} + +inline _LIBCUDACXX_INLINE_VISIBILITY char32_t* +char_traits::copy(char_type* __s1, const char_type* __s2, size_t __n) noexcept +{ + _LIBCUDACXX_ASSERT(__s2 < __s1 || __s2 >= __s1 + __n, "char_traits::copy overlapped range"); + char_type* __r = __s1; + for (; __n; --__n, ++__s1, ++__s2) + { + assign(*__s1, *__s2); + } + return __r; +} + +inline _LIBCUDACXX_INLINE_VISIBILITY char32_t* +char_traits::assign(char_type* __s, size_t __n, char_type __a) noexcept +{ + char_type* __r = __s; + for (; __n; --__n, ++__s) + { + assign(*__s, __a); + } + return __r; +} + +#endif // _LIBCUDACXX_HAS_NO_UNICODE_CHARS + +// helper fns for basic_string and string_view + +// __str_find +template +inline _SizeT _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY +__str_find(const _CharT* __p, _SizeT __sz, _CharT __c, _SizeT __pos) noexcept +{ + if (__pos >= __sz) + { + return __npos; + } + const _CharT* __r = _Traits::find(__p + __pos, __sz - __pos, __c); + if (__r == 0) + { + return __npos; + } + return static_cast<_SizeT>(__r - __p); +} + +template +inline _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY const _CharT* +__search_substring(const _CharT* __first1, const _CharT* __last1, const _CharT* __first2, const _CharT* __last2) +{ + // Take advantage of knowing source and pattern lengths. + // Stop short when source is smaller than pattern. + const ptrdiff_t __len2 = __last2 - __first2; + if (__len2 == 0) + { + return __first1; + } + + ptrdiff_t __len1 = __last1 - __first1; + if (__len1 < __len2) + { + return __last1; + } + + // First element of __first2 is loop invariant. 
+ _CharT __f2 = *__first2; + while (true) + { + __len1 = __last1 - __first1; + // Check whether __first1 still has at least __len2 bytes. + if (__len1 < __len2) + { + return __last1; + } + + // Find __f2 the first byte matching in __first1. + __first1 = _Traits::find(__first1, __len1 - __len2 + 1, __f2); + if (__first1 == 0) + { + return __last1; + } + + // It is faster to compare from the first byte of __first1 even if we + // already know that it matches the first byte of __first2: this is because + // __first2 is most likely aligned, as it is user's "pattern" string, and + // __first1 + 1 is most likely not aligned, as the match is in the middle of + // the string. + if (_Traits::compare(__first1, __first2, __len2) == 0) + { + return __first1; + } + + ++__first1; + } +} + +template +inline _SizeT _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY +__str_find(const _CharT* __p, _SizeT __sz, const _CharT* __s, _SizeT __pos, _SizeT __n) noexcept +{ + if (__pos > __sz) + { + return __npos; + } + + if (__n == 0) // There is nothing to search, just return __pos. + { + return __pos; + } + + const _CharT* __r = __search_substring<_CharT, _Traits>(__p + __pos, __p + __sz, __s, __s + __n); + + if (__r == __p + __sz) + { + return __npos; + } + return static_cast<_SizeT>(__r - __p); +} + +// __str_rfind + +template +inline _SizeT _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY +__str_rfind(const _CharT* __p, _SizeT __sz, _CharT __c, _SizeT __pos) noexcept +{ + if (__sz < 1) + { + return __npos; + } + if (__pos < __sz) + { + ++__pos; + } + else + { + __pos = __sz; + } + for (const _CharT* __ps = __p + __pos; __ps != __p;) + { + if (_Traits::eq(*--__ps, __c)) + { + return static_cast<_SizeT>(__ps - __p); + } + } + return __npos; +} + +template +inline _SizeT _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY +__str_rfind(const _CharT* __p, _SizeT __sz, const _CharT* __s, _SizeT __pos, _SizeT __n) noexcept +{ + __pos = _CUDA_VSTD::min(__pos, __sz); + if (__n < __sz - __pos) + { + __pos += __n; + } + else + { + __pos = __sz; + } + const _CharT* __r = _CUDA_VSTD::__find_end( + __p, __p + __pos, __s, __s + __n, _Traits::eq, random_access_iterator_tag(), random_access_iterator_tag()); + if (__n > 0 && __r == __p + __pos) + { + return __npos; + } + return static_cast<_SizeT>(__r - __p); +} + +// __str_find_first_of +template +inline _SizeT _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY +__str_find_first_of(const _CharT* __p, _SizeT __sz, const _CharT* __s, _SizeT __pos, _SizeT __n) noexcept +{ + if (__pos >= __sz || __n == 0) + { + return __npos; + } + const _CharT* __r = _CUDA_VSTD::__find_first_of_ce(__p + __pos, __p + __sz, __s, __s + __n, _Traits::eq); + if (__r == __p + __sz) + { + return __npos; + } + return static_cast<_SizeT>(__r - __p); +} + +// __str_find_last_of +template +inline _SizeT _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY +__str_find_last_of(const _CharT* __p, _SizeT __sz, const _CharT* __s, _SizeT __pos, _SizeT __n) noexcept +{ + if (__n != 0) + { + if (__pos < __sz) + { + ++__pos; + } + else + { + __pos = __sz; + } + for (const _CharT* __ps = __p + __pos; __ps != __p;) + { + const _CharT* __r = _Traits::find(__s, __n, *--__ps); + if (__r) + { + return static_cast<_SizeT>(__ps - __p); + } + } + } + return __npos; +} + +// __str_find_first_not_of +template +inline _SizeT _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY +__str_find_first_not_of(const _CharT* __p, _SizeT __sz, const _CharT* __s, _SizeT __pos, _SizeT __n) noexcept +{ + if (__pos < __sz) + { + const 
_CharT* __pe = __p + __sz; + for (const _CharT* __ps = __p + __pos; __ps != __pe; ++__ps) + { + if (_Traits::find(__s, __n, *__ps) == 0) + { + return static_cast<_SizeT>(__ps - __p); + } + } + } + return __npos; +} + +template +inline _SizeT _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY +__str_find_first_not_of(const _CharT* __p, _SizeT __sz, _CharT __c, _SizeT __pos) noexcept +{ + if (__pos < __sz) + { + const _CharT* __pe = __p + __sz; + for (const _CharT* __ps = __p + __pos; __ps != __pe; ++__ps) + { + if (!_Traits::eq(*__ps, __c)) + { + return static_cast<_SizeT>(__ps - __p); + } + } + } + return __npos; +} + +// __str_find_last_not_of +template +inline _SizeT _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY +__str_find_last_not_of(const _CharT* __p, _SizeT __sz, const _CharT* __s, _SizeT __pos, _SizeT __n) noexcept +{ + if (__pos < __sz) + { + ++__pos; + } + else + { + __pos = __sz; + } + for (const _CharT* __ps = __p + __pos; __ps != __p;) + { + if (_Traits::find(__s, __n, *--__ps) == 0) + { + return static_cast<_SizeT>(__ps - __p); + } + } + return __npos; +} + +template +inline _SizeT _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY +__str_find_last_not_of(const _CharT* __p, _SizeT __sz, _CharT __c, _SizeT __pos) noexcept +{ + if (__pos < __sz) + { + ++__pos; + } + else + { + __pos = __sz; + } + for (const _CharT* __ps = __p + __pos; __ps != __p;) + { + if (!_Traits::eq(*--__ps, __c)) + { + return static_cast<_SizeT>(__ps - __p); + } + } + return __npos; +} + +#ifndef __cuda_std__ +template +inline _LIBCUDACXX_INLINE_VISIBILITY size_t __do_string_hash(_Ptr __p, _Ptr __e) +{ + typedef typename iterator_traits<_Ptr>::value_type value_type; + return __murmur2_or_cityhash()(__p, (__e - __p) * sizeof(value_type)); +} +#endif // !__cuda_std__ + +template > +struct __quoted_output_proxy +{ + _Iter __first; + _Iter __last; + _CharT __delim; + _CharT __escape; + + _LIBCUDACXX_INLINE_VISIBILITY __quoted_output_proxy(_Iter __f, _Iter __l, _CharT __d, _CharT __e) + : __first(__f) + , __last(__l) + , __delim(__d) + , __escape(__e) + {} + // This would be a nice place for a string_ref +}; + +_LIBCUDACXX_END_NAMESPACE_STD + +_CCCL_POP_MACROS + +#endif // _LIBCUDACXX___STRING diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/bitset b/libcudacxx/include/cuda/std/detail/libcxx/include/bitset deleted file mode 100644 index d61be09703..0000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/bitset +++ /dev/null @@ -1,1027 +0,0 @@ -// -*- C++ -*- -//===---------------------------- bitset ----------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCUDACXX_BITSET -#define _LIBCUDACXX_BITSET - -/* - bitset synopsis - -namespace std -{ - -namespace std { - -template -class bitset -{ -public: - // bit reference: - class reference - { - friend class bitset; - reference() noexcept; - public: - ~reference() noexcept; - reference& operator=(bool x) noexcept; // for b[i] = x; - reference& operator=(const reference&) noexcept; // for b[i] = b[j]; - bool operator~() const noexcept; // flips the bit - operator bool() const noexcept; // for x = b[i]; - reference& flip() noexcept; // for b[i].flip(); - }; - - // 23.3.5.1 constructors: - constexpr bitset() noexcept; - constexpr bitset(unsigned long long val) noexcept; - template - explicit bitset(const charT* str, - typename basic_string::size_type n = basic_string::npos, - charT zero = charT('0'), charT one = charT('1')); - template - explicit bitset(const basic_string& str, - typename basic_string::size_type pos = 0, - typename basic_string::size_type n = - basic_string::npos, - charT zero = charT('0'), charT one = charT('1')); - - // 23.3.5.2 bitset operations: - bitset& operator&=(const bitset& rhs) noexcept; - bitset& operator|=(const bitset& rhs) noexcept; - bitset& operator^=(const bitset& rhs) noexcept; - bitset& operator<<=(size_t pos) noexcept; - bitset& operator>>=(size_t pos) noexcept; - bitset& set() noexcept; - bitset& set(size_t pos, bool val = true); - bitset& reset() noexcept; - bitset& reset(size_t pos); - bitset operator~() const noexcept; - bitset& flip() noexcept; - bitset& flip(size_t pos); - - // element access: - constexpr bool operator[](size_t pos) const; // for b[i]; - reference operator[](size_t pos); // for b[i]; - unsigned long to_ulong() const; - unsigned long long to_ullong() const; - template - basic_string to_string(charT zero = charT('0'), charT one = charT('1')) const; - template - basic_string > to_string(charT zero = charT('0'), charT one = charT('1')) const; - template - basic_string, allocator > to_string(charT zero = charT('0'), charT one = -charT('1')) const; basic_string, allocator > to_string(char zero = '0', char one = '1') -const; size_t count() const noexcept; constexpr size_t size() const noexcept; bool operator==(const bitset& rhs) const -noexcept; bool operator!=(const bitset& rhs) const noexcept; bool test(size_t pos) const; bool all() const noexcept; - bool any() const noexcept; - bool none() const noexcept; - bitset operator<<(size_t pos) const noexcept; - bitset operator>>(size_t pos) const noexcept; -}; - -// 23.3.5.3 bitset operators: -template -bitset operator&(const bitset&, const bitset&) noexcept; - -template -bitset operator|(const bitset&, const bitset&) noexcept; - -template -bitset operator^(const bitset&, const bitset&) noexcept; - -template -basic_istream& -operator>>(basic_istream& is, bitset& x); - -template -basic_ostream& -operator<<(basic_ostream& os, const bitset& x); - -template struct hash>; - -} // std - -*/ - -#include <__bit_reference> -#include <__config> -#include <__functional_base> -#include -#include -#include -#include -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -_CCCL_POP_MACROS - -_LIBCUDACXX_BEGIN_NAMESPACE_STD - -template -class 
__bitset; - -template -struct __has_storage_type<__bitset<_N_words, _Size>> -{ - static const bool value = true; -}; - -template -class __bitset -{ -public: - typedef ptrdiff_t difference_type; - typedef size_t size_type; - typedef size_type __storage_type; - -protected: - typedef __bitset __self; - typedef __storage_type* __storage_pointer; - typedef const __storage_type* __const_storage_pointer; - static const unsigned __bits_per_word = static_cast(sizeof(__storage_type) * CHAR_BIT); - - friend class __bit_reference<__bitset>; - friend class __bit_const_reference<__bitset>; - friend class __bit_iterator<__bitset, false>; - friend class __bit_iterator<__bitset, true>; - friend struct __bit_array<__bitset>; - - __storage_type __first_[_N_words]; - - typedef __bit_reference<__bitset> reference; - typedef __bit_const_reference<__bitset> const_reference; - typedef __bit_iterator<__bitset, false> iterator; - typedef __bit_iterator<__bitset, true> const_iterator; - - _LIBCUDACXX_INLINE_VISIBILITY constexpr __bitset() noexcept; - _LIBCUDACXX_INLINE_VISIBILITY explicit constexpr __bitset(unsigned long long __v) noexcept; - - _LIBCUDACXX_INLINE_VISIBILITY reference __make_ref(size_t __pos) noexcept - { - return reference(__first_ + __pos / __bits_per_word, __storage_type(1) << __pos % __bits_per_word); - } - _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference __make_ref(size_t __pos) const noexcept - { - return const_reference(__first_ + __pos / __bits_per_word, __storage_type(1) << __pos % __bits_per_word); - } - _LIBCUDACXX_INLINE_VISIBILITY iterator __make_iter(size_t __pos) noexcept - { - return iterator(__first_ + __pos / __bits_per_word, __pos % __bits_per_word); - } - _LIBCUDACXX_INLINE_VISIBILITY const_iterator __make_iter(size_t __pos) const noexcept - { - return const_iterator(__first_ + __pos / __bits_per_word, __pos % __bits_per_word); - } - - _LIBCUDACXX_INLINE_VISIBILITY void operator&=(const __bitset& __v) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY void operator|=(const __bitset& __v) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY void operator^=(const __bitset& __v) noexcept; - - void flip() noexcept; - _LIBCUDACXX_INLINE_VISIBILITY unsigned long to_ulong() const - { - return to_ulong(integral_constant < bool, _Size()); - } - _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong() const - { - return to_ullong(integral_constant < bool, _Size()); - } - - bool all() const noexcept; - bool any() const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY size_t __hash_code() const noexcept; - -private: - unsigned long to_ulong(false_type) const; - _LIBCUDACXX_INLINE_VISIBILITY unsigned long to_ulong(true_type) const; - unsigned long long to_ullong(false_type) const; - _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong(true_type) const; - _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong(true_type, false_type) const; - unsigned long long to_ullong(true_type, true_type) const; -}; - -template -inline constexpr __bitset<_N_words, _Size>::__bitset() noexcept - : __first_{0} -{} - -template -inline constexpr __bitset<_N_words, _Size>::__bitset(unsigned long long __v) noexcept -#if __SIZEOF_SIZE_T__ == 8 - : __first_{__v} -#elif __SIZEOF_SIZE_T__ == 4 - : __first_{static_cast<__storage_type>(__v), - _Size >= 2 * __bits_per_word - ? 
static_cast<__storage_type>(__v >> __bits_per_word) - : static_cast<__storage_type>( - (__v >> __bits_per_word) & (__storage_type(1) << (_Size - __bits_per_word)) - 1)} -#else -# error This constructor has not been ported to this platform -#endif -{} - -template -inline void __bitset<_N_words, _Size>::operator&=(const __bitset& __v) noexcept -{ - for (size_type __i = 0; __i < _N_words; ++__i) - { - __first_[__i] &= __v.__first_[__i]; - } -} - -template -inline void __bitset<_N_words, _Size>::operator|=(const __bitset& __v) noexcept -{ - for (size_type __i = 0; __i < _N_words; ++__i) - { - __first_[__i] |= __v.__first_[__i]; - } -} - -template -inline void __bitset<_N_words, _Size>::operator^=(const __bitset& __v) noexcept -{ - for (size_type __i = 0; __i < _N_words; ++__i) - { - __first_[__i] ^= __v.__first_[__i]; - } -} - -template -void __bitset<_N_words, _Size>::flip() noexcept -{ - // do middle whole words - size_type __n = _Size; - __storage_pointer __p = __first_; - for (; __n >= __bits_per_word; ++__p, __n -= __bits_per_word) - { - *__p = ~*__p; - } - // do last partial word - if (__n > 0) - { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __storage_type __b = *__p & __m; - *__p &= ~__m; - *__p |= ~__b & __m; - } -} - -template -unsigned long __bitset<_N_words, _Size>::to_ulong(false_type) const -{ - const_iterator __e = __make_iter(_Size); - const_iterator __i = _CUDA_VSTD::find(__make_iter(sizeof(unsigned long) * CHAR_BIT), __e, true); - if (__i != __e) - { - __throw_overflow_error("bitset to_ulong overflow error"); - } - - return __first_[0]; -} - -template -inline unsigned long __bitset<_N_words, _Size>::to_ulong(true_type) const -{ - return __first_[0]; -} - -template -unsigned long long __bitset<_N_words, _Size>::to_ullong(false_type) const -{ - const_iterator __e = __make_iter(_Size); - const_iterator __i = _CUDA_VSTD::find(__make_iter(sizeof(unsigned long long) * CHAR_BIT), __e, true); - if (__i != __e) - { - __throw_overflow_error("bitset to_ullong overflow error"); - } - - return to_ullong(true_type()); -} - -template -inline unsigned long long __bitset<_N_words, _Size>::to_ullong(true_type) const -{ - return to_ullong(true_type(), integral_constant()); -} - -template -inline unsigned long long __bitset<_N_words, _Size>::to_ullong(true_type, false_type) const -{ - return __first_[0]; -} - -template -unsigned long long __bitset<_N_words, _Size>::to_ullong(true_type, true_type) const -{ - unsigned long long __r = __first_[0]; - for (std::size_t __i = 1; __i < sizeof(unsigned long long) / sizeof(__storage_type); ++__i) - { - __r |= static_cast(__first_[__i]) << (sizeof(__storage_type) * CHAR_BIT); - } - return __r; -} - -template -bool __bitset<_N_words, _Size>::all() const noexcept -{ - // do middle whole words - size_type __n = _Size; - __const_storage_pointer __p = __first_; - for (; __n >= __bits_per_word; ++__p, __n -= __bits_per_word) - { - if (~*__p) - { - return false; - } - } - // do last partial word - if (__n > 0) - { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - if (~*__p & __m) - { - return false; - } - } - return true; -} - -template -bool __bitset<_N_words, _Size>::any() const noexcept -{ - // do middle whole words - size_type __n = _Size; - __const_storage_pointer __p = __first_; - for (; __n >= __bits_per_word; ++__p, __n -= __bits_per_word) - { - if (*__p) - { - return true; - } - } - // do last partial word - if (__n > 0) - { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - if (*__p & 
__m) - { - return true; - } - } - return false; -} - -template -inline size_t __bitset<_N_words, _Size>::__hash_code() const noexcept -{ - size_t __h = 0; - for (size_type __i = 0; __i < _N_words; ++__i) - { - __h ^= __first_[__i]; - } - return __h; -} - -template -class __bitset<1, _Size> -{ -public: - typedef ptrdiff_t difference_type; - typedef size_t size_type; - typedef size_type __storage_type; - -protected: - typedef __bitset __self; - typedef __storage_type* __storage_pointer; - typedef const __storage_type* __const_storage_pointer; - static const unsigned __bits_per_word = static_cast(sizeof(__storage_type) * CHAR_BIT); - - friend class __bit_reference<__bitset>; - friend class __bit_const_reference<__bitset>; - friend class __bit_iterator<__bitset, false>; - friend class __bit_iterator<__bitset, true>; - friend struct __bit_array<__bitset>; - - __storage_type __first_; - - typedef __bit_reference<__bitset> reference; - typedef __bit_const_reference<__bitset> const_reference; - typedef __bit_iterator<__bitset, false> iterator; - typedef __bit_iterator<__bitset, true> const_iterator; - - _LIBCUDACXX_INLINE_VISIBILITY constexpr __bitset() noexcept; - _LIBCUDACXX_INLINE_VISIBILITY explicit constexpr __bitset(unsigned long long __v) noexcept; - - _LIBCUDACXX_INLINE_VISIBILITY reference __make_ref(size_t __pos) noexcept - { - return reference(&__first_, __storage_type(1) << __pos); - } - _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference __make_ref(size_t __pos) const noexcept - { - return const_reference(&__first_, __storage_type(1) << __pos); - } - _LIBCUDACXX_INLINE_VISIBILITY iterator __make_iter(size_t __pos) noexcept - { - return iterator(&__first_ + __pos / __bits_per_word, __pos % __bits_per_word); - } - _LIBCUDACXX_INLINE_VISIBILITY const_iterator __make_iter(size_t __pos) const noexcept - { - return const_iterator(&__first_ + __pos / __bits_per_word, __pos % __bits_per_word); - } - - _LIBCUDACXX_INLINE_VISIBILITY void operator&=(const __bitset& __v) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY void operator|=(const __bitset& __v) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY void operator^=(const __bitset& __v) noexcept; - - _LIBCUDACXX_INLINE_VISIBILITY void flip() noexcept; - - _LIBCUDACXX_INLINE_VISIBILITY unsigned long to_ulong() const; - _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong() const; - - _LIBCUDACXX_INLINE_VISIBILITY bool all() const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY bool any() const noexcept; - - _LIBCUDACXX_INLINE_VISIBILITY size_t __hash_code() const noexcept; -}; - -template -inline constexpr __bitset<1, _Size>::__bitset() noexcept - : __first_(0) -{} - -template -inline constexpr __bitset<1, _Size>::__bitset(unsigned long long __v) noexcept - : __first_(_Size == __bits_per_word ? 
static_cast<__storage_type>(__v) - : static_cast<__storage_type>(__v) & ((__storage_type(1) << _Size) - 1)) -{} - -template -inline void __bitset<1, _Size>::operator&=(const __bitset& __v) noexcept -{ - __first_ &= __v.__first_; -} - -template -inline void __bitset<1, _Size>::operator|=(const __bitset& __v) noexcept -{ - __first_ |= __v.__first_; -} - -template -inline void __bitset<1, _Size>::operator^=(const __bitset& __v) noexcept -{ - __first_ ^= __v.__first_; -} - -template -inline void __bitset<1, _Size>::flip() noexcept -{ - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); - __first_ = ~__first_; - __first_ &= __m; -} - -template -inline unsigned long __bitset<1, _Size>::to_ulong() const -{ - return __first_; -} - -template -inline unsigned long long __bitset<1, _Size>::to_ullong() const -{ - return __first_; -} - -template -inline bool __bitset<1, _Size>::all() const noexcept -{ - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); - return !(~__first_ & __m); -} - -template -inline bool __bitset<1, _Size>::any() const noexcept -{ - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); - return __first_ & __m; -} - -template -inline size_t __bitset<1, _Size>::__hash_code() const noexcept -{ - return __first_; -} - -template <> -class __bitset<0, 0> -{ -public: - typedef ptrdiff_t difference_type; - typedef size_t size_type; - typedef size_type __storage_type; - -protected: - typedef __bitset __self; - typedef __storage_type* __storage_pointer; - typedef const __storage_type* __const_storage_pointer; - static const unsigned __bits_per_word = static_cast(sizeof(__storage_type) * CHAR_BIT); - - friend class __bit_reference<__bitset>; - friend class __bit_const_reference<__bitset>; - friend class __bit_iterator<__bitset, false>; - friend class __bit_iterator<__bitset, true>; - friend struct __bit_array<__bitset>; - - typedef __bit_reference<__bitset> reference; - typedef __bit_const_reference<__bitset> const_reference; - typedef __bit_iterator<__bitset, false> iterator; - typedef __bit_iterator<__bitset, true> const_iterator; - - _LIBCUDACXX_INLINE_VISIBILITY constexpr __bitset() noexcept; - _LIBCUDACXX_INLINE_VISIBILITY explicit constexpr __bitset(unsigned long long) noexcept; - - _LIBCUDACXX_INLINE_VISIBILITY reference __make_ref(size_t) noexcept - { - return reference(0, 1); - } - _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference __make_ref(size_t) const noexcept - { - return const_reference(0, 1); - } - _LIBCUDACXX_INLINE_VISIBILITY iterator __make_iter(size_t) noexcept - { - return iterator(0, 0); - } - _LIBCUDACXX_INLINE_VISIBILITY const_iterator __make_iter(size_t) const noexcept - { - return const_iterator(0, 0); - } - - _LIBCUDACXX_INLINE_VISIBILITY void operator&=(const __bitset&) noexcept {} - _LIBCUDACXX_INLINE_VISIBILITY void operator|=(const __bitset&) noexcept {} - _LIBCUDACXX_INLINE_VISIBILITY void operator^=(const __bitset&) noexcept {} - - _LIBCUDACXX_INLINE_VISIBILITY void flip() noexcept {} - - _LIBCUDACXX_INLINE_VISIBILITY unsigned long to_ulong() const - { - return 0; - } - _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong() const - { - return 0; - } - - _LIBCUDACXX_INLINE_VISIBILITY bool all() const noexcept - { - return true; - } - _LIBCUDACXX_INLINE_VISIBILITY bool any() const noexcept - { - return false; - } - - _LIBCUDACXX_INLINE_VISIBILITY size_t __hash_code() const noexcept - { - return 0; - } -}; - -inline constexpr __bitset<0, 0>::__bitset() noexcept {} - -inline constexpr __bitset<0, 
0>::__bitset(unsigned long long) noexcept {} - -template -class _LIBCUDACXX_TEMPLATE_VIS bitset; -template -struct hash>; - -template -class _LIBCUDACXX_TEMPLATE_VIS bitset - : private __bitset<_Size == 0 ? 0 : (_Size - 1) / (sizeof(size_t) * CHAR_BIT) + 1, _Size> -{ -public: - static const unsigned __n_words = _Size == 0 ? 0 : (_Size - 1) / (sizeof(size_t) * CHAR_BIT) + 1; - typedef __bitset<__n_words, _Size> base; - -public: - typedef typename base::reference reference; - typedef typename base::const_reference const_reference; - - // 23.3.5.1 constructors: - _LIBCUDACXX_INLINE_VISIBILITY constexpr bitset() noexcept {} - _LIBCUDACXX_INLINE_VISIBILITY constexpr bitset(unsigned long long __v) noexcept - : base(__v) - {} - template ::value>> - explicit bitset(const _CharT* __str, - typename basic_string<_CharT>::size_type __n = basic_string<_CharT>::npos, - _CharT __zero = _CharT('0'), - _CharT __one = _CharT('1')); - template - explicit bitset(const basic_string<_CharT, _Traits, _Allocator>& __str, - typename basic_string<_CharT, _Traits, _Allocator>::size_type __pos = 0, - typename basic_string<_CharT, _Traits, _Allocator>::size_type __n = - (basic_string<_CharT, _Traits, _Allocator>::npos), - _CharT __zero = _CharT('0'), - _CharT __one = _CharT('1')); - - // 23.3.5.2 bitset operations: - _LIBCUDACXX_INLINE_VISIBILITY bitset& operator&=(const bitset& __rhs) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY bitset& operator|=(const bitset& __rhs) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY bitset& operator^=(const bitset& __rhs) noexcept; - bitset& operator<<=(size_t __pos) noexcept; - bitset& operator>>=(size_t __pos) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY bitset& set() noexcept; - bitset& set(size_t __pos, bool __val = true); - _LIBCUDACXX_INLINE_VISIBILITY bitset& reset() noexcept; - bitset& reset(size_t __pos); - _LIBCUDACXX_INLINE_VISIBILITY bitset operator~() const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY bitset& flip() noexcept; - bitset& flip(size_t __pos); - - // element access: - _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference operator[](size_t __p) const - { - return base::__make_ref(__p); - } - _LIBCUDACXX_INLINE_VISIBILITY reference operator[](size_t __p) - { - return base::__make_ref(__p); - } - _LIBCUDACXX_INLINE_VISIBILITY unsigned long to_ulong() const; - _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong() const; - template - basic_string<_CharT, _Traits, _Allocator> to_string(_CharT __zero = _CharT('0'), _CharT __one = _CharT('1')) const; - template - _LIBCUDACXX_INLINE_VISIBILITY basic_string<_CharT, _Traits, allocator<_CharT>> - to_string(_CharT __zero = _CharT('0'), _CharT __one = _CharT('1')) const; - template - _LIBCUDACXX_INLINE_VISIBILITY basic_string<_CharT, char_traits<_CharT>, allocator<_CharT>> - to_string(_CharT __zero = _CharT('0'), _CharT __one = _CharT('1')) const; - _LIBCUDACXX_INLINE_VISIBILITY basic_string, allocator> - to_string(char __zero = '0', char __one = '1') const; - _LIBCUDACXX_INLINE_VISIBILITY size_t count() const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY constexpr size_t size() const noexcept - { - return _Size; - } - _LIBCUDACXX_INLINE_VISIBILITY bool operator==(const bitset& __rhs) const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY bool operator!=(const bitset& __rhs) const noexcept; - bool test(size_t __pos) const; - _LIBCUDACXX_INLINE_VISIBILITY bool all() const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY bool any() const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY bool none() const noexcept - { - return !any(); - } - 
_LIBCUDACXX_INLINE_VISIBILITY bitset operator<<(size_t __pos) const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY bitset operator>>(size_t __pos) const noexcept; - -private: - _LIBCUDACXX_INLINE_VISIBILITY size_t __hash_code() const noexcept - { - return base::__hash_code(); - } - - friend struct hash; -}; - -template -template -bitset<_Size>::bitset(const _CharT* __str, typename basic_string<_CharT>::size_type __n, _CharT __zero, _CharT __one) -{ - size_t __rlen = _CUDA_VSTD::min(__n, char_traits<_CharT>::length(__str)); - for (size_t __i = 0; __i < __rlen; ++__i) - { - if (__str[__i] != __zero && __str[__i] != __one) - { - __throw_invalid_argument("bitset string ctor has invalid argument"); - } - } - - size_t _Mp = _CUDA_VSTD::min(__rlen, _Size); - size_t __i = 0; - for (; __i < _Mp; ++__i) - { - _CharT __c = __str[_Mp - 1 - __i]; - if (__c == __zero) - { - (*this)[__i] = false; - } - else - { - (*this)[__i] = true; - } - } - _CUDA_VSTD::fill(base::__make_iter(__i), base::__make_iter(_Size), false); -} - -template -template -bitset<_Size>::bitset( - const basic_string<_CharT, _Traits, _Allocator>& __str, - typename basic_string<_CharT, _Traits, _Allocator>::size_type __pos, - typename basic_string<_CharT, _Traits, _Allocator>::size_type __n, - _CharT __zero, - _CharT __one) -{ - if (__pos > __str.size()) - { - _CUDA_VSTD::__throw_out_of_range("bitset string pos out of range"); - } - - size_t __rlen = _CUDA_VSTD::min(__n, __str.size() - __pos); - for (size_t __i = __pos; __i < __pos + __rlen; ++__i) - { - if (!_Traits::eq(__str[__i], __zero) && !_Traits::eq(__str[__i], __one)) - { - __throw_invalid_argument("bitset string ctor has invalid argument"); - } - } - - size_t _Mp = _CUDA_VSTD::min(__rlen, _Size); - size_t __i = 0; - for (; __i < _Mp; ++__i) - { - _CharT __c = __str[__pos + _Mp - 1 - __i]; - if (_Traits::eq(__c, __zero)) - { - (*this)[__i] = false; - } - else - { - (*this)[__i] = true; - } - } - _CUDA_VSTD::fill(base::__make_iter(__i), base::__make_iter(_Size), false); -} - -template -inline bitset<_Size>& bitset<_Size>::operator&=(const bitset& __rhs) noexcept -{ - base::operator&=(__rhs); - return *this; -} - -template -inline bitset<_Size>& bitset<_Size>::operator|=(const bitset& __rhs) noexcept -{ - base::operator|=(__rhs); - return *this; -} - -template -inline bitset<_Size>& bitset<_Size>::operator^=(const bitset& __rhs) noexcept -{ - base::operator^=(__rhs); - return *this; -} - -template -bitset<_Size>& bitset<_Size>::operator<<=(size_t __pos) noexcept -{ - __pos = _CUDA_VSTD::min(__pos, _Size); - _CUDA_VSTD::copy_backward(base::__make_iter(0), base::__make_iter(_Size - __pos), base::__make_iter(_Size)); - _CUDA_VSTD::fill_n(base::__make_iter(0), __pos, false); - return *this; -} - -template -bitset<_Size>& bitset<_Size>::operator>>=(size_t __pos) noexcept -{ - __pos = _CUDA_VSTD::min(__pos, _Size); - _CUDA_VSTD::copy(base::__make_iter(__pos), base::__make_iter(_Size), base::__make_iter(0)); - _CUDA_VSTD::fill_n(base::__make_iter(_Size - __pos), __pos, false); - return *this; -} - -template -inline bitset<_Size>& bitset<_Size>::set() noexcept -{ - _CUDA_VSTD::fill_n(base::__make_iter(0), _Size, true); - return *this; -} - -template -bitset<_Size>& bitset<_Size>::set(size_t __pos, bool __val) -{ - if (__pos >= _Size) - { - _CUDA_VSTD::__throw_out_of_range("bitset set argument out of range"); - } - - (*this)[__pos] = __val; - return *this; -} - -template -inline bitset<_Size>& bitset<_Size>::reset() noexcept -{ - _CUDA_VSTD::fill_n(base::__make_iter(0), _Size, false); - return 
*this; -} - -template -bitset<_Size>& bitset<_Size>::reset(size_t __pos) -{ - if (__pos >= _Size) - { - _CUDA_VSTD::__throw_out_of_range("bitset reset argument out of range"); - } - - (*this)[__pos] = false; - return *this; -} - -template -inline bitset<_Size> bitset<_Size>::operator~() const noexcept -{ - bitset __x(*this); - __x.flip(); - return __x; -} - -template -inline bitset<_Size>& bitset<_Size>::flip() noexcept -{ - base::flip(); - return *this; -} - -template -bitset<_Size>& bitset<_Size>::flip(size_t __pos) -{ - if (__pos >= _Size) - { - _CUDA_VSTD::__throw_out_of_range("bitset flip argument out of range"); - } - - reference r = base::__make_ref(__pos); - r = ~r; - return *this; -} - -template -inline unsigned long bitset<_Size>::to_ulong() const -{ - return base::to_ulong(); -} - -template -inline unsigned long long bitset<_Size>::to_ullong() const -{ - return base::to_ullong(); -} - -template -template -basic_string<_CharT, _Traits, _Allocator> bitset<_Size>::to_string(_CharT __zero, _CharT __one) const -{ - basic_string<_CharT, _Traits, _Allocator> __r(_Size, __zero); - for (size_t __i = 0; __i < _Size; ++__i) - { - if ((*this)[__i]) - { - __r[_Size - 1 - __i] = __one; - } - } - return __r; -} - -template -template -inline basic_string<_CharT, _Traits, allocator<_CharT>> bitset<_Size>::to_string(_CharT __zero, _CharT __one) const -{ - return to_string<_CharT, _Traits, allocator<_CharT>>(__zero, __one); -} - -template -template -inline basic_string<_CharT, char_traits<_CharT>, allocator<_CharT>> -bitset<_Size>::to_string(_CharT __zero, _CharT __one) const -{ - return to_string<_CharT, char_traits<_CharT>, allocator<_CharT>>(__zero, __one); -} - -template -inline basic_string, allocator> bitset<_Size>::to_string(char __zero, char __one) const -{ - return to_string, allocator>(__zero, __one); -} - -template -inline size_t bitset<_Size>::count() const noexcept -{ - return static_cast(__count_bool_true(base::__make_iter(0), _Size)); -} - -template -inline bool bitset<_Size>::operator==(const bitset& __rhs) const noexcept -{ - return _CUDA_VSTD::equal(base::__make_iter(0), base::__make_iter(_Size), __rhs.__make_iter(0)); -} - -template -inline bool bitset<_Size>::operator!=(const bitset& __rhs) const noexcept -{ - return !(*this == __rhs); -} - -template -bool bitset<_Size>::test(size_t __pos) const -{ - if (__pos >= _Size) - { - _CUDA_VSTD::__throw_out_of_range("bitset test argument out of range"); - } - - return (*this)[__pos]; -} - -template -inline bool bitset<_Size>::all() const noexcept -{ - return base::all(); -} - -template -inline bool bitset<_Size>::any() const noexcept -{ - return base::any(); -} - -template -inline bitset<_Size> bitset<_Size>::operator<<(size_t __pos) const noexcept -{ - bitset __r = *this; - __r <<= __pos; - return __r; -} - -template -inline bitset<_Size> bitset<_Size>::operator>>(size_t __pos) const noexcept -{ - bitset __r = *this; - __r >>= __pos; - return __r; -} - -template -inline _LIBCUDACXX_INLINE_VISIBILITY bitset<_Size> operator&(const bitset<_Size>& __x, const bitset<_Size>& __y) noexcept -{ - bitset<_Size> __r = __x; - __r &= __y; - return __r; -} - -template -inline _LIBCUDACXX_INLINE_VISIBILITY bitset<_Size> operator|(const bitset<_Size>& __x, const bitset<_Size>& __y) noexcept -{ - bitset<_Size> __r = __x; - __r |= __y; - return __r; -} - -template -inline _LIBCUDACXX_INLINE_VISIBILITY bitset<_Size> operator^(const bitset<_Size>& __x, const bitset<_Size>& __y) noexcept -{ - bitset<_Size> __r = __x; - __r ^= __y; - return __r; -} - 
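An editorial aside on the three operators just above (not part of the patch): the non-member operator&, operator| and operator^ simply copy the left-hand bitset and forward to the member compound assignments, so the word-wise loops live in one place only. A minimal sketch of the same idiom on a hypothetical single-word mask type, assuming C++14:

#include <cassert>
#include <cstdint>

// Hypothetical 64-bit mask type, used only to illustrate the
// copy-then-compound-assign idiom from the bitset operators above.
struct Mask64
{
  std::uint64_t word;

  Mask64& operator&=(const Mask64& rhs) noexcept { word &= rhs.word; return *this; }
  Mask64& operator|=(const Mask64& rhs) noexcept { word |= rhs.word; return *this; }
  Mask64& operator^=(const Mask64& rhs) noexcept { word ^= rhs.word; return *this; }
};

// Non-member operators: copy the left operand, then reuse the member operator.
inline Mask64 operator&(const Mask64& x, const Mask64& y) noexcept { Mask64 r = x; r &= y; return r; }
inline Mask64 operator|(const Mask64& x, const Mask64& y) noexcept { Mask64 r = x; r |= y; return r; }
inline Mask64 operator^(const Mask64& x, const Mask64& y) noexcept { Mask64 r = x; r ^= y; return r; }

int main()
{
  Mask64 a{0b1100};
  Mask64 b{0b1010};
  assert((a & b).word == 0b1000);
  assert((a | b).word == 0b1110);
  assert((a ^ b).word == 0b0110);
  return 0;
}
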
-template -struct _LIBCUDACXX_TEMPLATE_VIS hash> : public __unary_function, size_t> -{ - _LIBCUDACXX_INLINE_VISIBILITY size_t operator()(const bitset<_Size>& __bs) const noexcept - { - return __bs.__hash_code(); - } -}; - -template -basic_istream<_CharT, _Traits>& operator>>(basic_istream<_CharT, _Traits>& __is, bitset<_Size>& __x); - -template -basic_ostream<_CharT, _Traits>& operator<<(basic_ostream<_CharT, _Traits>& __os, const bitset<_Size>& __x); - -_LIBCUDACXX_END_NAMESPACE_STD - -_CCCL_POP_MACROS - -#endif // _LIBCUDACXX_BITSET diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/cstddef b/libcudacxx/include/cuda/std/detail/libcxx/include/cstddef index 73d0f12b90..749931900f 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/cstddef +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/cstddef @@ -57,7 +57,7 @@ using ::ptrdiff_t; using ::size_t; #if defined(__CLANG_MAX_ALIGN_T_DEFINED) || defined(_GCC_MAX_ALIGN_T) || defined(__DEFINED_max_align_t) \ - || defined(__NetBSD__) + || defined(__NetBS) // Re-use the compiler's max_align_t where possible. using ::max_align_t; #else diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/char_ptr_ctor.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/char_ptr_ctor.pass.cpp new file mode 100644 index 0000000000..2e5364ab2c --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/char_ptr_ctor.pass.cpp @@ -0,0 +1,159 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// template +// explicit bitset(const charT* str, +// typename basic_string_view::size_type n = basic_string_view::npos, // +// s/string/string_view since C++26 charT zero = charT('0'), charT one = charT('1')); // constexpr +// since C++23 + +#include +#include +// #include // for 'min' and 'max' +// #include // for 'invalid_argument' + +#include "test_macros.h" + +// TEST_MSVC_DIAGNOSTIC_IGNORED(6294) // Ill-defined for-loop: initial condition does not satisfy test. Loop body not +// executed. 
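An editorial usage sketch of the constructor this new test exercises (not part of the patch; it assumes the <cuda/std/bitset> header introduced by this series): the string is read most-significant-bit first, so the rightmost character becomes bit 0, and any bits beyond the string length are left zero.

#include <cuda/std/bitset>

#include <cassert>

int main()
{
  // "1010": the last character maps to bit 0.
  cuda::std::bitset<8> v("1010");
  assert(v[0] == false && v[1] == true && v[2] == false && v[3] == true);
  assert(v[4] == false && v[7] == false); // bits past the string stay zero

  // Custom zero/one characters, mirroring the 'a'/'b' cases tested below.
  cuda::std::bitset<4> w("baba", 4, 'a', 'b');
  assert(w.to_ulong() == 10); // 0b1010
  return 0;
}
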
+_CCCL_NV_DIAG_SUPPRESS(186) + +#ifndef TEST_HAS_NO_EXCEPTIONS +template +void test_char_pointer_ctor_throw() +{ + try + { + cuda::std::bitset v("xxx1010101010xxxx"); + assert(false); + } + catch (std::invalid_argument&) + {} +} + +void test_exceptions() +{ + test_char_pointer_ctor_throw<0>(); + test_char_pointer_ctor_throw<1>(); + test_char_pointer_ctor_throw<31>(); + test_char_pointer_ctor_throw<32>(); + test_char_pointer_ctor_throw<33>(); + test_char_pointer_ctor_throw<63>(); + test_char_pointer_ctor_throw<64>(); + test_char_pointer_ctor_throw<65>(); + test_char_pointer_ctor_throw<1000>(); +} +#endif + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_char_pointer_ctor() +{ + static_assert(!cuda::std::is_convertible>::value, ""); + static_assert(cuda::std::is_constructible, const char*>::value, ""); + { + const char s[] = "1010101010"; + cuda::std::bitset v(s); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[M - 1 - i] == '1')); + } + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_ICC(186) + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + _CCCL_DIAG_POP + { + assert(v[i] == false); + } + } + } + { + const char s[] = "1010101010"; + cuda::std::bitset v(s, 10); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[M - 1 - i] == '1')); + } + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_ICC(186) + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + _CCCL_DIAG_POP + { + assert(v[i] == false); + } + } + } + { + const char s[] = "1a1a1a1a1a"; + cuda::std::bitset v(s, 10, 'a'); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[M - 1 - i] == '1')); + } + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_ICC(186) + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + _CCCL_DIAG_POP + { + assert(v[i] == false); + } + } + } + { + const char s[] = "bababababa"; + cuda::std::bitset v(s, 10, 'a', 'b'); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[M - 1 - i] == 'b')); + } + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_ICC(186) + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + _CCCL_DIAG_POP + { + assert(v[i] == false); + } + } + } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_char_pointer_ctor<0>(); + test_char_pointer_ctor<1>(); + test_char_pointer_ctor<31>(); + test_char_pointer_ctor<32>(); + test_char_pointer_ctor<33>(); + test_char_pointer_ctor<63>(); + test_char_pointer_ctor<64>(); + test_char_pointer_ctor<65>(); + test_char_pointer_ctor<1000>(); + + return true; +} + +int main(int, char**) +{ +#ifndef TEST_HAS_NO_EXCEPTIONS + NV_IF_TARGET(NV_IS_HOST, (test_exceptions();)) +#endif + + test(); +#if TEST_STD_VER >= 2014 + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/default.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/default.pass.cpp new file mode 100644 index 0000000000..8988d271c0 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/default.pass.cpp @@ -0,0 +1,64 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// default ctor + +#include +#include + +#include "test_macros.h" + +TEST_NV_DIAG_SUPPRESS(186) // pointless comparison of unsigned integer with zero +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_default_ctor() +{ + { + TEST_CONSTEXPR cuda::std::bitset v1; + assert(v1.size() == N); + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_ICC(186) + for (cuda::std::size_t i = 0; i < v1.size(); ++i) + { + _CCCL_DIAG_POP + { + assert(v1[i] == false); + } + } + } +#if TEST_STD_VER >= 11 + { + constexpr cuda::std::bitset v1; + static_assert(v1.size() == N, ""); + } +#endif +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_default_ctor<0>(); + test_default_ctor<1>(); + test_default_ctor<31>(); + test_default_ctor<32>(); + test_default_ctor<33>(); + test_default_ctor<63>(); + test_default_ctor<64>(); + test_default_ctor<65>(); + test_default_ctor<1000>(); + + return true; +} + +int main(int, char**) +{ + test(); +#if TEST_STD_VER >= 2014 + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/string_ctor.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/string_ctor.pass.cpp new file mode 100644 index 0000000000..4501345c57 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/string_ctor.pass.cpp @@ -0,0 +1,196 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset(string, pos, n, zero, one); // constexpr since C++23 + +#include + +#ifndef _LIBCUDACXX_HAS_STRING +int main(int, char**) +{ + return 0; +} +#else + +# include // for 'min' and 'max' +# include +# include +# include // for 'invalid_argument' +# include +# include + +# include "test_macros.h" + +template +TEST_CONSTEXPR_CXX14 void test_string_ctor() +{ +# ifndef TEST_HAS_NO_EXCEPTIONS + if (!TEST_IS_CONSTANT_EVALUATED) + { + try + { + cuda::std::string s("xxx1010101010xxxx"); + cuda::std::bitset v(s, s.size() + 1); + assert(false); + } + catch (cuda::std::out_of_range&) + {} + try + { + cuda::std::string s("xxx1010101010xxxx"); + cuda::std::bitset v(s, s.size() + 1, 10); + assert(false); + } + catch (cuda::std::out_of_range&) + {} + try + { + cuda::std::string s("xxx1010101010xxxx"); + cuda::std::bitset v(s); + assert(false); + } + catch (cuda::std::invalid_argument&) + {} + try + { + cuda::std::string s("xxx1010101010xxxx"); + cuda::std::bitset v(s, 2); + assert(false); + } + catch (cuda::std::invalid_argument&) + {} + try + { + cuda::std::string s("xxx1010101010xxxx"); + cuda::std::bitset v(s, 2, 10); + assert(false); + } + catch (cuda::std::invalid_argument&) + {} + try + { + cuda::std::string s("xxxbababababaxxxx"); + cuda::std::bitset v(s, 2, 10, 'a', 'b'); + assert(false); + } + catch (cuda::std::invalid_argument&) + {} + } +# endif // TEST_HAS_NO_EXCEPTIONS + + static_assert(!cuda::std::is_convertible>::value, ""); + static_assert(cuda::std::is_constructible, cuda::std::string>::value, ""); + { + cuda::std::string s("1010101010"); + cuda::std::bitset v(s); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[M - 1 - i] == '1')); + } + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + assert(v[i] == false); + } + } + { + cuda::std::string s("xxx1010101010"); + cuda::std::bitset v(s, 3); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[3 + M - 1 - i] == '1')); + } + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + assert(v[i] == false); + } + } + { + cuda::std::string s("xxx1010101010xxxx"); + cuda::std::bitset v(s, 3, 10); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[3 + M - 1 - i] == '1')); + } + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + assert(v[i] == false); + } + } + { + cuda::std::string s("xxx1a1a1a1a1axxxx"); + cuda::std::bitset v(s, 3, 10, 'a'); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[3 + M - 1 - i] == '1')); + } + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + assert(v[i] == false); + } + } + { + cuda::std::string s("xxxbababababaxxxx"); + cuda::std::bitset v(s, 3, 10, 'a', 'b'); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[3 + M - 1 - i] == 'b')); + } + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + assert(v[i] == false); + } + } +} + +struct Nonsense +{ + virtual ~Nonsense() {} +}; + +TEST_CONSTEXPR_CXX14 void test_for_non_eager_instantiation() +{ + // Ensure we don't accidentally instantiate `cuda::std::basic_string` + // since it may not be well formed and can cause an error in the + // non-immediate 
context. + static_assert(!cuda::std::is_constructible, Nonsense*>::value, ""); + static_assert( + !cuda::std::is_constructible, Nonsense*, cuda::std::size_t, Nonsense&, Nonsense&>::value, ""); +} + +TEST_CONSTEXPR_CXX14 bool test() +{ + test_string_ctor<0>(); + test_string_ctor<1>(); + test_string_ctor<31>(); + test_string_ctor<32>(); + test_string_ctor<33>(); + test_string_ctor<63>(); + test_string_ctor<64>(); + test_string_ctor<65>(); + test_string_ctor<1000>(); + test_for_non_eager_instantiation(); + + return true; +} + +int main(int, char**) +{ + test(); +# if TEST_STD_VER >= 2023 + static_assert(test(), ""); +# endif + + return 0; +} + +#endif diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/string_view_ctor.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/string_view_ctor.pass.cpp new file mode 100644 index 0000000000..060e1ead7a --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/string_view_ctor.pass.cpp @@ -0,0 +1,201 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 + +// template +// explicit bitset( +// const basic_string_view& str, +// typename basic_string_view::size_type pos = 0, +// typename basic_string_view::size_type n = basic_string_view::npos, +// charT zero = charT('0'), charT one = charT('1')); + +#include + +#ifndef _LIBCUDACXX_HAS_STRING_VIEW +int main(int, char**) +{ + return 0; +} +#else + +# include // for 'min' and 'max' +# include +# include +# include // for 'invalid_argument' +# include +# include + +# include "test_macros.h" + +template +constexpr void test_string_ctor() +{ +# ifndef TEST_HAS_NO_EXCEPTIONS + if (!TEST_IS_CONSTANT_EVALUATED) + { + try + { + cuda::std::string_view s("xxx1010101010xxxx"); + cuda::std::bitset v(s, s.size() + 1); + assert(false); + } + catch (cuda::std::out_of_range&) + {} + try + { + cuda::std::string_view s("xxx1010101010xxxx"); + cuda::std::bitset v(s, s.size() + 1, 10); + assert(false); + } + catch (cuda::std::out_of_range&) + {} + try + { + cuda::std::string_view s("xxx1010101010xxxx"); + cuda::std::bitset v(s); + assert(false); + } + catch (cuda::std::invalid_argument&) + {} + try + { + cuda::std::string_view s("xxx1010101010xxxx"); + cuda::std::bitset v(s, 2); + assert(false); + } + catch (cuda::std::invalid_argument&) + {} + try + { + cuda::std::string_view s("xxx1010101010xxxx"); + cuda::std::bitset v(s, 2, 10); + assert(false); + } + catch (cuda::std::invalid_argument&) + {} + try + { + cuda::std::string_view s("xxxbababababaxxxx"); + cuda::std::bitset v(s, 2, 10, 'a', 'b'); + assert(false); + } + catch (cuda::std::invalid_argument&) + {} + } +# endif // TEST_HAS_NO_EXCEPTIONS + + static_assert(!cuda::std::is_convertible_v>, ""); + static_assert(cuda::std::is_constructible_v, cuda::std::string_view>, ""); + { + cuda::std::string_view s("1010101010"); + cuda::std::bitset v(s); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[M - 1 - i] == '1')); + } + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + assert(v[i] == false); + } + } + { + cuda::std::string_view 
s("xxx1010101010"); + cuda::std::bitset v(s, 3); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[3 + M - 1 - i] == '1')); + } + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + assert(v[i] == false); + } + } + { + cuda::std::string_view s("xxx1010101010xxxx"); + cuda::std::bitset v(s, 3, 10); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[3 + M - 1 - i] == '1')); + } + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + assert(v[i] == false); + } + } + { + cuda::std::string_view s("xxx1a1a1a1a1axxxx"); + cuda::std::bitset v(s, 3, 10, 'a'); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[3 + M - 1 - i] == '1')); + } + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + assert(v[i] == false); + } + } + { + cuda::std::string_view s("xxxbababababaxxxx"); + cuda::std::bitset v(s, 3, 10, 'a', 'b'); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[3 + M - 1 - i] == 'b')); + } + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + assert(v[i] == false); + } + } +} + +struct Nonsense +{ + virtual ~Nonsense() {} +}; + +constexpr void test_for_non_eager_instantiation() +{ + // Ensure we don't accidentally instantiate `cuda::std::basic_string_view` + // since it may not be well formed and can cause an error in the + // non-immediate context. + static_assert(!cuda::std::is_constructible, Nonsense*>::value, ""); + static_assert( + !cuda::std::is_constructible, Nonsense*, cuda::std::size_t, Nonsense&, Nonsense&>::value, ""); +} + +constexpr bool test() +{ + test_string_ctor<0>(); + test_string_ctor<1>(); + test_string_ctor<31>(); + test_string_ctor<32>(); + test_string_ctor<33>(); + test_string_ctor<63>(); + test_string_ctor<64>(); + test_string_ctor<65>(); + test_string_ctor<1000>(); + test_for_non_eager_instantiation(); + + return true; +} + +int main(int, char**) +{ + test(); + static_assert(test(), ""); + + return 0; +} + +#endif diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/ull_ctor.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/ull_ctor.pass.cpp new file mode 100644 index 0000000000..cbe955b61d --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/ull_ctor.pass.cpp @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset(unsigned long long val); // constexpr since C++23 + +#include +#include +// #include // for 'min' and 'max' +#include + +#include "test_macros.h" + +// TEST_MSVC_DIAGNOSTIC_IGNORED(6294) // Ill-defined for-loop: initial condition does not satisfy test. Loop body not +// executed. 
+_CCCL_NV_DIAG_SUPPRESS(186)
+
+template <cuda::std::size_t N>
+__host__ __device__ TEST_CONSTEXPR_CXX14 void test_val_ctor()
+{
+  {
+    TEST_CONSTEXPR cuda::std::bitset<N> v(0xAAAAAAAAAAAAAAAAULL);
+    assert(v.size() == N);
+    cuda::std::size_t M = cuda::std::min<cuda::std::size_t>(v.size(), 64);
+    for (cuda::std::size_t i = 0; i < M; ++i)
+    {
+      assert(v[i] == ((i & 1) != 0));
+    }
+    _CCCL_DIAG_PUSH
+    _CCCL_DIAG_SUPPRESS_ICC(186)
+    for (cuda::std::size_t i = M; i < v.size(); ++i)
+    {
+      _CCCL_DIAG_POP
+      {
+        assert(v[i] == false);
+      }
+    }
+  }
+  {
+    constexpr cuda::std::bitset<N> v(0xAAAAAAAAAAAAAAAAULL);
+    static_assert(v.size() == N, "");
+  }
+}
+
+__host__ __device__ TEST_CONSTEXPR_CXX14 bool test()
+{
+  test_val_ctor<0>();
+  test_val_ctor<1>();
+  test_val_ctor<31>();
+  test_val_ctor<32>();
+  test_val_ctor<33>();
+  test_val_ctor<63>();
+  test_val_ctor<64>();
+  test_val_ctor<65>();
+  test_val_ctor<1000>();
+
+  return true;
+}
+
+int main(int, char**)
+{
+  test();
+#if TEST_STD_VER >= 2014
+  static_assert(test(), "");
+#endif // TEST_STD_VER >= 2014
+
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/all.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/all.pass.cpp
new file mode 100644
index 0000000000..76bc4de94d
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/all.pass.cpp
@@ -0,0 +1,55 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// bool all() const; // constexpr since C++23
+
+#include <cuda/std/bitset>
+#include <cuda/std/cassert>
+#include <cuda/std/cstddef>
+
+#include "test_macros.h"
+
+template <cuda::std::size_t N>
+__host__ __device__ TEST_CONSTEXPR_CXX14 void test_all()
+{
+  cuda::std::bitset<N> v;
+  v.reset();
+  assert(v.all() == (N == 0));
+  v.set();
+  assert(v.all() == true);
+  if (v.size() > 1)
+  {
+    v[N / 2] = false;
+    assert(v.all() == false);
+  }
+}
+
+__host__ __device__ TEST_CONSTEXPR_CXX14 bool test()
+{
+  test_all<0>();
+  test_all<1>();
+  test_all<31>();
+  test_all<32>();
+  test_all<33>();
+  test_all<63>();
+  test_all<64>();
+  test_all<65>();
+  test_all<1000>();
+
+  return true;
+}
+
+int main(int, char**)
+{
+  test();
+#if TEST_STD_VER >= 2014
+  static_assert(test(), "");
+#endif // TEST_STD_VER >= 2014
+
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/any.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/any.pass.cpp
new file mode 100644
index 0000000000..f4549ae6ac
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/any.pass.cpp
@@ -0,0 +1,58 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bool any() const; // constexpr since C++23 + +#include +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_any() +{ + cuda::std::bitset v; + v.reset(); + assert(v.any() == false); + v.set(); + assert(v.any() == (N != 0)); + if (v.size() > 1) + { + v[N / 2] = false; + assert(v.any() == true); + v.reset(); + v[N / 2] = true; + assert(v.any() == true); + } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_any<0>(); + test_any<1>(); + test_any<31>(); + test_any<32>(); + test_any<33>(); + test_any<63>(); + test_any<64>(); + test_any<65>(); + test_any<1000>(); + + return true; +} + +int main(int, char**) +{ + test(); +#if TEST_STD_VER >= 2014 + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/count.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/count.pass.cpp new file mode 100644 index 0000000000..233b40fa98 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/count.pass.cpp @@ -0,0 +1,69 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// size_t count() const; // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +_CCCL_NV_DIAG_SUPPRESS(186) + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_count() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c = 0; c != cases.size(); ++c) + { + const cuda::std::bitset v(cases[c]); + cuda::std::size_t c1 = v.count(); + cuda::std::size_t c2 = 0; + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_ICC(186) + for (cuda::std::size_t i = 0; i < v.size(); ++i) + { + _CCCL_DIAG_POP + { + if (v[i]) + { + ++c2; + } + } + } + assert(c1 == c2); + } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_count<0>(); + test_count<1>(); + test_count<31>(); + test_count<32>(); + test_count<33>(); + test_count<63>(); + test_count<64>(); + test_count<65>(); + + return true; +} + +int main(int, char**) +{ + test(); + test_count<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/flip_all.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/flip_all.pass.cpp new file mode 100644 index 0000000000..b9f9f2b897 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/flip_all.pass.cpp @@ -0,0 +1,65 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset& flip(); // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +_CCCL_NV_DIAG_SUPPRESS(186) + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_flip_all() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c = 0; c != cases.size(); ++c) + { + cuda::std::bitset v1(cases[c]); + cuda::std::bitset v2 = v1; + v2.flip(); + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_ICC(186) + for (cuda::std::size_t i = 0; i < v1.size(); ++i) + { + _CCCL_DIAG_POP + { + assert(v2[i] == ~v1[i]); + } + } + } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_flip_all<0>(); + test_flip_all<1>(); + test_flip_all<31>(); + test_flip_all<32>(); + test_flip_all<33>(); + test_flip_all<63>(); + test_flip_all<64>(); + test_flip_all<65>(); + + return true; +} + +int main(int, char**) +{ + test(); + test_flip_all<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/flip_one.out_of_range.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/flip_one.out_of_range.pass.cpp new file mode 100644 index 0000000000..31b2a5995b --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/flip_one.out_of_range.pass.cpp @@ -0,0 +1,52 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: libcpp-no-exceptions + +// bitset& flip(size_t pos); // constexpr since C++23 + +// Make sure we throw ::std::out_of_range when calling flip() on an OOB index. + +#include +#include + +int main(int, char**) +{ + NV_IF_TARGET( + NV_IS_HOST, + { + cuda::std::bitset<0> v; + try + { + v.flip(0); + assert(false); + } + catch (::std::out_of_range const&) + {} + } { + cuda::std::bitset<1> v("0"); + try + { + v.flip(2); + assert(false); + } + catch (::std::out_of_range const&) + {} + } { + cuda::std::bitset<10> v("0000000000"); + try + { + v.flip(10); + assert(false); + } + catch (::std::out_of_range const&) + {} + }) + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/flip_one.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/flip_one.pass.cpp new file mode 100644 index 0000000000..2d6ebd3eea --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/flip_one.pass.cpp @@ -0,0 +1,65 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset& flip(size_t pos); // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +_CCCL_NV_DIAG_SUPPRESS(186) + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_flip_one() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c = 0; c != cases.size(); ++c) + { + cuda::std::bitset v(cases[c]); + if (v.size() > 0) + { + cuda::std::size_t middle = v.size() / 2; + v.flip(middle); + bool b = v[middle]; + assert(v[middle] == b); + v.flip(middle); + assert(v[middle] != b); + v.flip(middle); + assert(v[middle] == b); + } + } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_flip_one<0>(); + test_flip_one<1>(); + test_flip_one<31>(); + test_flip_one<32>(); + test_flip_one<33>(); + test_flip_one<63>(); + test_flip_one<64>(); + test_flip_one<65>(); + + return true; +} + +int main(int, char**) +{ + test(); + test_flip_one<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/index.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/index.pass.cpp new file mode 100644 index 0000000000..a34fbe8e11 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/index.pass.cpp @@ -0,0 +1,81 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset::reference operator[](size_t pos); // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_index() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c = 0; c != cases.size(); ++c) + { + cuda::std::bitset v1(cases[c]); + if (v1.size() > 0) + { + assert(v1[N / 2] == v1.test(N / 2)); + typename cuda::std::bitset::reference r = v1[N / 2]; + assert(r == v1.test(N / 2)); + typename cuda::std::bitset::reference r2 = v1[N / 2]; + r = r2; + assert(r == v1.test(N / 2)); + r = false; + assert(r == false); + assert(v1.test(N / 2) == false); + r = true; + assert(r == true); + assert(v1.test(N / 2) == true); + bool b = ~r; + assert(r == true); + assert(v1.test(N / 2) == true); + assert(b == false); + r.flip(); + assert(r == false); + assert(v1.test(N / 2) == false); + } + ASSERT_SAME_TYPE(decltype(v1[0]), typename cuda::std::bitset::reference); + } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_index<0>(); + test_index<1>(); + test_index<31>(); + test_index<32>(); + test_index<33>(); + test_index<63>(); + test_index<64>(); + test_index<65>(); + + cuda::std::bitset<1> set; + set[0] = false; + auto b = set[0]; + set[0] = true; + assert(b); + + return true; +} + +int main(int, char**) +{ + test(); + test_index<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/index_const.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/index_const.pass.cpp new file mode 100644 index 0000000000..eacfa4e54b --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/index_const.pass.cpp @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// constexpr bool operator[](size_t pos) const; // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_index_const() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c = 0; c != cases.size(); ++c) + { + cuda::std::bitset const v(cases[c]); + if (v.size() > 0) + { + assert(v[N / 2] == v.test(N / 2)); + } +#if !defined(_LIBCUDACXX_VERSION) || defined(_LIBCUDACXX_ABI_BITSET_span_BOOL_CONST_SUBSCRIPT_RETURN_BOOL) + ASSERT_SAME_TYPE(decltype(v[0]), bool); +#else + ASSERT_SAME_TYPE(decltype(v[0]), typename cuda::std::bitset::const_reference); +#endif + } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_index_const<0>(); + test_index_const<1>(); + test_index_const<31>(); + test_index_const<32>(); + test_index_const<33>(); + test_index_const<63>(); + test_index_const<64>(); + test_index_const<65>(); + + cuda::std::bitset<1> set_; + set_[0] = false; + const auto& set = set_; + auto b = set[0]; + set_[0] = true; +#if !defined(_LIBCUDACXX_VERSION) || defined(_LIBCUDACXX_ABI_BITSET_span_BOOL_CONST_SUBSCRIPT_RETURN_BOOL) + assert(!b); +#else + assert(b); +#endif + + return true; +} + +int main(int, char**) +{ + test(); + test_index_const<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/left_shift.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/left_shift.pass.cpp new file mode 100644 index 0000000000..ceecfab0a1 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/left_shift.pass.cpp @@ -0,0 +1,71 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// The CI "Apple back-deployment with assertions enabled" needs a higher value +// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=12712420 + +// bitset operator<<(size_t pos) const; // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +template (-1)> +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test_left_shift() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + if (Start == 9) + { + assert(End >= cases.size()); + } + for (cuda::std::size_t c = Start; c != cases.size() && c != End; ++c) + { + for (cuda::std::size_t s = 0; s <= N + 1; ++s) + { + cuda::std::bitset v1(cases[c]); + cuda::std::bitset v2 = v1; + assert((v1 <<= s) == (v2 << s)); + } + } + + return true; +} + +int main(int, char**) +{ + test_left_shift<0>(); + test_left_shift<1>(); + test_left_shift<31>(); + test_left_shift<32>(); + test_left_shift<33>(); + test_left_shift<63>(); + test_left_shift<64>(); + test_left_shift<65>(); + test_left_shift<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test_left_shift<0>(), ""); + static_assert(test_left_shift<1>(), ""); + static_assert(test_left_shift<31>(), ""); + static_assert(test_left_shift<32>(), ""); + static_assert(test_left_shift<33>(), ""); + static_assert(test_left_shift<63, 0, 6>(), ""); + static_assert(test_left_shift<63, 6>(), ""); + static_assert(test_left_shift<64, 0, 6>(), ""); + static_assert(test_left_shift<64, 6>(), ""); + static_assert(test_left_shift<65, 0, 3>(), ""); + static_assert(test_left_shift<65, 3, 6>(), ""); + static_assert(test_left_shift<65, 6, 9>(), ""); + static_assert(test_left_shift<65, 9>(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/left_shift_eq.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/left_shift_eq.pass.cpp new file mode 100644 index 0000000000..8572c1376c --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/left_shift_eq.pass.cpp @@ -0,0 +1,89 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset& operator<<=(size_t pos); // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +_CCCL_NV_DIAG_SUPPRESS(186) + +template (-1)> +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test_left_shift() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + if (Start >= 9) + { + assert(End >= cases.size()); + } + for (cuda::std::size_t c = Start; c != cases.size() && c != End; ++c) + { + for (cuda::std::size_t s = 0; s <= N + 1; ++s) + { + cuda::std::bitset v1(cases[c]); + cuda::std::bitset v2 = v1; + v1 <<= s; + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_ICC(186) + for (cuda::std::size_t i = 0; i < v1.size(); ++i) + { + _CCCL_DIAG_POP + { + if (i < s) + { + assert(v1[i] == 0); + } + else + { + assert(v1[i] == v2[i - s]); + } + } + } + } + } + return true; +} + +int main(int, char**) +{ + test_left_shift<0>(); + test_left_shift<1>(); + test_left_shift<31>(); + test_left_shift<32>(); + test_left_shift<33>(); + test_left_shift<63>(); + test_left_shift<64>(); + test_left_shift<65>(); + test_left_shift<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test_left_shift<0>(), ""); + static_assert(test_left_shift<1>(), ""); + static_assert(test_left_shift<31>(), ""); + static_assert(test_left_shift<32>(), ""); + static_assert(test_left_shift<33>(), ""); + static_assert(test_left_shift<63, 0, 3>(), ""); + static_assert(test_left_shift<63, 3, 6>(), ""); + static_assert(test_left_shift<63, 6, 9>(), ""); + static_assert(test_left_shift<63, 9>(), ""); + static_assert(test_left_shift<64, 0, 3>(), ""); + static_assert(test_left_shift<64, 3, 6>(), ""); + static_assert(test_left_shift<64, 6, 9>(), ""); + static_assert(test_left_shift<64, 9>(), ""); + static_assert(test_left_shift<65, 0, 3>(), ""); + static_assert(test_left_shift<65, 3, 6>(), ""); + static_assert(test_left_shift<65, 6, 9>(), ""); + static_assert(test_left_shift<65, 9>(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/none.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/none.pass.cpp new file mode 100644 index 0000000000..4de9bf340e --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/none.pass.cpp @@ -0,0 +1,58 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bool none() const; // constexpr since C++23 + +#include +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_none() +{ + cuda::std::bitset v; + v.reset(); + assert(v.none() == true); + v.set(); + assert(v.none() == (N == 0)); + if (v.size() > 1) + { + v[N / 2] = false; + assert(v.none() == false); + v.reset(); + v[N / 2] = true; + assert(v.none() == false); + } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_none<0>(); + test_none<1>(); + test_none<31>(); + test_none<32>(); + test_none<33>(); + test_none<63>(); + test_none<64>(); + test_none<65>(); + test_none<1000>(); + + return true; +} + +int main(int, char**) +{ + test(); +#if TEST_STD_VER >= 2014 + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/not_all.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/not_all.pass.cpp new file mode 100644 index 0000000000..5a268391a2 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/not_all.pass.cpp @@ -0,0 +1,64 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset operator~() const; // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +_CCCL_NV_DIAG_SUPPRESS(186) + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_not_all() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c = 0; c != cases.size(); ++c) + { + cuda::std::bitset v1(cases[c]); + cuda::std::bitset v2 = ~v1; + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_ICC(186) + for (cuda::std::size_t i = 0; i < v1.size(); ++i) + { + _CCCL_DIAG_POP + { + assert(v2[i] == ~v1[i]); + } + } + } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_not_all<0>(); + test_not_all<1>(); + test_not_all<31>(); + test_not_all<32>(); + test_not_all<33>(); + test_not_all<63>(); + test_not_all<64>(); + test_not_all<65>(); + + return true; +} + +int main(int, char**) +{ + test(); + test_not_all<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_and_eq.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_and_eq.pass.cpp new file mode 100644 index 0000000000..edb4f4512a --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_and_eq.pass.cpp @@ -0,0 +1,71 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset& operator&=(const bitset& rhs); // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +_CCCL_NV_DIAG_SUPPRESS(186) + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test_op_and_eq() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c1 = 0; c1 != cases.size(); ++c1) + { + for (cuda::std::size_t c2 = 0; c2 != cases.size(); ++c2) + { + cuda::std::bitset v1(cases[c1]); + cuda::std::bitset v2(cases[c2]); + cuda::std::bitset v3 = v1; + v1 &= v2; + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_ICC(186) + for (cuda::std::size_t i = 0; i < v1.size(); ++i) + { + _CCCL_DIAG_POP + { + assert(v1[i] == (v3[i] && v2[i])); + } + } + } + } + + return true; +} + +int main(int, char**) +{ + test_op_and_eq<0>(); + test_op_and_eq<1>(); + test_op_and_eq<31>(); + test_op_and_eq<32>(); + test_op_and_eq<33>(); + test_op_and_eq<63>(); + test_op_and_eq<64>(); + test_op_and_eq<65>(); + test_op_and_eq<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test_op_and_eq<0>(), ""); + static_assert(test_op_and_eq<1>(), ""); + static_assert(test_op_and_eq<31>(), ""); + static_assert(test_op_and_eq<32>(), ""); + static_assert(test_op_and_eq<33>(), ""); + static_assert(test_op_and_eq<63>(), ""); + static_assert(test_op_and_eq<64>(), ""); + static_assert(test_op_and_eq<65>(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_eq_eq.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_eq_eq.pass.cpp new file mode 100644 index 0000000000..4eb828b040 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_eq_eq.pass.cpp @@ -0,0 +1,62 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// test: + +// bool operator==(const bitset& rhs) const; // constexpr since C++23 +// bool operator!=(const bitset& rhs) const; // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_equality() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c = 0; c != cases.size(); ++c) + { + cuda::std::bitset const v1(cases[c]); + cuda::std::bitset v2 = v1; + assert(v1 == v2); + if (v1.size() > 0) + { + v2[N / 2].flip(); + assert(v1 != v2); + } + } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_equality<0>(); + test_equality<1>(); + test_equality<31>(); + test_equality<32>(); + test_equality<33>(); + test_equality<63>(); + test_equality<64>(); + test_equality<65>(); + + return true; +} + +int main(int, char**) +{ + test(); + test_equality<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_or_eq.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_or_eq.pass.cpp new file mode 100644 index 0000000000..995aed4c7a --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_or_eq.pass.cpp @@ -0,0 +1,78 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=15000000 + +// bitset& operator|=(const bitset& rhs); // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +_CCCL_NV_DIAG_SUPPRESS(186) + +template (-1)> +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test_op_or_eq() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + if (Start != 0) + { + assert(End >= cases.size()); + } + for (cuda::std::size_t c1 = Start; c1 != cases.size() && c1 != End; ++c1) + { + for (cuda::std::size_t c2 = 0; c2 != cases.size(); ++c2) + { + cuda::std::bitset v1(cases[c1]); + cuda::std::bitset v2(cases[c2]); + cuda::std::bitset v3 = v1; + v1 |= v2; + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_ICC(186) + for (cuda::std::size_t i = 0; i < v1.size(); ++i) + { + _CCCL_DIAG_POP + { + assert(v1[i] == (v3[i] || v2[i])); + } + } + } + } + + return true; +} + +int main(int, char**) +{ + test_op_or_eq<0>(); + test_op_or_eq<1>(); + test_op_or_eq<31>(); + test_op_or_eq<32>(); + test_op_or_eq<33>(); + test_op_or_eq<63>(); + test_op_or_eq<64>(); + test_op_or_eq<65>(); + test_op_or_eq<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test_op_or_eq<0>(), ""); + static_assert(test_op_or_eq<1>(), ""); + static_assert(test_op_or_eq<31>(), ""); + static_assert(test_op_or_eq<32>(), ""); + static_assert(test_op_or_eq<33>(), ""); + static_assert(test_op_or_eq<63>(), ""); + static_assert(test_op_or_eq<64>(), ""); + static_assert(test_op_or_eq<65, 0, 6>(), ""); + static_assert(test_op_or_eq<65, 6>(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_xor_eq.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_xor_eq.pass.cpp new file mode 100644 index 0000000000..5db92124e8 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_xor_eq.pass.cpp @@ -0,0 +1,77 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset& operator^=(const bitset& rhs); // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +_CCCL_NV_DIAG_SUPPRESS(186) + +template (-1)> +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test_op_xor_eq() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + if (Start >= 9) + { + assert(End >= cases.size()); + } + for (cuda::std::size_t c1 = Start; c1 != cases.size() && c1 != End; ++c1) + { + for (cuda::std::size_t c2 = 0; c2 != cases.size(); ++c2) + { + cuda::std::bitset v1(cases[c1]); + cuda::std::bitset v2(cases[c2]); + cuda::std::bitset v3 = v1; + v1 ^= v2; + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_ICC(186) + for (cuda::std::size_t i = 0; i < v1.size(); ++i) + { + _CCCL_DIAG_POP + { + assert(v1[i] == (v3[i] != v2[i])); + } + } + } + } + return true; +} + +int main(int, char**) +{ + test_op_xor_eq<0>(); + test_op_xor_eq<1>(); + test_op_xor_eq<31>(); + test_op_xor_eq<32>(); + test_op_xor_eq<33>(); + test_op_xor_eq<63>(); + test_op_xor_eq<64>(); + test_op_xor_eq<65>(); + test_op_xor_eq<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test_op_xor_eq<0>(), ""); + static_assert(test_op_xor_eq<1>(), ""); + static_assert(test_op_xor_eq<31>(), ""); + static_assert(test_op_xor_eq<32>(), ""); + static_assert(test_op_xor_eq<33>(), ""); + static_assert(test_op_xor_eq<63, 0, 6>(), ""); + static_assert(test_op_xor_eq<63, 6>(), ""); + static_assert(test_op_xor_eq<64, 0, 6>(), ""); + static_assert(test_op_xor_eq<64, 6>(), ""); + static_assert(test_op_xor_eq<65, 0, 6>(), ""); + static_assert(test_op_xor_eq<65, 6>(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/reset_all.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/reset_all.pass.cpp new file mode 100644 index 0000000000..e7f3ba1fed --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/reset_all.pass.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// bitset& reset(); // constexpr since C++23
+
+#include <cuda/std/bitset>
+#include <cuda/std/cassert>
+#include <cuda/std/cstddef>
+
+#include "test_macros.h"
+
+_CCCL_NV_DIAG_SUPPRESS(186)
+
+template <cuda::std::size_t N>
+__host__ __device__ TEST_CONSTEXPR_CXX14 void test_reset_all()
+{
+  cuda::std::bitset<N> v;
+  v.set();
+  v.reset();
+  _CCCL_DIAG_PUSH
+  _CCCL_DIAG_SUPPRESS_ICC(186)
+  for (cuda::std::size_t i = 0; i < v.size(); ++i)
+  {
+    _CCCL_DIAG_POP
+    {
+      assert(!v[i]);
+    }
+  }
+}
+
+__host__ __device__ TEST_CONSTEXPR_CXX14 bool test()
+{
+  test_reset_all<0>();
+  test_reset_all<1>();
+  test_reset_all<31>();
+  test_reset_all<32>();
+  test_reset_all<33>();
+  test_reset_all<63>();
+  test_reset_all<64>();
+  test_reset_all<65>();
+  test_reset_all<1000>();
+
+  return true;
+}
+
+int main(int, char**)
+{
+  test();
+#if TEST_STD_VER >= 2014
+  static_assert(test(), "");
+#endif // TEST_STD_VER >= 2014
+
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/reset_one.out_of_range.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/reset_one.out_of_range.pass.cpp
new file mode 100644
index 0000000000..787dedc2d4
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/reset_one.out_of_range.pass.cpp
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: libcpp-no-exceptions
+
+// bitset& reset(size_t pos); // constexpr since C++23
+
+// Make sure we throw ::std::out_of_range when calling reset() on an OOB index.
+
+#include <cuda/std/bitset>
+#include <cuda/std/cassert>
+
+int main(int, char**)
+{
+  NV_IF_TARGET(
+    NV_IS_HOST,
+    {
+      cuda::std::bitset<0> v;
+      try
+      {
+        v.reset(0);
+        assert(false);
+      }
+      catch (::std::out_of_range const&)
+      {}
+    } {
+      cuda::std::bitset<1> v("0");
+      try
+      {
+        v.reset(2);
+        assert(false);
+      }
+      catch (::std::out_of_range const&)
+      {}
+    } {
+      cuda::std::bitset<10> v("0000000000");
+      try
+      {
+        v.reset(10);
+        assert(false);
+      }
+      catch (::std::out_of_range const&)
+      {}
+    })
+
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/reset_one.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/reset_one.pass.cpp
new file mode 100644
index 0000000000..f5f8ff8838
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/reset_one.pass.cpp
@@ -0,0 +1,75 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// bitset& reset(size_t pos); // constexpr since C++23
+
+#include <cuda/std/bitset>
+#include <cuda/std/cassert>
+#include <cuda/std/cstddef>
+
+#include "../bitset_test_cases.h"
+#include "test_macros.h"
+
+// TEST_MSVC_DIAGNOSTIC_IGNORED(6294) // Ill-defined for-loop: initial condition does not satisfy test. Loop body not
+// executed.
+ +_CCCL_NV_DIAG_SUPPRESS(186) + +template (-1)> +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test_reset_one() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + if (Start >= 9) + { + assert(End >= cases.size()); + } + for (cuda::std::size_t c = Start; c != cases.size() && c != End; ++c) + { + for (cuda::std::size_t i = 0; i != N; ++i) + { + cuda::std::bitset v(cases[c]); + v.reset(i); + assert(v[i] == false); + } + } + + return true; +} + +int main(int, char**) +{ + test_reset_one<0>(); + test_reset_one<1>(); + test_reset_one<31>(); + test_reset_one<32>(); + test_reset_one<33>(); + test_reset_one<63>(); + test_reset_one<64>(); + test_reset_one<65>(); + test_reset_one<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test_reset_one<0>(), ""); + static_assert(test_reset_one<1>(), ""); + static_assert(test_reset_one<31>(), ""); + static_assert(test_reset_one<32>(), ""); + static_assert(test_reset_one<33>(), ""); + static_assert(test_reset_one<63, 0, 6>(), ""); + static_assert(test_reset_one<63, 6>(), ""); + static_assert(test_reset_one<64, 0, 3>(), ""); + static_assert(test_reset_one<64, 3, 6>(), ""); + static_assert(test_reset_one<64, 6, 9>(), ""); + static_assert(test_reset_one<64, 9>(), ""); + static_assert(test_reset_one<65, 0, 3>(), ""); + static_assert(test_reset_one<65, 3, 6>(), ""); + static_assert(test_reset_one<65, 6, 9>(), ""); + static_assert(test_reset_one<65, 9>(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/right_shift.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/right_shift.pass.cpp new file mode 100644 index 0000000000..5a4f351c7b --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/right_shift.pass.cpp @@ -0,0 +1,69 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset operator>>(size_t pos) const; // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +_CCCL_NV_DIAG_SUPPRESS(186) + +template (-1)> +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test_right_shift() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + if (Start >= 9) + { + assert(End >= cases.size()); + } + for (cuda::std::size_t c = Start; c != cases.size() && c != End; ++c) + { + for (cuda::std::size_t s = 0; s <= N + 1; ++s) + { + cuda::std::bitset v1(cases[c]); + cuda::std::bitset v2 = v1; + assert((v1 >>= s) == (v2 >> s)); + } + } + return true; +} + +__host__ __device__ int main(int, char**) +{ + test_right_shift<0>(); + test_right_shift<1>(); + test_right_shift<31>(); + test_right_shift<32>(); + test_right_shift<33>(); + test_right_shift<63>(); + test_right_shift<64>(); + test_right_shift<65>(); + test_right_shift<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test_right_shift<0>(), ""); + static_assert(test_right_shift<1>(), ""); + static_assert(test_right_shift<31>(), ""); + static_assert(test_right_shift<32>(), ""); + static_assert(test_right_shift<33>(), ""); + static_assert(test_right_shift<63, 0, 6>(), ""); + static_assert(test_right_shift<63, 6>(), ""); + static_assert(test_right_shift<64, 0, 6>(), ""); + static_assert(test_right_shift<64, 6>(), ""); + static_assert(test_right_shift<65, 0, 3>(), ""); + static_assert(test_right_shift<65, 3, 6>(), ""); + static_assert(test_right_shift<65, 6, 9>(), ""); + static_assert(test_right_shift<65, 9>(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/right_shift_eq.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/right_shift_eq.pass.cpp new file mode 100644 index 0000000000..a4e8327b70 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/right_shift_eq.pass.cpp @@ -0,0 +1,91 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=15000000 + +// bitset& operator<<=(size_t pos); // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +_CCCL_NV_DIAG_SUPPRESS(186) + +template (-1)> +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test_right_shift() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + if (Start >= 9) + { + assert(End >= cases.size()); + } + for (cuda::std::size_t c = Start; c != cases.size() && c != End; ++c) + { + for (cuda::std::size_t s = 0; s <= N + 1; ++s) + { + cuda::std::bitset v1(cases[c]); + cuda::std::bitset v2 = v1; + v1 >>= s; + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_ICC(186) + for (cuda::std::size_t i = 0; i < v1.size(); ++i) + { + if (i + s < v1.size()) + { + _CCCL_DIAG_POP + { + assert(v1[i] == v2[i + s]); + } + } + else + { + assert(v1[i] == 0); + } + } + } + } + return true; +} + +__host__ __device__ int main(int, char**) +{ + test_right_shift<0>(); + test_right_shift<1>(); + test_right_shift<31>(); + test_right_shift<32>(); + test_right_shift<33>(); + test_right_shift<63>(); + test_right_shift<64>(); + test_right_shift<65>(); + test_right_shift<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test_right_shift<0>(), ""); + static_assert(test_right_shift<1>(), ""); + static_assert(test_right_shift<31>(), ""); + static_assert(test_right_shift<32>(), ""); + static_assert(test_right_shift<33>(), ""); + static_assert(test_right_shift<63, 0, 3>(), ""); + static_assert(test_right_shift<63, 3, 6>(), ""); + static_assert(test_right_shift<63, 6, 9>(), ""); + static_assert(test_right_shift<63, 9>(), ""); + static_assert(test_right_shift<64, 0, 3>(), ""); + static_assert(test_right_shift<64, 3, 6>(), ""); + static_assert(test_right_shift<64, 6, 9>(), ""); + static_assert(test_right_shift<64, 9>(), ""); + static_assert(test_right_shift<65, 0, 3>(), ""); + static_assert(test_right_shift<65, 3, 6>(), ""); + static_assert(test_right_shift<65, 6, 9>(), ""); + static_assert(test_right_shift<65, 9>(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/set_all.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/set_all.pass.cpp new file mode 100644 index 0000000000..f08ff34b5b --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/set_all.pass.cpp @@ -0,0 +1,58 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// bitset& set(); // constexpr since C++23
+
+#include <cuda/std/bitset>
+#include <cuda/std/cassert>
+#include <cuda/std/cstddef>
+
+#include "test_macros.h"
+
+_CCCL_NV_DIAG_SUPPRESS(186)
+
+template <cuda::std::size_t N>
+__host__ __device__ TEST_CONSTEXPR_CXX14 void test_set_all()
+{
+  cuda::std::bitset<N> v;
+  v.set();
+  _CCCL_DIAG_PUSH
+  _CCCL_DIAG_SUPPRESS_ICC(186)
+  for (cuda::std::size_t i = 0; i < v.size(); ++i)
+  {
+    _CCCL_DIAG_POP
+    {
+      assert(v[i]);
+    }
+  }
+}
+
+__host__ __device__ TEST_CONSTEXPR_CXX14 bool test()
+{
+  test_set_all<0>();
+  test_set_all<1>();
+  test_set_all<31>();
+  test_set_all<32>();
+  test_set_all<33>();
+  test_set_all<63>();
+  test_set_all<64>();
+  test_set_all<65>();
+  test_set_all<1000>();
+
+  return true;
+}
+
+int main(int, char**)
+{
+  test();
+#if TEST_STD_VER >= 2014
+  static_assert(test(), "");
+#endif // TEST_STD_VER >= 2014
+
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/set_one.out_of_range.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/set_one.out_of_range.pass.cpp
new file mode 100644
index 0000000000..810f9210d8
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/set_one.out_of_range.pass.cpp
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: libcpp-no-exceptions
+
+// bitset& set(size_t pos, bool val = true); // constexpr since C++23
+
+// Make sure we throw ::std::out_of_range when calling set() on an OOB index.
+
+#include <cuda/std/bitset>
+#include <cuda/std/cassert>
+
+int main(int, char**)
+{
+  NV_IF_TARGET(
+    NV_IS_HOST,
+    {
+      cuda::std::bitset<0> v;
+      try
+      {
+        v.set(0);
+        assert(false);
+      }
+      catch (::std::out_of_range const&)
+      {}
+    } {
+      cuda::std::bitset<1> v("0");
+      try
+      {
+        v.set(2);
+        assert(false);
+      }
+      catch (::std::out_of_range const&)
+      {}
+    } {
+      cuda::std::bitset<10> v("0000000000");
+      try
+      {
+        v.set(10);
+        assert(false);
+      }
+      catch (::std::out_of_range const&)
+      {}
+    })
+
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/set_one.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/set_one.pass.cpp
new file mode 100644
index 0000000000..b619250c02
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/set_one.pass.cpp
@@ -0,0 +1,60 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset& set(size_t pos, bool val = true); // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_set_one() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c = 0; c != cases.size(); ++c) + { + cuda::std::bitset v(cases[c]); + if (v.size() > 0) + { + cuda::std::size_t middle = v.size() / 2; + v.set(middle); + assert(v[middle] == true); + v.set(middle, false); + assert(v[middle] == false); + } + } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_set_one<0>(); + test_set_one<1>(); + test_set_one<31>(); + test_set_one<32>(); + test_set_one<33>(); + test_set_one<63>(); + test_set_one<64>(); + test_set_one<65>(); + + return true; +} + +int main(int, char**) +{ + test(); + test_set_one<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/size.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/size.pass.cpp new file mode 100644 index 0000000000..0c65c13631 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/size.pass.cpp @@ -0,0 +1,46 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// size_t count() const; // constexpr since C++23 + +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_size() +{ + const cuda::std::bitset v; + assert(v.size() == N); +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_size<0>(); + test_size<1>(); + test_size<31>(); + test_size<32>(); + test_size<33>(); + test_size<63>(); + test_size<64>(); + test_size<65>(); + test_size<1000>(); + + return true; +} + +int main(int, char**) +{ + test(); +#if TEST_STD_VER >= 2014 + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/test.out_of_range.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/test.out_of_range.pass.cpp new file mode 100644 index 0000000000..efd0195344 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/test.out_of_range.pass.cpp @@ -0,0 +1,52 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: libcpp-no-exceptions + +// constexpr bool test(size_t pos) const; + +// Make sure we throw cuda::std::out_of_range when calling test() on an OOB index. + +#include +#include + +int main(int, char**) +{ + NV_IF_TARGET( + NV_IS_HOST, + { + cuda::std::bitset<0> v; + try + { + (void) v.test(0); + assert(false); + } + catch (::std::out_of_range const&) + {} + } { + cuda::std::bitset<1> v("0"); + try + { + (void) v.test(2); + assert(false); + } + catch (::std::out_of_range const&) + {} + } { + cuda::std::bitset<10> v("0000000000"); + try + { + (void) v.test(10); + assert(false); + } + catch (::std::out_of_range const&) + {} + }) + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/test.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/test.pass.cpp new file mode 100644 index 0000000000..909e798ea8 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/test.pass.cpp @@ -0,0 +1,60 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bool test(size_t pos) const; // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +_CCCL_NV_DIAG_SUPPRESS(186) + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_test() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c = 0; c != cases.size(); ++c) + { + cuda::std::bitset const v(cases[c]); + if (v.size() > 0) + { + cuda::std::size_t middle = v.size() / 2; + bool b = v.test(middle); + assert(b == v[middle]); + } + } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_test<0>(); + test_test<1>(); + test_test<31>(); + test_test<32>(); + test_test<33>(); + test_test<63>(); + test_test<64>(); + test_test<65>(); + + return true; +} + +int main(int, char**) +{ + test(); + test_test<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_string.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_string.pass.cpp new file mode 100644 index 0000000000..801a51535a --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_string.pass.cpp @@ -0,0 +1,185 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// test: + +// template +// basic_string +// to_string(charT zero = charT('0'), charT one = charT('1')) const; // constexpr since C++23 +// +// template +// basic_string > to_string() const; // constexpr since C++23 +// +// template +// basic_string, allocator > to_string() const; // constexpr since C++23 +// +// basic_string, allocator > to_string() const; // constexpr since C++23 + +#include + +#ifndef __LIBCUDACXX_HAS_STRING + +int main(int, char**) +{ + return 0; +} + +#else + +# include +# include +# include +# include // for cuda::std::allocator +# include +# include + +# include "../bitset_test_cases.h" +# include "test_macros.h" + +template +TEST_CONSTEXPR_CXX14 void +check_equal(cuda::std::basic_string const& s, cuda::std::bitset const& b, CharT zero, CharT one) +{ + assert(s.size() == b.size()); + for (cuda::std::size_t i = 0; i < b.size(); ++i) + { + if (b[i]) + { + assert(s[b.size() - 1 - i] == one); + } + else + { + assert(s[b.size() - 1 - i] == zero); + } + } +} + +template +TEST_CONSTEXPR_CXX14 bool test_to_string() +{ + cuda::std::vector> const cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c = 0; c != cases.size(); ++c) + { + cuda::std::bitset const v = cases[c]; + { + cuda::std::string s = v.template to_string(); + check_equal(s, v, '0', '1'); + } + { + cuda::std::string s = v.to_string(); + check_equal(s, v, '0', '1'); + } + { + cuda::std::string s = v.template to_string('0'); + check_equal(s, v, '0', '1'); + } + { + cuda::std::string s = v.to_string('0'); + check_equal(s, v, '0', '1'); + } + { + cuda::std::string s = v.template to_string('0', '1'); + check_equal(s, v, '0', '1'); + } + { + cuda::std::string s = v.to_string('0', '1'); + check_equal(s, v, '0', '1'); + } + { + cuda::std::string s = v.to_string('x', 'y'); + check_equal(s, v, 'x', 'y'); + } + } + return true; +} + +# ifndef TEST_HAS_NO_WIDE_CHARACTERS +template +TEST_CONSTEXPR_CXX14 bool test_to_string_wchar() +{ + cuda::std::vector> const cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c = 0; c != cases.size(); ++c) + { + cuda::std::bitset const v = cases[c]; + { + cuda::std::wstring s = + v.template to_string, cuda::std::allocator>(); + check_equal(s, v, L'0', L'1'); + } + { + cuda::std::wstring s = v.template to_string>(); + check_equal(s, v, L'0', L'1'); + } + { + cuda::std::wstring s = + v.template to_string, cuda::std::allocator>('0'); + check_equal(s, v, L'0', L'1'); + } + { + cuda::std::wstring s = v.template to_string>('0'); + check_equal(s, v, L'0', L'1'); + } + { + cuda::std::wstring s = + v.template to_string, cuda::std::allocator>('0', '1'); + check_equal(s, v, L'0', L'1'); + } + { + cuda::std::wstring s = v.template to_string>('0', '1'); + check_equal(s, v, L'0', L'1'); + } + } + return true; +} +# endif + +int main(int, char**) +{ + test_to_string<0>(); + test_to_string<1>(); + test_to_string<31>(); + test_to_string<32>(); + test_to_string<33>(); + test_to_string<63>(); + test_to_string<64>(); + test_to_string<65>(); + test_to_string<1000>(); // not in constexpr because of constexpr evaluation step limits +# if TEST_STD_VER >= 2023 + static_assert(test_to_string<0>(), ""); + static_assert(test_to_string<1>(), ""); + static_assert(test_to_string<31>(), ""); + static_assert(test_to_string<32>(), ""); + static_assert(test_to_string<33>(), ""); + static_assert(test_to_string<63>(), ""); 
+ static_assert(test_to_string<64>(), ""); + static_assert(test_to_string<65>(), ""); +# endif + +# ifndef TEST_HAS_NO_WIDE_CHARACTERS + test_to_string_wchar<0>(); + test_to_string_wchar<1>(); + test_to_string_wchar<31>(); + test_to_string_wchar<32>(); + test_to_string_wchar<33>(); + test_to_string_wchar<63>(); + test_to_string_wchar<64>(); + test_to_string_wchar<65>(); + test_to_string_wchar<1000>(); // not in constexpr because of constexpr evaluation step limits +# if TEST_STD_VER >= 2023 + static_assert(test_to_string_wchar<0>(), ""); + static_assert(test_to_string_wchar<1>(), ""); + static_assert(test_to_string_wchar<31>(), ""); + static_assert(test_to_string_wchar<32>(), ""); + static_assert(test_to_string_wchar<33>(), ""); + static_assert(test_to_string_wchar<63>(), ""); +# endif +# endif + return 0; +} + +#endif diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ullong.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ullong.pass.cpp new file mode 100644 index 0000000000..e76cfd4aed --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ullong.pass.cpp @@ -0,0 +1,75 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// unsigned long long to_ullong() const; // constexpr since C++23 + +#include +// #include +#include +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_to_ullong() +{ + const cuda::std::size_t M = sizeof(unsigned long long) * CHAR_BIT < N ? sizeof(unsigned long long) * CHAR_BIT : N; + const bool is_M_zero = cuda::std::integral_constant::value; // avoid compiler warnings + const cuda::std::size_t X = + is_M_zero ? sizeof(unsigned long long) * CHAR_BIT - 1 : sizeof(unsigned long long) * CHAR_BIT - M; + const unsigned long long max = is_M_zero ? 0 : (unsigned long long) (-1) >> X; + unsigned long long tests[] = { + 0, + cuda::std::min(1, max), + cuda::std::min(2, max), + cuda::std::min(3, max), + cuda::std::min(max, max - 3), + cuda::std::min(max, max - 2), + cuda::std::min(max, max - 1), + max}; + for (unsigned long long j : tests) + { + cuda::std::bitset v(j); + assert(j == v.to_ullong()); + } + { // test values bigger than can fit into the bitset + const unsigned long long val = 0x55AAAAFFFFAAAA55ULL; + const bool canFit = N < sizeof(unsigned long long) * CHAR_BIT; + const unsigned long long mask = canFit ? (1ULL << (canFit ? N : 0)) - 1 : (unsigned long long) (-1); // avoid + // compiler + // warnings + cuda::std::bitset v(val); + assert(v.to_ullong() == (val & mask)); // we shouldn't return bit patterns from outside the limits of the bitset. 
+ } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_to_ullong<0>(); + test_to_ullong<1>(); + test_to_ullong<31>(); + test_to_ullong<32>(); + test_to_ullong<33>(); + test_to_ullong<63>(); + test_to_ullong<64>(); + test_to_ullong<65>(); + test_to_ullong<1000>(); + + return true; +} + +int main(int, char**) +{ + test(); +#if TEST_STD_VER >= 2014 && (!defined(_CCCL_CUDACC_BELOW_11_8) || !defined(_CCCL_COMPILER_MSVC)) + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ulong.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ulong.pass.cpp new file mode 100644 index 0000000000..a4400a59fc --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ulong.pass.cpp @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// unsigned long to_ulong() const; // constexpr since C++23 + +#include +// #include +#include +#include +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_to_ulong() +{ + const cuda::std::size_t M = sizeof(unsigned long) * CHAR_BIT < N ? sizeof(unsigned long) * CHAR_BIT : N; + const bool is_M_zero = cuda::std::integral_constant::value; // avoid compiler warnings + const cuda::std::size_t X = is_M_zero ? sizeof(unsigned long) * CHAR_BIT - 1 : sizeof(unsigned long) * CHAR_BIT - M; + const cuda::std::size_t max = is_M_zero ? 0 : cuda::std::size_t(cuda::std::numeric_limits::max()) >> X; + cuda::std::size_t tests[] = { + 0, + cuda::std::min(1, max), + cuda::std::min(2, max), + cuda::std::min(3, max), + cuda::std::min(max, max - 3), + cuda::std::min(max, max - 2), + cuda::std::min(max, max - 1), + max}; + for (cuda::std::size_t j : tests) + { + cuda::std::bitset v(j); + assert(j == v.to_ulong()); + } + + { // test values bigger than can fit into the bitset + const unsigned long val = 0x5AFFFFA5UL; + const bool canFit = N < sizeof(unsigned long) * CHAR_BIT; + const unsigned long mask = canFit ? (1UL << (canFit ? N : 0)) - 1 : (unsigned long) (-1); // avoid compiler warnings + cuda::std::bitset v(val); + assert(v.to_ulong() == (val & mask)); // we shouldn't return bit patterns from outside the limits of the bitset. 
+ } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_to_ulong<0>(); + test_to_ulong<1>(); + test_to_ulong<31>(); + test_to_ulong<32>(); + test_to_ulong<33>(); + test_to_ulong<63>(); + test_to_ulong<64>(); + test_to_ulong<65>(); + test_to_ulong<1000>(); + + return true; +} + +int main(int, char**) +{ + test(); +#if TEST_STD_VER >= 2014 && (!defined(_CCCL_CUDACC_BELOW_11_8) || !defined(_CCCL_COMPILER_MSVC)) + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/op_and.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/op_and.pass.cpp new file mode 100644 index 0000000000..c47ae6aeb9 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/op_and.pass.cpp @@ -0,0 +1,60 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset operator&(const bitset& lhs, const bitset& rhs); // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test_op_and() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c1 = 0; c1 != cases.size(); ++c1) + { + for (cuda::std::size_t c2 = 0; c2 != cases.size(); ++c2) + { + cuda::std::bitset v1(cases[c1]); + cuda::std::bitset v2(cases[c2]); + cuda::std::bitset v3 = v1; + assert((v1 & v2) == (v3 &= v2)); + } + } + + return true; +} + +int main(int, char**) +{ + test_op_and<0>(); + test_op_and<1>(); + test_op_and<31>(); + test_op_and<32>(); + test_op_and<33>(); + test_op_and<63>(); + test_op_and<64>(); + test_op_and<65>(); + test_op_and<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test_op_and<0>(), ""); + static_assert(test_op_and<1>(), ""); + static_assert(test_op_and<31>(), ""); + static_assert(test_op_and<32>(), ""); + static_assert(test_op_and<33>(), ""); + static_assert(test_op_and<63>(), ""); + static_assert(test_op_and<64>(), ""); + static_assert(test_op_and<65>(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/op_not.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/op_not.pass.cpp new file mode 100644 index 0000000000..3b2562c417 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/op_not.pass.cpp @@ -0,0 +1,60 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset operator^(const bitset& lhs, const bitset& rhs); // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test_op_not() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c1 = 0; c1 != cases.size(); ++c1) + { + for (cuda::std::size_t c2 = 0; c2 != cases.size(); ++c2) + { + cuda::std::bitset v1(cases[c1]); + cuda::std::bitset v2(cases[c2]); + cuda::std::bitset v3 = v1; + assert((v1 ^ v2) == (v3 ^= v2)); + } + } + + return true; +} + +int main(int, char**) +{ + test_op_not<0>(); + test_op_not<1>(); + test_op_not<31>(); + test_op_not<32>(); + test_op_not<33>(); + test_op_not<63>(); + test_op_not<64>(); + test_op_not<65>(); + test_op_not<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test_op_not<0>(), ""); + static_assert(test_op_not<1>(), ""); + static_assert(test_op_not<31>(), ""); + static_assert(test_op_not<32>(), ""); + static_assert(test_op_not<33>(), ""); + static_assert(test_op_not<63>(), ""); + static_assert(test_op_not<64>(), ""); + static_assert(test_op_not<65>(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/op_or.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/op_or.pass.cpp new file mode 100644 index 0000000000..5eb50ae733 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/op_or.pass.cpp @@ -0,0 +1,60 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset operator|(const bitset& lhs, const bitset& rhs); // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test_op_or() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c1 = 0; c1 != cases.size(); ++c1) + { + for (cuda::std::size_t c2 = 0; c2 != cases.size(); ++c2) + { + cuda::std::bitset v1(cases[c1]); + cuda::std::bitset v2(cases[c2]); + cuda::std::bitset v3 = v1; + assert((v1 | v2) == (v3 |= v2)); + } + } + + return true; +} + +int main(int, char**) +{ + test_op_or<0>(); + test_op_or<1>(); + test_op_or<31>(); + test_op_or<32>(); + test_op_or<33>(); + test_op_or<63>(); + test_op_or<64>(); + test_op_or<65>(); + test_op_or<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test_op_or<0>(), ""); + static_assert(test_op_or<1>(), ""); + static_assert(test_op_or<31>(), ""); + static_assert(test_op_or<32>(), ""); + static_assert(test_op_or<33>(), ""); + static_assert(test_op_or<63>(), ""); + static_assert(test_op_or<64>(), ""); + static_assert(test_op_or<65>(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/stream_in.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/stream_in.pass.cpp new file mode 100644 index 0000000000..45ffa4bb59 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/stream_in.pass.cpp @@ -0,0 +1,100 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-localization + +// test: + +// template +// basic_istream& +// operator>>(basic_istream& is, bitset& x); + +#include + +#ifndef _LIBCUDACXX_HAS_SSTREAM +int main(int, char**) +{ + return 0; +} +#else + +# include +# include +# include + +# include "test_macros.h" + +int main(int, char**) +{ + { + cuda::std::istringstream in("01011010"); + cuda::std::bitset<8> b; + in >> b; + assert(b.to_ulong() == 0x5A); + } + { + // Make sure that input-streaming an empty bitset does not cause the + // failbit to be set (LWG 3199). 
+ cuda::std::istringstream in("01011010"); + cuda::std::bitset<0> b; + in >> b; + assert(b.to_string() == ""); + assert(!in.bad()); + assert(!in.fail()); + assert(!in.eof()); + assert(in.good()); + } +# ifndef TEST_HAS_NO_EXCEPTIONS + { + cuda::std::stringbuf sb; + cuda::std::istream is(&sb); + is.exceptions(cuda::std::ios::failbit); + + bool threw = false; + try + { + cuda::std::bitset<8> b; + is >> b; + } + catch (cuda::std::ios::failure const&) + { + threw = true; + } + + assert(!is.bad()); + assert(is.fail()); + assert(is.eof()); + assert(threw); + } + { + cuda::std::stringbuf sb; + cuda::std::istream is(&sb); + is.exceptions(cuda::std::ios::eofbit); + + bool threw = false; + try + { + cuda::std::bitset<8> b; + is >> b; + } + catch (cuda::std::ios::failure const&) + { + threw = true; + } + + assert(!is.bad()); + assert(is.fail()); + assert(is.eof()); + assert(threw); + } +# endif // TEST_HAS_NO_EXCEPTIONS + + return 0; +} + +#endif diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/stream_out.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/stream_out.pass.cpp new file mode 100644 index 0000000000..10c7392b95 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/stream_out.pass.cpp @@ -0,0 +1,42 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-localization + +// test: + +// template +// basic_ostream& +// operator<<(basic_ostream& os, const bitset& x); + +#include + +#ifndef _LIBCUDACXX_HAS_SSTREAM +int main(int, char**) +{ + return 0; +} +#else + +# include +# include +# include + +# include "test_macros.h" + +int main(int, char**) +{ + cuda::std::ostringstream os; + cuda::std::bitset<8> b(0x5A); + os << b; + assert(os.str() == "01011010"); + + return 0; +} + +#endif diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset_test_cases.h b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset_test_cases.h new file mode 100644 index 0000000000..8351e87795 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset_test_cases.h @@ -0,0 +1,163 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef LIBCUDACXX_TEST_BITSET_TEST_CASES_H +#define LIBCUDACXX_TEST_BITSET_TEST_CASES_H + +#include +#include +#include + +#include "template_cost_testing.h" // for base cases of REPEAT_* +#include "test_macros.h" + +#if TEST_STD_VER == 2011 +# define BITSET_TEST_CONSTEXPR const +#else +# define BITSET_TEST_CONSTEXPR TEST_CONSTEXPR_GLOBAL +#endif + +#define NUMARGS(...) (::cuda::std::tuple_size::value) +#define DEFINE_CASES(N, ...) 
\ + __host__ __device__ BITSET_TEST_CONSTEXPR cuda::std::array get_test_cases( \ + cuda::std::integral_constant) \ + { \ + return {{__VA_ARGS__}}; \ + } + +DEFINE_CASES(0, "") + +DEFINE_CASES(1, "0", "1") + +DEFINE_CASES(2, "00", "01", "10", "11") + +DEFINE_CASES( + 31, + "0000000000000000000000000000000", + "0000000000000000000000000000001", + "1000000000000000000000000000000", + "1000000000000000000000000000001", + "1000000000000000000001000000001", + "0000000000000000111111111111111", + "1000000000000000111111111111111", + "1111111111111111000000000000000", + "1111111111111111000000000000001", + "1010101010101010101010101010101", + "0101010101010101010101010101010", + "1111111111111111111111111111111") + +DEFINE_CASES( + 32, + "00000000000000000000000000000000", + "00000000000000000000000000000001", + "10000000000000000000000000000000", + "10000000000000000000000000000001", + "10000000000000000000111000000001", + "00000000000000001111111111111111", + "10000000000000001111111111111111", + "11111111111111110000000000000000", + "11111111111111110000000000000001", + "10101010101010101010101010101010", + "01010101010101010101010101010101", + "11111111111111111111111111111111") + +DEFINE_CASES( + 33, + "000000000000000000000000000000000", + "000000000000000000000000000000001", + "100000000000000000000000000000000", + "100000000000000000000000000000001", + "100000000000000000001110000000001", + "000000000000000011111111111111111", + "100000000000000011111111111111111", + "111111111111111100000000000000000", + "111111111111111100000000000000001", + "101010101010101010101010101010101", + "010101010101010101010101010101010", + "111111111111111111111111111111111") + +DEFINE_CASES( + 63, + "000000000000000000000000000000000000000000000000000000000000000", + "000000000000000000000000000000000000000000000000000000000000001", + "100000000000000000000000000000000000000000000000000000000000000", + "100000000000000000000000000000000000000000000000000000000000001", + "100000000000000000000000001111100000000000000000000000000000001", + "000000000000000000000000000000001111111111111111111111111111111", + "100000000000000000000000000000001111111111111111111111111111111", + "111111111111111111111111111111110000000000000000000000000000000", + "111111111111111111111111111111110000000000000000000000000000001", + "101010101010101010101010101010101010101010101010101010101010101", + "010101010101010101010101010101010101010101010101010101010101010", + "111111111111111111111111111111111111111111111111111111111111111") + +DEFINE_CASES( + 64, + "0000000000000000000000000000000000000000000000000000000000000000", + "0000000000000000000000000000000000000000000000000000000000000001", + "1000000000000000000000000000000000000000000000000000000000000000", + "1000000000000000000000000000000000000000000000000000000000000001", + "1000000000000000000000000011111000000000000000000000000000000001", + "0000000000000000000000000000000011111111111111111111111111111111", + "1000000000000000000000000000000011111111111111111111111111111111", + "1111111111111111111111111111111100000000000000000000000000000000", + "1111111111111111111111111111111100000000000000000000000000000001", + "1010101010101010101010101010101010101010101010101010101010101010", + "0101010101010101010101010101010101010101010101010101010101010101", + "1111111111111111111111111111111111111111111111111111111111111111") + +DEFINE_CASES( + 65, + "00000000000000000000000000000000000000000000000000000000000000000", + 
"00000000000000000000000000000000000000000000000000000000000000001", + "10000000000000000000000000000000000000000000000000000000000000000", + "10000000000000000000000000000000000000000000000000000000000000001", + "10000000000000000000000000011111000000000000000000000000000000001", + "00000000000000000000000000000000011111111111111111111111111111111", + "10000000000000000000000000000000011111111111111111111111111111111", + "11111111111111111111111111111111000000000000000000000000000000000", + "11111111111111111111111111111111000000000000000000000000000000001", + "10101010101010101010101010101010101010101010101010101010101010101", + "01010101010101010101010101010101010101010101010101010101010101010", + "11111111111111111111111111111111111111111111111111111111111111111") + +#define BITSET_ZERO() "0" +#define BITSET_ONE() "1" +#define BITSET_ONEZERO() "10" +#define BITSET_ZEROONE() "10" + +#define REPEAT_8(DO_IT) DO_IT() DO_IT() DO_IT() DO_IT() DO_IT() DO_IT() DO_IT() DO_IT() +#define REPEAT_9(DO_IT) REPEAT_8(DO_IT) DO_IT() +#define REPEAT_90(DO_IT) \ + REPEAT_10(DO_IT) \ + REPEAT_10(DO_IT) \ + REPEAT_10(DO_IT) REPEAT_10(DO_IT) REPEAT_10(DO_IT) REPEAT_10(DO_IT) REPEAT_10(DO_IT) REPEAT_10(DO_IT) REPEAT_10(DO_IT) +#define REPEAT_99(DO_IT) REPEAT_90(DO_IT) REPEAT_9(DO_IT) +#define REPEAT_400(DO_IT) REPEAT_100(DO_IT) REPEAT_100(DO_IT) REPEAT_100(DO_IT) REPEAT_100(DO_IT) +#define REPEAT_499(DO_IT) REPEAT_400(DO_IT) REPEAT_99(DO_IT) +#define REPEAT_900(DO_IT) REPEAT_500(DO_IT) REPEAT_400(DO_IT) +#define REPEAT_998(DO_IT) REPEAT_900(DO_IT) REPEAT_90(DO_IT) REPEAT_8(DO_IT) +#define REPEAT_999(DO_IT) REPEAT_900(DO_IT) REPEAT_99(DO_IT) + +DEFINE_CASES( + 1000, + REPEAT_1000(BITSET_ZERO), + REPEAT_999(BITSET_ZERO) BITSET_ONE(), + BITSET_ONE() REPEAT_999(BITSET_ZERO), + BITSET_ONE() REPEAT_998(BITSET_ZERO) BITSET_ONE(), + BITSET_ONE() REPEAT_400(BITSET_ZERO) REPEAT_99(BITSET_ONE) REPEAT_499(BITSET_ZERO) BITSET_ONE(), + REPEAT_500(BITSET_ZERO) REPEAT_500(BITSET_ONE), + BITSET_ONE() REPEAT_499(BITSET_ZERO) REPEAT_500(BITSET_ONE), + REPEAT_500(BITSET_ONE) REPEAT_500(BITSET_ZERO), + REPEAT_500(BITSET_ONE) REPEAT_499(BITSET_ZERO) BITSET_ONE(), + REPEAT_500(BITSET_ONEZERO), + REPEAT_500(BITSET_ZEROONE), + REPEAT_1000(BITSET_ONE)) + +#endif // !LIBCUDACXX_TEST_BITSET_TEST_CASES_H diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/includes.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/includes.pass.cpp new file mode 100644 index 0000000000..6841824725 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/includes.pass.cpp @@ -0,0 +1,35 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// test that includes and + +#include + +#include "test_macros.h" + +template +__host__ __device__ void test_typedef() +{} + +int main(int, char**) +{ +#ifdef _LIBCUDACXX_HAS_STRING + { // test for + cuda::std::string s; + ((void) s); + } +#endif + { // test for + test_typedef(); + test_typedef(); + test_typedef(); + test_typedef(); + } + + return 0; +} diff --git a/libcudacxx/test/support/test_macros.h b/libcudacxx/test/support/test_macros.h index a066348d05..449f5fbbc2 100644 --- a/libcudacxx/test/support/test_macros.h +++ b/libcudacxx/test/support/test_macros.h @@ -4,6 +4,7 @@ // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// From 2e44b2c394c55740086132d83d7b31f92e62dd95 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 14 Aug 2024 10:26:01 +0200 Subject: [PATCH 22/33] Refactor placeholder operators (#2233) --- .../functional_placeholders_miscellaneous.cu | 15 + thrust/thrust/detail/functional/actor.h | 198 +++++++--- thrust/thrust/detail/functional/actor.inl | 94 ----- thrust/thrust/detail/functional/argument.h | 76 ---- thrust/thrust/detail/functional/composite.h | 123 ------ thrust/thrust/detail/functional/operators.h | 367 +++++++++++++++++- .../operators/arithmetic_operators.h | 263 ------------- .../operators/assignment_operator.h | 76 ---- .../functional/operators/bitwise_operators.h | 195 ---------- .../operators/compound_assignment_operators.h | 319 --------------- .../functional/operators/logical_operators.h | 94 ----- .../functional/operators/operator_adaptors.h | 126 ------ .../operators/relational_operators.h | 177 --------- thrust/thrust/detail/functional/placeholder.h | 45 --- thrust/thrust/detail/functional/value.h | 80 ---- thrust/thrust/functional.h | 2 +- 16 files changed, 516 insertions(+), 1734 deletions(-) delete mode 100644 thrust/thrust/detail/functional/actor.inl delete mode 100644 thrust/thrust/detail/functional/argument.h delete mode 100644 thrust/thrust/detail/functional/composite.h delete mode 100644 thrust/thrust/detail/functional/operators/arithmetic_operators.h delete mode 100644 thrust/thrust/detail/functional/operators/assignment_operator.h delete mode 100644 thrust/thrust/detail/functional/operators/bitwise_operators.h delete mode 100644 thrust/thrust/detail/functional/operators/compound_assignment_operators.h delete mode 100644 thrust/thrust/detail/functional/operators/logical_operators.h delete mode 100644 thrust/thrust/detail/functional/operators/operator_adaptors.h delete mode 100644 thrust/thrust/detail/functional/operators/relational_operators.h delete mode 100644 thrust/thrust/detail/functional/placeholder.h delete mode 100644 thrust/thrust/detail/functional/value.h diff --git a/thrust/testing/functional_placeholders_miscellaneous.cu b/thrust/testing/functional_placeholders_miscellaneous.cu index fffc4f7e94..9362e81d72 100644 --- a/thrust/testing/functional_placeholders_miscellaneous.cu +++ b/thrust/testing/functional_placeholders_miscellaneous.cu @@ -79,3 +79,18 @@ VectorUnitTest TestFunctionalPlaceholdersTransformIteratorInstanceHost; + +template +struct 
TestFunctionalPlaceholdersArgumentValueCategories +{ + void operator()() const + { + using namespace thrust::placeholders; + auto expr = _1 * _1 + _2 * _2; + T a = 2; + T b = 3; + ASSERT_ALMOST_EQUAL(expr(2, 3), 13); // pass pr-value + ASSERT_ALMOST_EQUAL(expr(a, b), 13); // pass l-value + ASSERT_ALMOST_EQUAL(expr(::cuda::std::move(a), ::cuda::std::move(b)), 13); // pass x-value + } +}; diff --git a/thrust/thrust/detail/functional/actor.h b/thrust/thrust/detail/functional/actor.h index e76d67153a..79484aabbe 100644 --- a/thrust/thrust/detail/functional/actor.h +++ b/thrust/thrust/detail/functional/actor.h @@ -1,5 +1,5 @@ /* - * Copyright 2008-2013 NVIDIA Corporation + * Copyright 2024 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,103 +34,183 @@ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header -#include -#include -#include -#include +#include #include +#include #include +#include +#include + THRUST_NAMESPACE_BEGIN namespace detail { namespace functional { +// An actor is a node in an expression template +template +struct actor : Eval +{ + constexpr actor() = default; + + _CCCL_HOST_DEVICE actor(const Eval& base) + : Eval(base) + {} + + template + _CCCL_HOST_DEVICE auto operator()(Ts&&... ts) const -> decltype(Eval::eval(THRUST_FWD(ts)...)) + { + return Eval::eval(THRUST_FWD(ts)...); + } + + template + _CCCL_HOST_DEVICE auto operator=(const T& _1) const -> decltype(do_assign(*this, _1)) + { + return do_assign(*this, _1); + } +}; + +template +struct is_actor : ::cuda::std::false_type +{}; -// eval_ref is -// - T when T is a subclass of thrust::reference -// - T& otherwise -// This is used to let thrust::references pass through actor evaluations. template -using eval_ref = typename std::conditional::value, T, T&>::type; +struct is_actor> : ::cuda::std::true_type +{}; -template -struct apply_actor +// a node selecting and returning one of the arguments to the entire expression template +template +struct argument { - using type = typename Action::template result::type; + template + _CCCL_HOST_DEVICE auto + eval(Ts&&... args) const -> decltype(thrust::get(thrust::tuple{THRUST_FWD(args)...})) + { + return thrust::get(thrust::tuple{THRUST_FWD(args)...}); + } }; -template -struct actor : Eval +template +struct placeholder { - using eval_type = Eval; + using type = actor>; +}; - constexpr actor() = default; +// composition of actors/nodes +template +struct composite; - _CCCL_HOST_DEVICE actor(const Eval& base); +template +struct composite +{ + // TODO(bgruber): drop ctor and use aggregate initialization in C++17 + _CCCL_HOST_DEVICE composite(const Eval& eval, const SubExpr& subexpr) + : m_eval(eval) + , m_subexpr(subexpr) + {} template - _CCCL_HOST_DEVICE typename apply_actor...>>::type operator()(Ts&&... ts) const; + _CCCL_HOST_DEVICE auto eval(Ts&&... 
args) const + -> decltype(::cuda::std::declval().eval(::cuda::std::declval().eval(THRUST_FWD(args)...))) + { + return m_eval.eval(m_subexpr.eval(THRUST_FWD(args)...)); + } - template - _CCCL_HOST_DEVICE typename assign_result::type operator=(const T& _1) const; -}; // end actor +private: + Eval m_eval; + SubExpr m_subexpr; +}; -// in general, as_actor should turn things into values -template -struct as_actor +template +struct composite { - using type = value; + // TODO(bgruber): drop ctor and use aggregate initialization in C++17 + _CCCL_HOST_DEVICE composite(const Eval& eval, const SubExpr1& subexpr1, const SubExpr2& subexpr2) + : m_eval(eval) + , m_subexpr1(subexpr1) + , m_subexpr2(subexpr2) + {} - static inline _CCCL_HOST_DEVICE type convert(const T& x) + template + _CCCL_HOST_DEVICE auto eval(Ts&&... args) const + -> decltype(::cuda::std::declval().eval(::cuda::std::declval().eval(THRUST_FWD(args)...), + ::cuda::std::declval().eval(THRUST_FWD(args)...))) { - return val(x); - } // end convert() -}; // end as_actor + return m_eval.eval(m_subexpr1.eval(THRUST_FWD(args)...), m_subexpr2.eval(THRUST_FWD(args)...)); + } + +private: + Eval m_eval; + SubExpr1 m_subexpr1; + SubExpr2 m_subexpr2; +}; -// specialization for things which are already actors template -struct as_actor> +struct actor; + +// Adapts a transparent unary functor from functional.h (e.g. thrust::negate<>) into the Eval interface. +template +struct operator_adaptor : F { - using type = actor; + _CCCL_HOST_DEVICE operator_adaptor(F f) + : F(::cuda::std::move(f)) + {} - static inline _CCCL_HOST_DEVICE const type& convert(const actor& x) + template + _CCCL_HOST_DEVICE auto eval(Ts&&... args) const -> decltype(F{}(THRUST_FWD(args)...)) { - return x; - } // end convert() -}; // end as_actor + return static_cast(*this)(THRUST_FWD(args)...); + } +}; +// a node returning a fixed value template -typename as_actor::type _CCCL_HOST_DEVICE make_actor(const T& x) +struct value { - return as_actor::convert(x); -} // end make_actor() + T m_val; -} // namespace functional + template + _CCCL_HOST_DEVICE T eval(Ts&&...) 
const + { + return m_val; + } +}; -// provide specializations for result_of for nullary, unary, and binary invocations of actor -template -struct result_of_adaptable_function()> +template +_CCCL_HOST_DEVICE auto make_actor(T&& x) -> actor>> { - using type = - typename thrust::detail::functional::apply_actor, thrust::tuple<>>::type; -}; // end result_of + return {{THRUST_FWD(x)}}; +} -template -struct result_of_adaptable_function(Arg1)> +template +_CCCL_HOST_DEVICE auto make_actor(actor x) -> actor { - using type = - typename thrust::detail::functional::apply_actor, thrust::tuple>::type; -}; // end result_of + return x; +} -template -struct result_of_adaptable_function(Arg1, Arg2)> +template +_CCCL_HOST_DEVICE auto compose(Eval e, const SubExpr& subexpr) + -> decltype(actor, decltype(make_actor(subexpr))>>{ + {{::cuda::std::move(e)}, make_actor(subexpr)}}) { - using type = typename thrust::detail::functional::apply_actor, - thrust::tuple>::type; -}; // end result_of + return actor, decltype(make_actor(subexpr))>>{ + {{::cuda::std::move(e)}, make_actor(subexpr)}}; +} + +template +_CCCL_HOST_DEVICE auto compose(Eval e, const SubExpr1& subexpr1, const SubExpr2& subexpr2) + -> decltype(actor, decltype(make_actor(subexpr1)), decltype(make_actor(subexpr2))>>{ + {{::cuda::std::move(e)}, make_actor(subexpr1), make_actor(subexpr2)}}) +{ + return actor, decltype(make_actor(subexpr1)), decltype(make_actor(subexpr2))>>{ + {{::cuda::std::move(e)}, make_actor(subexpr1), make_actor(subexpr2)}}; +} +} // namespace functional +template +struct result_of_adaptable_function(Args...)> +{ + using type = decltype(::cuda::std::declval>()(::cuda::std::declval()...)); +}; } // namespace detail THRUST_NAMESPACE_END - -#include diff --git a/thrust/thrust/detail/functional/actor.inl b/thrust/thrust/detail/functional/actor.inl deleted file mode 100644 index 64d367ed15..0000000000 --- a/thrust/thrust/detail/functional/actor.inl +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright 2008-2013 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Portions of this code are derived from -// -// Manjunath Kudlur's Carbon library -// -// and -// -// Based on Boost.Phoenix v1.2 -// Copyright (c) 2001-2002 Joel de Guzman - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header -#include -#include -#include -#include - -#include - -THRUST_NAMESPACE_BEGIN - -namespace detail -{ -namespace functional -{ - -template -_CCCL_HOST_DEVICE actor::actor(const Eval& base) - : eval_type(base) -{} - -// actor::operator() needs to construct a tuple of references to its -// arguments. To make this work with thrust::reference, we need to -// detect thrust proxy references and store them as T rather than T&. 
-// This check ensures that the forwarding references passed into -// actor::operator() are either: -// - T&& if and only if T is a thrust::reference, or -// - T& for any other types. -// This struct provides a nicer diagnostic for when these conditions aren't -// met. -template -using actor_check_ref_type = - ::cuda::std::integral_constant::value || thrust::detail::is_wrapped_reference::value)>; - -template -using actor_check_ref_types = thrust::conjunction...>; - -template -template -_CCCL_HOST_DEVICE typename apply_actor::eval_type, thrust::tuple...>>::type -actor::operator()(Ts&&... ts) const -{ - static_assert(actor_check_ref_types::value, - "Actor evaluations only support rvalue references to " - "thrust::reference subclasses."); - using tuple_type = thrust::tuple...>; - return eval_type::eval(tuple_type(THRUST_FWD(ts)...)); -} // end actor::operator() - -template -template -_CCCL_HOST_DEVICE typename assign_result::type actor::operator=(const T& _1) const -{ - return do_assign(*this, _1); -} // end actor::operator=() - -} // namespace functional -} // namespace detail -THRUST_NAMESPACE_END diff --git a/thrust/thrust/detail/functional/argument.h b/thrust/thrust/detail/functional/argument.h deleted file mode 100644 index b4fb100e80..0000000000 --- a/thrust/thrust/detail/functional/argument.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright 2008-2013 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Portions of this code are derived from -// -// Manjunath Kudlur's Carbon library -// -// and -// -// Based on Boost.Phoenix v1.2 -// Copyright (c) 2001-2002 Joel de Guzman - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header -#include - -THRUST_NAMESPACE_BEGIN -namespace detail -{ -namespace functional -{ - -template -struct argument_helper -{ - using type = typename thrust::tuple_element::type; -}; - -template -struct argument_helper> -{ - using type = thrust::tuple<>; -}; - -template -class argument -{ -public: - template - struct result : argument_helper - {}; - - _CCCL_HOST_DEVICE constexpr argument() {} - - template - _CCCL_HOST_DEVICE typename result::type eval(const Env& e) const - { - return thrust::get(e); - } // end eval() -}; // end argument - -} // namespace functional -} // namespace detail -THRUST_NAMESPACE_END diff --git a/thrust/thrust/detail/functional/composite.h b/thrust/thrust/detail/functional/composite.h deleted file mode 100644 index ad4c1c67af..0000000000 --- a/thrust/thrust/detail/functional/composite.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright 2008-2013 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Portions of this code are derived from -// -// Manjunath Kudlur's Carbon library -// -// and -// -// Based on Boost.Phoenix v1.2 -// Copyright (c) 2001-2002 Joel de Guzman - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include -#include -#include - -THRUST_NAMESPACE_BEGIN -namespace detail -{ -namespace functional -{ - -template -class composite; - -template -class composite -{ -public: - template - struct result - { - using type = typename Eval0::template result::type>>::type; - }; - - _CCCL_HOST_DEVICE composite(const Eval0& e0, const Eval1& e1) - : m_eval0(e0) - , m_eval1(e1) - {} - - template - _CCCL_HOST_DEVICE typename result::type eval(const Env& x) const - { - typename Eval1::template result::type result1 = m_eval1.eval(x); - return m_eval0.eval(thrust::tie(result1)); - } - -private: - Eval0 m_eval0; - Eval1 m_eval1; -}; // end composite - -template -class composite -{ -public: - template - struct result - { - using type = typename Eval0::template result< - thrust::tuple::type, typename Eval2::template result::type>>::type; - }; - - _CCCL_HOST_DEVICE composite(const Eval0& e0, const Eval1& e1, const Eval2& e2) - : m_eval0(e0) - , m_eval1(e1) - , m_eval2(e2) - {} - - template - _CCCL_HOST_DEVICE typename result::type eval(const Env& x) const - { - typename Eval1::template result::type result1 = m_eval1.eval(x); - typename Eval2::template result::type result2 = m_eval2.eval(x); - return m_eval0.eval(thrust::tie(result1, result2)); - } - -private: - Eval0 m_eval0; - Eval1 m_eval1; - Eval2 m_eval2; -}; // end composite - -template -_CCCL_HOST_DEVICE actor> compose(const Eval0& e0, const Eval1& e1) -{ - return actor>(composite(e0, e1)); -} - -template -_CCCL_HOST_DEVICE actor> compose(const Eval0& e0, const Eval1& e1, const Eval2& e2) -{ - return actor>(composite(e0, e1, e2)); -} - -} // namespace functional -} // namespace detail -THRUST_NAMESPACE_END diff --git a/thrust/thrust/detail/functional/operators.h b/thrust/thrust/detail/functional/operators.h index fe67ab7dd3..94347a82bc 100644 --- a/thrust/thrust/detail/functional/operators.h +++ b/thrust/thrust/detail/functional/operators.h @@ -1,5 +1,5 @@ /* - * Copyright 2008-2013 NVIDIA Corporation + * Copyright 2024 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,15 @@ * limitations under the License. 
*/ +// Portions of this code are derived from +// +// Manjunath Kudlur's Carbon library +// +// and +// +// Based on Boost.Phoenix v1.2 +// Copyright (c) 2001-2002 Joel de Guzman + #pragma once #include @@ -25,8 +34,354 @@ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header -#include -#include -#include -#include -#include +#include +#include +#include + +#include + +THRUST_NAMESPACE_BEGIN +namespace detail +{ +namespace functional +{ +// there's no standard plus_equal functional, so roll an ad hoc one here +struct plus_equal +{ + using is_transparent = void; + + _CCCL_EXEC_CHECK_DISABLE + template + _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const + noexcept(noexcept(THRUST_FWD(t1) += THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) += THRUST_FWD(t2)) + { + return THRUST_FWD(t1) += THRUST_FWD(t2); + } +}; + +// there's no standard minus_equal functional, so roll an ad hoc one here +struct minus_equal +{ + using is_transparent = void; + + _CCCL_EXEC_CHECK_DISABLE + template + _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const + noexcept(noexcept(THRUST_FWD(t1) -= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) -= THRUST_FWD(t2)) + { + return THRUST_FWD(t1) -= THRUST_FWD(t2); + } +}; + +// there's no standard multiplies_equal functional, so roll an ad hoc one here +struct multiplies_equal +{ + using is_transparent = void; + + _CCCL_EXEC_CHECK_DISABLE + template + _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const + noexcept(noexcept(THRUST_FWD(t1) *= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) *= THRUST_FWD(t2)) + { + return THRUST_FWD(t1) *= THRUST_FWD(t2); + } +}; + +// there's no standard divides_equal functional, so roll an ad hoc one here +struct divides_equal +{ + using is_transparent = void; + + _CCCL_EXEC_CHECK_DISABLE + template + _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const + noexcept(noexcept(THRUST_FWD(t1) /= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) /= THRUST_FWD(t2)) + { + return THRUST_FWD(t1) /= THRUST_FWD(t2); + } +}; + +// there's no standard modulus_equal functional, so roll an ad hoc one here +struct modulus_equal +{ + using is_transparent = void; + + _CCCL_EXEC_CHECK_DISABLE + template + _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const + noexcept(noexcept(THRUST_FWD(t1) %= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) %= THRUST_FWD(t2)) + { + return THRUST_FWD(t1) %= THRUST_FWD(t2); + } +}; + +// there's no standard bit_and_equal functional, so roll an ad hoc one here +struct bit_and_equal +{ + using is_transparent = void; + + _CCCL_EXEC_CHECK_DISABLE + template + _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const + noexcept(noexcept(THRUST_FWD(t1) &= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) &= THRUST_FWD(t2)) + { + return THRUST_FWD(t1) &= THRUST_FWD(t2); + } +}; + +// there's no standard bit_or_equal functional, so roll an ad hoc one here +struct bit_or_equal +{ + using is_transparent = void; + + _CCCL_EXEC_CHECK_DISABLE + template + _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const + noexcept(noexcept(THRUST_FWD(t1) |= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) |= THRUST_FWD(t2)) + { + return THRUST_FWD(t1) |= THRUST_FWD(t2); + } +}; + +// there's no standard bit_xor_equal functional, so roll an ad hoc one here +struct bit_xor_equal +{ + using is_transparent = void; + + _CCCL_EXEC_CHECK_DISABLE + template + _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const + 
noexcept(noexcept(THRUST_FWD(t1) ^= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) ^= THRUST_FWD(t2))
+  {
+    return THRUST_FWD(t1) ^= THRUST_FWD(t2);
+  }
+};
+
+// there's no standard bit_lshift_equal functional, so roll an ad hoc one here
+struct bit_lshift_equal
+{
+  using is_transparent = void;
+
+  _CCCL_EXEC_CHECK_DISABLE
+  template <typename T1, typename T2>
+  _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const
+    noexcept(noexcept(THRUST_FWD(t1) <<= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) <<= THRUST_FWD(t2))
+  {
+    return THRUST_FWD(t1) <<= THRUST_FWD(t2);
+  }
+};
+
+// there's no standard bit_rshift_equal functional, so roll an ad hoc one here
+struct bit_rshift_equal
+{
+  using is_transparent = void;
+
+  _CCCL_EXEC_CHECK_DISABLE
+  template <typename T1, typename T2>
+  _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const
+    noexcept(noexcept(THRUST_FWD(t1) >>= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) >>= THRUST_FWD(t2))
+  {
+    return THRUST_FWD(t1) >>= THRUST_FWD(t2);
+  }
+};
+
+// there's no standard bit_lshift functional, so roll an ad hoc one here
+struct bit_lshift
+{
+  using is_transparent = void;
+
+  _CCCL_EXEC_CHECK_DISABLE
+  template <typename T1, typename T2>
+  _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const
+    noexcept(noexcept(THRUST_FWD(t1) << THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) << THRUST_FWD(t2))
+  {
+    return THRUST_FWD(t1) << THRUST_FWD(t2);
+  }
+};
+
+// there's no standard bit_rshift functional, so roll an ad hoc one here
+struct bit_rshift
+{
+  using is_transparent = void;
+
+  _CCCL_EXEC_CHECK_DISABLE
+  template <typename T1, typename T2>
+  _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const
+    noexcept(noexcept(THRUST_FWD(t1) >> THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) >> THRUST_FWD(t2))
+  {
+    return THRUST_FWD(t1) >> THRUST_FWD(t2);
+  }
+};
+
+#define MAKE_BINARY_COMPOSITE(op, functor)                                                                         \
+  template <typename A, typename B, ::cuda::std::enable_if_t<is_actor<A>::value || is_actor<B>::value, int> = 0>  \
+  _CCCL_HOST_DEVICE auto operator op(const A& a, const B& b)->decltype(compose(functor{}, a, b))                  \
+  {                                                                                                                \
+    return compose(functor{}, a, b);                                                                               \
+  }
+
+MAKE_BINARY_COMPOSITE(==, thrust::equal_to<>)
+MAKE_BINARY_COMPOSITE(!=, thrust::not_equal_to<>)
+MAKE_BINARY_COMPOSITE(<, thrust::less<>)
+MAKE_BINARY_COMPOSITE(<=, thrust::less_equal<>)
+MAKE_BINARY_COMPOSITE(>, thrust::greater<>)
+MAKE_BINARY_COMPOSITE(>=, thrust::greater_equal<>)
+
+MAKE_BINARY_COMPOSITE(+, thrust::plus<>)
+MAKE_BINARY_COMPOSITE(-, thrust::minus<>)
+MAKE_BINARY_COMPOSITE(*, thrust::multiplies<>)
+MAKE_BINARY_COMPOSITE(/, thrust::divides<>)
+MAKE_BINARY_COMPOSITE(%, thrust::modulus<>)
+
+MAKE_BINARY_COMPOSITE(+=, plus_equal)
+MAKE_BINARY_COMPOSITE(-=, minus_equal)
+MAKE_BINARY_COMPOSITE(*=, multiplies_equal)
+MAKE_BINARY_COMPOSITE(/=, divides_equal)
+MAKE_BINARY_COMPOSITE(%=, modulus_equal)
+
+MAKE_BINARY_COMPOSITE(&&, thrust::logical_and<>)
+MAKE_BINARY_COMPOSITE(||, thrust::logical_or<>)
+
+MAKE_BINARY_COMPOSITE(&, thrust::bit_and<>)
+MAKE_BINARY_COMPOSITE(|, thrust::bit_or<>)
+MAKE_BINARY_COMPOSITE(^, thrust::bit_xor<>)
+MAKE_BINARY_COMPOSITE(<<, bit_lshift)
+MAKE_BINARY_COMPOSITE(>>, bit_rshift)
+
+MAKE_BINARY_COMPOSITE(&=, bit_and_equal)
+MAKE_BINARY_COMPOSITE(|=, bit_or_equal)
+MAKE_BINARY_COMPOSITE(^=, bit_xor_equal)
+MAKE_BINARY_COMPOSITE(<<=, bit_lshift_equal)
+MAKE_BINARY_COMPOSITE(>>=, bit_rshift_equal)
+
+#undef MAKE_BINARY_COMPOSITE
+
+// there's no standard unary_plus functional, so roll an ad hoc one here
+struct unary_plus
+{
+  using is_transparent = void;
+
+  _CCCL_EXEC_CHECK_DISABLE
+  template <typename T1>
+  _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1) const
+    noexcept(noexcept(+THRUST_FWD(t1))) -> decltype(+THRUST_FWD(t1))
+  {
+    return +THRUST_FWD(t1);
+  }
+};
+
+// there's no standard prefix_increment functional, so roll an ad hoc one here
+struct prefix_increment
+{
+  using is_transparent = void;
+
+  _CCCL_EXEC_CHECK_DISABLE
+  template <typename T1>
+  _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1) const
+    noexcept(noexcept(++THRUST_FWD(t1))) -> decltype(++THRUST_FWD(t1))
+  {
+    return ++THRUST_FWD(t1);
+  }
+}; // end prefix_increment
+
+// there's no standard postfix_increment functional, so roll an ad hoc one here
+struct postfix_increment
+{
+  using is_transparent = void;
+
+  _CCCL_EXEC_CHECK_DISABLE
+  template <typename T1>
+  _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1) const
+    noexcept(noexcept(THRUST_FWD(t1)++)) -> decltype(THRUST_FWD(t1)++)
+  {
+    return THRUST_FWD(t1)++;
+  }
+}; // end postfix_increment
+
+// there's no standard prefix_decrement functional, so roll an ad hoc one here
+struct prefix_decrement
+{
+  using is_transparent = void;
+
+  _CCCL_EXEC_CHECK_DISABLE
+  template <typename T1>
+  _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1) const
+    noexcept(noexcept(--THRUST_FWD(t1))) -> decltype(--THRUST_FWD(t1))
+  {
+    return --THRUST_FWD(t1);
+  }
+}; // end prefix_decrement
+
+// there's no standard postfix_decrement functional, so roll an ad hoc one here
+struct postfix_decrement
+{
+  using is_transparent = void;
+
+  _CCCL_EXEC_CHECK_DISABLE
+  template <typename T1>
+  _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1) const
+    noexcept(noexcept(THRUST_FWD(t1)--)) -> decltype(THRUST_FWD(t1)--)
+  {
+    return THRUST_FWD(t1)--;
+  }
+}; // end postfix_decrement
+
+// there's no standard bit_not functional, so roll an ad hoc one here
+struct bit_not
+{
+  using is_transparent = void;
+
+  _CCCL_EXEC_CHECK_DISABLE
+  template <typename T1>
+  _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1) const
+    noexcept(noexcept(~THRUST_FWD(t1))) -> decltype(~THRUST_FWD(t1))
+  {
+    return ~THRUST_FWD(t1);
+  }
+}; // end bit_not
+
+#define MAKE_UNARY_COMPOSITE(op, functor)                                        \
+  template <typename A, ::cuda::std::enable_if_t<is_actor<A>::value, int> = 0>  \
+  _CCCL_HOST_DEVICE auto operator op(const A& a)->decltype(compose(functor{}, a)) \
+  {                                                                              \
+    return compose(functor{}, a);                                                \
+  }
+
+MAKE_UNARY_COMPOSITE(+, unary_plus)
+MAKE_UNARY_COMPOSITE(-, thrust::negate<>)
+MAKE_UNARY_COMPOSITE(++, prefix_increment)
+MAKE_UNARY_COMPOSITE(--, prefix_decrement)
+MAKE_UNARY_COMPOSITE(!, thrust::logical_not<>)
+MAKE_UNARY_COMPOSITE(~, bit_not)
+
+#undef MAKE_UNARY_COMPOSITE
+
+#define MAKE_UNARY_COMPOSITE_POSTFIX(op, functor)                                \
+  template <typename A, ::cuda::std::enable_if_t<is_actor<A>::value, int> = 0>  \
+  _CCCL_HOST_DEVICE auto operator op(const A& a, int)->decltype(compose(functor{}, a)) \
+  {                                                                              \
+    return compose(functor{}, a);                                                \
+  }
+
+MAKE_UNARY_COMPOSITE_POSTFIX(++, postfix_increment)
+MAKE_UNARY_COMPOSITE_POSTFIX(--, postfix_decrement)
+
+#undef MAKE_UNARY_COMPOSITE_POSTFIX
+
+// there's no standard assign functional, so roll an ad hoc one here
+struct assign
+{
+  _CCCL_EXEC_CHECK_DISABLE
+  template <typename T1, typename T2>
+  _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const
+    THRUST_DECLTYPE_RETURNS(THRUST_FWD(t1) = THRUST_FWD(t2))
+};
+
+template <typename Eval, typename T>
+_CCCL_HOST_DEVICE auto do_assign(const actor<Eval>& _1, const T& _2) -> decltype(compose(assign{}, _1, _2))
+{
+  return compose(assign{}, _1, _2);
+}
+} // namespace functional
+} // namespace detail
+THRUST_NAMESPACE_END
diff --git a/thrust/thrust/detail/functional/operators/arithmetic_operators.h b/thrust/thrust/detail/functional/operators/arithmetic_operators.h
deleted file mode 100644
index 024b0e0d95..0000000000
---
a/thrust/thrust/detail/functional/operators/arithmetic_operators.h +++ /dev/null @@ -1,263 +0,0 @@ -/* - * Copyright 2008-2013 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header -#include -#include -#include -#include - -THRUST_NAMESPACE_BEGIN -namespace detail -{ -namespace functional -{ - -template -_CCCL_HOST_DEVICE actor>, actor>> _CCCL_HOST_DEVICE -operator-(const actor& _1) -{ - return compose(transparent_unary_operator>(), _1); -} // end operator-() - -// there's no standard unary_plus functional, so roll an ad hoc one here -struct unary_plus -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1) const - noexcept(noexcept(+THRUST_FWD(t1))) -> decltype(+THRUST_FWD(t1)) - { - return +THRUST_FWD(t1); - } -}; - -template -_CCCL_HOST_DEVICE actor, actor>> operator+(const actor& _1) -{ - return compose(transparent_unary_operator(), _1); -} // end operator+() - -template -_CCCL_HOST_DEVICE actor>, actor, typename as_actor::type>> -operator+(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator+() - -template -_CCCL_HOST_DEVICE actor>, typename as_actor::type, actor>> -operator+(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator+() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator+(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator+() - -template -_CCCL_HOST_DEVICE actor>, typename as_actor::type, actor>> -operator-(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator-() - -template -_CCCL_HOST_DEVICE actor>, actor, typename as_actor::type>> -operator-(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator-() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator-(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator-() - -template -_CCCL_HOST_DEVICE -actor>, typename as_actor::type, actor>> -operator*(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator*() - -template -_CCCL_HOST_DEVICE -actor>, actor, typename as_actor::type>> -operator*(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator*() - -template -_CCCL_HOST_DEVICE actor>, 
actor, actor>> -operator*(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator*() - -template -_CCCL_HOST_DEVICE -actor>, actor, typename as_actor::type>> -operator/(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator/() - -template -_CCCL_HOST_DEVICE -actor>, typename as_actor::type, actor>> -operator/(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator/() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator/(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator/() - -template -_CCCL_HOST_DEVICE -actor>, actor, typename as_actor::type>> -operator%(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator%() - -template -_CCCL_HOST_DEVICE -actor>, typename as_actor::type, actor>> -operator%(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator%() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator%(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator%() - -// there's no standard prefix_increment functional, so roll an ad hoc one here -struct prefix_increment -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1) const - noexcept(noexcept(++THRUST_FWD(t1))) -> decltype(++THRUST_FWD(t1)) - { - return ++THRUST_FWD(t1); - } -}; // end prefix_increment - -template -_CCCL_HOST_DEVICE actor, actor>> -operator++(const actor& _1) -{ - return compose(transparent_unary_operator(), _1); -} // end operator++() - -// there's no standard postfix_increment functional, so roll an ad hoc one here -struct postfix_increment -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1) const - noexcept(noexcept(THRUST_FWD(t1)++)) -> decltype(THRUST_FWD(t1)++) - { - return THRUST_FWD(t1)++; - } -}; // end postfix_increment - -template -_CCCL_HOST_DEVICE actor, actor>> -operator++(const actor& _1, int) -{ - return compose(transparent_unary_operator(), _1); -} // end operator++() - -// there's no standard prefix_decrement functional, so roll an ad hoc one here -struct prefix_decrement -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1) const - noexcept(noexcept(--THRUST_FWD(t1))) -> decltype(--THRUST_FWD(t1)) - { - return --THRUST_FWD(t1); - } -}; // end prefix_decrement - -template -_CCCL_HOST_DEVICE actor, actor>> -operator--(const actor& _1) -{ - return compose(transparent_unary_operator(), _1); -} // end operator--() - -// there's no standard postfix_decrement functional, so roll an ad hoc one here -struct postfix_decrement -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1) const - noexcept(noexcept(THRUST_FWD(t1)--)) -> decltype(THRUST_FWD(t1)--) - { - return THRUST_FWD(t1)--; - } -}; // end prefix_increment - -template -_CCCL_HOST_DEVICE actor, actor>> -operator--(const actor& _1, int) -{ - return 
compose(transparent_unary_operator(), _1); -} // end operator--() - -} // namespace functional -} // namespace detail -THRUST_NAMESPACE_END diff --git a/thrust/thrust/detail/functional/operators/assignment_operator.h b/thrust/thrust/detail/functional/operators/assignment_operator.h deleted file mode 100644 index 990bc601b7..0000000000 --- a/thrust/thrust/detail/functional/operators/assignment_operator.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright 2008-2013 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header -#include -#include -#include -#include - -THRUST_NAMESPACE_BEGIN - -// XXX WAR circular inclusion with this forward declaration -template -struct binary_function; - -namespace detail -{ -namespace functional -{ - -// XXX WAR circular inclusion with this forward declaration -template -struct as_actor; - -// there's no standard assign functional, so roll an ad hoc one here -struct assign -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const - noexcept(noexcept(THRUST_FWD(t1) = THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) = THRUST_FWD(t2)) - { - return THRUST_FWD(t1) = THRUST_FWD(t2); - } -}; - -template -struct assign_result -{ - using type = actor, actor, typename as_actor::type>>; -}; // end assign_result - -template -_CCCL_HOST_DEVICE typename assign_result::type do_assign(const actor& _1, const T& _2) -{ - return compose(transparent_binary_operator(), _1, as_actor::convert(_2)); -} // end do_assign() - -} // namespace functional -} // namespace detail -THRUST_NAMESPACE_END diff --git a/thrust/thrust/detail/functional/operators/bitwise_operators.h b/thrust/thrust/detail/functional/operators/bitwise_operators.h deleted file mode 100644 index c41250a79e..0000000000 --- a/thrust/thrust/detail/functional/operators/bitwise_operators.h +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright 2008-2013 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header -#include -#include -#include -#include - -THRUST_NAMESPACE_BEGIN -namespace detail -{ -namespace functional -{ - -template -_CCCL_HOST_DEVICE actor>, actor, typename as_actor::type>> -operator&(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator&() - -template -_CCCL_HOST_DEVICE actor>, typename as_actor::type, actor>> -operator&(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator&() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator&(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator&() - -template -_CCCL_HOST_DEVICE actor>, actor, typename as_actor::type>> -operator|(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator|() - -template -_CCCL_HOST_DEVICE actor>, typename as_actor::type, actor>> -operator|(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator|() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator|(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator|() - -template -_CCCL_HOST_DEVICE actor>, actor, typename as_actor::type>> -operator^(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator^() - -template -_CCCL_HOST_DEVICE actor>, typename as_actor::type, actor>> -operator^(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator^() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator^(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator^() - -// there's no standard bit_not functional, so roll an ad hoc one here -struct bit_not -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1) const - noexcept(noexcept(~THRUST_FWD(t1))) -> decltype(~THRUST_FWD(t1)) - { - return ~THRUST_FWD(t1); - } -}; // end prefix_increment - -template -_CCCL_HOST_DEVICE actor, actor>> _CCCL_HOST_DEVICE -operator~(const actor& _1) -{ - return compose(transparent_unary_operator(), _1); -} // end operator~() - -// there's no standard bit_lshift functional, so roll an ad hoc one here -struct bit_lshift -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const - noexcept(noexcept(THRUST_FWD(t1) << THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) << THRUST_FWD(t2)) - { - return THRUST_FWD(t1) << THRUST_FWD(t2); - } -}; - -template -_CCCL_HOST_DEVICE actor, actor, typename as_actor::type>> -operator<<(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator<<() - -template -_CCCL_HOST_DEVICE actor, typename as_actor::type, 
actor>> -operator<<(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator<<() - -template -_CCCL_HOST_DEVICE actor, actor, actor>> -operator<<(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator<<() - -// there's no standard bit_rshift functional, so roll an ad hoc one here -struct bit_rshift -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1& t1, T2&& t2) const - noexcept(noexcept(THRUST_FWD(t1) >> THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) >> THRUST_FWD(t2)) - { - return THRUST_FWD(t1) >> THRUST_FWD(t2); - } -}; - -template -_CCCL_HOST_DEVICE actor, actor, typename as_actor::type>> -operator>>(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator>>() - -template -_CCCL_HOST_DEVICE actor, typename as_actor::type, actor>> -operator>>(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator>>() - -template -_CCCL_HOST_DEVICE actor, actor, actor>> -operator>>(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator>>() - -} // namespace functional -} // namespace detail -THRUST_NAMESPACE_END diff --git a/thrust/thrust/detail/functional/operators/compound_assignment_operators.h b/thrust/thrust/detail/functional/operators/compound_assignment_operators.h deleted file mode 100644 index 5163ba5cc9..0000000000 --- a/thrust/thrust/detail/functional/operators/compound_assignment_operators.h +++ /dev/null @@ -1,319 +0,0 @@ -/* - * Copyright 2008-2013 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header -#include -#include -#include - -THRUST_NAMESPACE_BEGIN -namespace detail -{ -namespace functional -{ - -// there's no standard plus_equal functional, so roll an ad hoc one here -struct plus_equal -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const - noexcept(noexcept(THRUST_FWD(t1) += THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) += THRUST_FWD(t2)) - { - return THRUST_FWD(t1) += THRUST_FWD(t2); - } -}; - -template -_CCCL_HOST_DEVICE actor, actor, typename as_actor::type>> -operator+=(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator+=() - -template -_CCCL_HOST_DEVICE actor, actor, actor>> -operator+=(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator+=() - -// there's no standard minus_equal functional, so roll an ad hoc one here -struct minus_equal -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const - noexcept(noexcept(THRUST_FWD(t1) -= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) -= THRUST_FWD(t2)) - { - return THRUST_FWD(t1) -= THRUST_FWD(t2); - } -}; - -template -_CCCL_HOST_DEVICE actor, actor, typename as_actor::type>> -operator-=(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator-=() - -template -_CCCL_HOST_DEVICE actor, actor, actor>> -operator-=(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator-=() - -// there's no standard multiplies_equal functional, so roll an ad hoc one here -struct multiplies_equal -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const - noexcept(noexcept(THRUST_FWD(t1) *= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) *= THRUST_FWD(t2)) - { - return THRUST_FWD(t1) *= THRUST_FWD(t2); - } -}; - -template -_CCCL_HOST_DEVICE actor, actor, typename as_actor::type>> -operator*=(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator*=() - -template -_CCCL_HOST_DEVICE actor, actor, actor>> -operator*=(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator*=() - -// there's no standard divides_equal functional, so roll an ad hoc one here -struct divides_equal -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const - noexcept(noexcept(THRUST_FWD(t1) /= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) /= THRUST_FWD(t2)) - { - return THRUST_FWD(t1) /= THRUST_FWD(t2); - } -}; - -template -_CCCL_HOST_DEVICE actor, actor, typename as_actor::type>> -operator/=(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator/=() - -template -_CCCL_HOST_DEVICE actor, actor, actor>> 
-operator/=(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator/=() - -// there's no standard modulus_equal functional, so roll an ad hoc one here -struct modulus_equal -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const - noexcept(noexcept(THRUST_FWD(t1) %= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) %= THRUST_FWD(t2)) - { - return THRUST_FWD(t1) %= THRUST_FWD(t2); - } -}; - -template -_CCCL_HOST_DEVICE actor, actor, typename as_actor::type>> -operator%=(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator%=() - -template -_CCCL_HOST_DEVICE actor, actor, actor>> -operator%=(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator%=() - -// there's no standard bit_and_equal functional, so roll an ad hoc one here -struct bit_and_equal -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const - noexcept(noexcept(THRUST_FWD(t1) &= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) &= THRUST_FWD(t2)) - { - return THRUST_FWD(t1) &= THRUST_FWD(t2); - } -}; - -template -_CCCL_HOST_DEVICE actor, actor, typename as_actor::type>> -operator&=(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator&=() - -template -_CCCL_HOST_DEVICE actor, actor, actor>> -operator&=(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator&=() - -// there's no standard bit_or_equal functional, so roll an ad hoc one here -struct bit_or_equal -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const - noexcept(noexcept(THRUST_FWD(t1) |= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) |= THRUST_FWD(t2)) - { - return THRUST_FWD(t1) |= THRUST_FWD(t2); - } -}; - -template -_CCCL_HOST_DEVICE actor, actor, typename as_actor::type>> -operator|=(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator|=() - -template -_CCCL_HOST_DEVICE actor, actor, actor>> -operator|=(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator|=() - -// there's no standard bit_xor_equal functional, so roll an ad hoc one here -struct bit_xor_equal -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const - noexcept(noexcept(THRUST_FWD(t1) ^= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) ^= THRUST_FWD(t2)) - { - return THRUST_FWD(t1) ^= THRUST_FWD(t2); - } -}; - -template -_CCCL_HOST_DEVICE actor, actor, typename as_actor::type>> -operator^=(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator|=() - -template -_CCCL_HOST_DEVICE actor, actor, actor>> -operator^=(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator|=() - -// there's no standard bit_lshift_equal functional, so roll an ad hoc one here -struct 
bit_lshift_equal -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const - noexcept(noexcept(THRUST_FWD(t1) <<= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) <<= THRUST_FWD(t2)) - { - return THRUST_FWD(t1) <<= THRUST_FWD(t2); - } -}; -template -_CCCL_HOST_DEVICE actor, actor, typename as_actor::type>> -operator<<=(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator<<=() - -template -_CCCL_HOST_DEVICE actor, actor, actor>> -operator<<=(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator<<=() - -// there's no standard bit_rshift_equal functional, so roll an ad hoc one here -struct bit_rshift_equal -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const - noexcept(noexcept(THRUST_FWD(t1) >>= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) >>= THRUST_FWD(t2)) - { - return THRUST_FWD(t1) >>= THRUST_FWD(t2); - } -}; - -template -_CCCL_HOST_DEVICE actor, actor, typename as_actor::type>> -operator>>=(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator>>=() - -template -_CCCL_HOST_DEVICE actor, actor, actor>> -operator>>=(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator>>=() - -} // namespace functional -} // namespace detail -THRUST_NAMESPACE_END diff --git a/thrust/thrust/detail/functional/operators/logical_operators.h b/thrust/thrust/detail/functional/operators/logical_operators.h deleted file mode 100644 index 75ed46cc96..0000000000 --- a/thrust/thrust/detail/functional/operators/logical_operators.h +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright 2008-2013 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header -#include -#include -#include -#include - -THRUST_NAMESPACE_BEGIN -namespace detail -{ -namespace functional -{ - -template -_CCCL_HOST_DEVICE -actor>, actor, typename as_actor::type>> -operator&&(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator&&() - -template -_CCCL_HOST_DEVICE -actor>, typename as_actor::type, actor>> -operator&&(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator&&() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator&&(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator&&() - -template -_CCCL_HOST_DEVICE -actor>, actor, typename as_actor::type>> -operator||(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator&&() - -template -_CCCL_HOST_DEVICE -actor>, typename as_actor::type, actor>> -operator||(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator&&() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator||(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator&&() - -template -_CCCL_HOST_DEVICE actor>, actor>> -operator!(const actor& _1) -{ - return compose(transparent_unary_operator>(), _1); -} // end operator!() - -} // namespace functional -} // namespace detail -THRUST_NAMESPACE_END diff --git a/thrust/thrust/detail/functional/operators/operator_adaptors.h b/thrust/thrust/detail/functional/operators/operator_adaptors.h deleted file mode 100644 index 31587ac734..0000000000 --- a/thrust/thrust/detail/functional/operators/operator_adaptors.h +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright 2008-2013 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header -#include -#include -#include -#include - -#include - -THRUST_NAMESPACE_BEGIN -namespace detail -{ -namespace functional -{ - -// Adapts a transparent unary functor from functional.h (e.g. thrust::negate<>) -// into the Eval interface. 
-template -struct transparent_unary_operator -{ - template - using operator_type = UnaryFunctor; - - template - using argument = - typename thrust::detail::eval_if::value != 1, - thrust::detail::identity_>, - thrust::detail::functional::argument_helper<0, Env>>::type; - - template - struct result_type_impl - { - using type = decltype(std::declval()(std::declval>())); - }; - - template - using result_type = - typename thrust::detail::eval_if, argument>::value, - thrust::detail::identity_>, - result_type_impl>::type; - - template - struct result - { - using op_type = UnaryFunctor; - using type = result_type; - }; - - template - _CCCL_HOST_DEVICE result_type eval(Env&& e) const THRUST_RETURNS(UnaryFunctor{}(thrust::get<0>(THRUST_FWD(e)))) -}; - -// Adapts a transparent binary functor from functional.h (e.g. thrust::less<>) -// into the Eval interface. -template -struct transparent_binary_operator -{ - template - using operator_type = BinaryFunctor; - - template - using first_argument = - typename thrust::detail::eval_if::value != 2, - thrust::detail::identity_>, - thrust::detail::functional::argument_helper<0, Env>>::type; - - template - using second_argument = - typename thrust::detail::eval_if::value != 2, - thrust::detail::identity_>, - thrust::detail::functional::argument_helper<1, Env>>::type; - - template - struct result_type_impl - { - using type = decltype(std::declval()( - std::declval>(), std::declval>())); - }; - - template - using result_type = - typename thrust::detail::eval_if<(std::is_same, first_argument>::value - || std::is_same, second_argument>::value), - thrust::detail::identity_>, - result_type_impl>::type; - - template - struct result - { - using op_type = BinaryFunctor; - using type = result_type; - }; - - template - _CCCL_HOST_DEVICE result_type eval(Env&& e) const - THRUST_RETURNS(BinaryFunctor{}(thrust::get<0>(e), thrust::get<1>(e))) -}; - -} // namespace functional -} // namespace detail -THRUST_NAMESPACE_END diff --git a/thrust/thrust/detail/functional/operators/relational_operators.h b/thrust/thrust/detail/functional/operators/relational_operators.h deleted file mode 100644 index d58c2fb67f..0000000000 --- a/thrust/thrust/detail/functional/operators/relational_operators.h +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright 2008-2013 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header -#include -#include -#include -#include - -THRUST_NAMESPACE_BEGIN -namespace detail -{ -namespace functional -{ - -template -_CCCL_HOST_DEVICE -actor>, actor, typename as_actor::type>> -operator==(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator==() - -template -_CCCL_HOST_DEVICE -actor>, typename as_actor::type, actor>> -operator==(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator==() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator==(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator==() - -template -_CCCL_HOST_DEVICE -actor>, actor, typename as_actor::type>> -operator!=(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator!=() - -template -_CCCL_HOST_DEVICE -actor>, typename as_actor::type, actor>> -operator!=(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator!=() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator!=(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator!=() - -template -_CCCL_HOST_DEVICE -actor>, actor, typename as_actor::type>> -operator>(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator>() - -template -_CCCL_HOST_DEVICE -actor>, typename as_actor::type, actor>> -operator>(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator>() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator>(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator>() - -template -_CCCL_HOST_DEVICE actor>, actor, typename as_actor::type>> -operator<(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator<() - -template -_CCCL_HOST_DEVICE actor>, typename as_actor::type, actor>> -operator<(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator<() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator<(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator<() - -template -_CCCL_HOST_DEVICE -actor>, actor, typename as_actor::type>> -operator>=(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator>=() - -template -_CCCL_HOST_DEVICE -actor>, typename as_actor::type, actor>> -operator>=(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator>=() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator>=(const actor& _1, const 
actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator>=() - -template -_CCCL_HOST_DEVICE -actor>, actor, typename as_actor::type>> -operator<=(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator<=() - -template -_CCCL_HOST_DEVICE -actor>, typename as_actor::type, actor>> -operator<=(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator<=() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator<=(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator<=() - -} // namespace functional -} // namespace detail -THRUST_NAMESPACE_END diff --git a/thrust/thrust/detail/functional/placeholder.h b/thrust/thrust/detail/functional/placeholder.h deleted file mode 100644 index a95d4d506a..0000000000 --- a/thrust/thrust/detail/functional/placeholder.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright 2008-2013 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header -#include -#include - -THRUST_NAMESPACE_BEGIN -namespace detail -{ -namespace functional -{ - -template -struct placeholder -{ - using type = actor>; -}; - -} // namespace functional -} // namespace detail -THRUST_NAMESPACE_END diff --git a/thrust/thrust/detail/functional/value.h b/thrust/thrust/detail/functional/value.h deleted file mode 100644 index e2ce136b04..0000000000 --- a/thrust/thrust/detail/functional/value.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright 2008-2013 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// Portions of this code are derived from -// -// Manjunath Kudlur's Carbon library -// -// and -// -// Based on Boost.Phoenix v1.2 -// Copyright (c) 2001-2002 Joel de Guzman - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header -#include - -THRUST_NAMESPACE_BEGIN -namespace detail -{ -namespace functional -{ - -template -struct actor; - -template -class value -{ -public: - template - struct result - { - using type = T; - }; - - _CCCL_HOST_DEVICE value(const T& arg) - : m_val(arg) - {} - - template - _CCCL_HOST_DEVICE T eval(const Env&) const - { - return m_val; - } - -private: - T m_val; -}; // end value - -template -_CCCL_HOST_DEVICE actor> val(const T& x) -{ - return value(x); -} // end val() - -} // namespace functional -} // namespace detail -THRUST_NAMESPACE_END diff --git a/thrust/thrust/functional.h b/thrust/thrust/functional.h index 4b88f46954..9c8d8d2938 100644 --- a/thrust/thrust/functional.h +++ b/thrust/thrust/functional.h @@ -29,7 +29,7 @@ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header -#include +#include #include From 352638b4125af488608ed032e8d9652b5080eb23 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 14 Aug 2024 13:45:19 +0200 Subject: [PATCH 23/33] Add missing annotations for deprecated debug_sync APIs (#2212) --- cub/cub/device/dispatch/dispatch_histogram.cuh | 1 + cub/cub/device/dispatch/dispatch_rle.cuh | 1 + 2 files changed, 2 insertions(+) diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index c623cda9a2..aa8cc2f5c0 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -1593,6 +1593,7 @@ public: } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t DispatchEven( void* d_temp_storage, size_t& temp_storage_bytes, diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index 917b5df37b..2ca3527b60 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -544,6 +544,7 @@ struct DeviceRleDispatch } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, From dded5f1ac6c48c71215c70835f9ed0babaad4a3a Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 14 Aug 2024 13:56:43 +0200 Subject: [PATCH 24/33] Test thrust headers for disabled half/bf16 support (#2219) --- cub/cmake/CubHeaderTesting.cmake | 2 ++ thrust/cmake/ThrustHeaderTesting.cmake | 37 ++++++++++++++++++++------ thrust/cmake/header_test.in | 15 +++++++++++ 3 files changed, 46 insertions(+), 8 deletions(-) diff --git a/cub/cmake/CubHeaderTesting.cmake b/cub/cmake/CubHeaderTesting.cmake index f0ca17186c..fdf9be3be4 100644 --- a/cub/cmake/CubHeaderTesting.cmake +++ b/cub/cmake/CubHeaderTesting.cmake @@ -42,12 +42,14 @@ set(header_definitions "CUB_WRAPPED_NAMESPACE=wrapped_cub") cub_add_header_test(base "${header_definitions}") +# Check that BF16 support can be disabled set(header_definitions 
"THRUST_WRAPPED_NAMESPACE=wrapped_thrust" "CUB_WRAPPED_NAMESPACE=wrapped_cub" "CCCL_DISABLE_BF16_SUPPORT") cub_add_header_test(bf16 "${header_definitions}") +# Check that half support can be disabled set(header_definitions "THRUST_WRAPPED_NAMESPACE=wrapped_thrust" "CUB_WRAPPED_NAMESPACE=wrapped_cub" diff --git a/thrust/cmake/ThrustHeaderTesting.cmake b/thrust/cmake/ThrustHeaderTesting.cmake index ad438b0f87..4c1d07f744 100644 --- a/thrust/cmake/ThrustHeaderTesting.cmake +++ b/thrust/cmake/ThrustHeaderTesting.cmake @@ -7,7 +7,7 @@ # Meta target for all configs' header builds: add_custom_target(thrust.all.headers) -foreach(thrust_target IN LISTS THRUST_TARGETS) +function(thrust_add_header_test thrust_target label definitions) thrust_get_target_property(config_host ${thrust_target} HOST) thrust_get_target_property(config_device ${thrust_target} DEVICE) thrust_get_target_property(config_prefix ${thrust_target} PREFIX) @@ -115,14 +115,10 @@ foreach(thrust_target IN LISTS THRUST_TARGETS) list(APPEND headertest_srcs "${headertest_src}") endforeach() - set(headertest_target ${config_prefix}.headers) + set(headertest_target ${config_prefix}.headers.${label}) add_library(${headertest_target} OBJECT ${headertest_srcs}) target_link_libraries(${headertest_target} PUBLIC ${thrust_target}) - # Wrap Thrust/CUB in a custom namespace to check proper use of ns macros: - target_compile_definitions(${headertest_target} PRIVATE - "THRUST_WRAPPED_NAMESPACE=wrapped_thrust" - "CUB_WRAPPED_NAMESPACE=wrapped_cub" - ) + target_compile_definitions(${headertest_target} PRIVATE ${header_definitions}) thrust_clone_target_properties(${headertest_target} ${thrust_target}) if ("CUDA" STREQUAL "${config_device}") @@ -141,4 +137,29 @@ foreach(thrust_target IN LISTS THRUST_TARGETS) add_dependencies(thrust.all.headers ${headertest_target}) add_dependencies(${config_prefix}.all ${headertest_target}) -endforeach() +endfunction() + +foreach(thrust_target IN LISTS THRUST_TARGETS) + # Wrap Thrust/CUB in a custom namespace to check proper use of ns macros: + set(header_definitions + "THRUST_WRAPPED_NAMESPACE=wrapped_thrust" + "CUB_WRAPPED_NAMESPACE=wrapped_cub") + thrust_add_header_test(${thrust_target} base "${header_definitions}") + + thrust_get_target_property(config_device ${thrust_target} DEVICE) + if ("CUDA" STREQUAL "${config_device}") + # Check that BF16 support can be disabled + set(header_definitions + "THRUST_WRAPPED_NAMESPACE=wrapped_thrust" + "CUB_WRAPPED_NAMESPACE=wrapped_cub" + "CCCL_DISABLE_BF16_SUPPORT") + thrust_add_header_test(${thrust_target} bf16 "${header_definitions}") + + # Check that half support can be disabled + set(header_definitions + "THRUST_WRAPPED_NAMESPACE=wrapped_thrust" + "CUB_WRAPPED_NAMESPACE=wrapped_cub" + "CCCL_DISABLE_FP16_SUPPORT") + thrust_add_header_test(${thrust_target} half "${header_definitions}") + endif() +endforeach () diff --git a/thrust/cmake/header_test.in b/thrust/cmake/header_test.in index 59e44e03c1..236cb9bde4 100644 --- a/thrust/cmake/header_test.in +++ b/thrust/cmake/header_test.in @@ -64,3 +64,18 @@ #endif // THRUST_IGNORE_MACRO_CHECKS #include + +#if defined(CCCL_DISABLE_BF16_SUPPORT) +#if defined(__CUDA_BF16_TYPES_EXIST__) +#error Thrust should not include cuda_bf16.h when BF16 support is disabled +#endif // __CUDA_BF16_TYPES_EXIST__ +#endif // CCCL_DISABLE_BF16_SUPPORT + +#if defined(CCCL_DISABLE_FP16_SUPPORT) +#if defined(__CUDA_FP16_TYPES_EXIST__) +#error Thrust should not include cuda_fp16.h when half support is disabled +#endif // __CUDA_FP16_TYPES_EXIST__ +#if 
defined(__CUDA_BF16_TYPES_EXIST__) +#error Thrust should not include cuda_bf16.h when half support is disabled +#endif // __CUDA_BF16_TYPES_EXIST__ +#endif // CCCL_DISABLE_FP16_SUPPORT From 1981c4972c0fc95b4180c16cf3b39f3fd87c1c25 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 14 Aug 2024 15:00:35 +0200 Subject: [PATCH 25/33] Make cuda::std::max constexpr in C++11 (#2107) --- libcudacxx/include/cuda/std/__algorithm/comp.h | 2 +- libcudacxx/include/cuda/std/__algorithm/max.h | 4 ++-- .../std/algorithms/alg.sorting/alg.min.max/max.pass.cpp | 4 ++++ .../std/algorithms/alg.sorting/alg.min.max/max_comp.pass.cpp | 4 ++++ 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/libcudacxx/include/cuda/std/__algorithm/comp.h b/libcudacxx/include/cuda/std/__algorithm/comp.h index 5427fc7e16..2e5c81ed45 100644 --- a/libcudacxx/include/cuda/std/__algorithm/comp.h +++ b/libcudacxx/include/cuda/std/__algorithm/comp.h @@ -46,7 +46,7 @@ struct __is_trivial_equality_predicate<__equal_to, _Lhs, _Rhs> : true_type struct __less { template - _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr bool operator()(const _Tp& __lhs, const _Up& __rhs) const noexcept(noexcept(__lhs < __rhs)) { return __lhs < __rhs; diff --git a/libcudacxx/include/cuda/std/__algorithm/max.h b/libcudacxx/include/cuda/std/__algorithm/max.h index 4fec573393..28677d6b7a 100644 --- a/libcudacxx/include/cuda/std/__algorithm/max.h +++ b/libcudacxx/include/cuda/std/__algorithm/max.h @@ -30,14 +30,14 @@ _CCCL_PUSH_MACROS _LIBCUDACXX_BEGIN_NAMESPACE_STD template -_CCCL_NODISCARD inline _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 const _Tp& +_CCCL_NODISCARD inline _LIBCUDACXX_INLINE_VISIBILITY constexpr const _Tp& max(const _Tp& __a, const _Tp& __b, _Compare __comp) { return __comp(__a, __b) ? 
__b : __a; } template -_CCCL_NODISCARD inline _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 const _Tp& max(const _Tp& __a, const _Tp& __b) +_CCCL_NODISCARD inline _LIBCUDACXX_INLINE_VISIBILITY constexpr const _Tp& max(const _Tp& __a, const _Tp& __b) { return _CUDA_VSTD::max(__a, __b, __less{}); } diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/max.pass.cpp b/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/max.pass.cpp index 15631df845..c1b6ef02de 100644 --- a/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/max.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/max.pass.cpp @@ -53,6 +53,10 @@ int main(int, char**) test(); #if TEST_STD_VER >= 2014 static_assert(test(), ""); +#else // TEST_STD_VER >= 2014 + constexpr int x = 0; + constexpr int y = 1; + static_assert(&cuda::std::max(x, y) == &y, ""); #endif // TEST_STD_VER >= 2014 return 0; diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/max_comp.pass.cpp b/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/max_comp.pass.cpp index d24b1ffe30..526f692f08 100644 --- a/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/max_comp.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/max_comp.pass.cpp @@ -55,6 +55,10 @@ int main(int, char**) test(); #if TEST_STD_VER >= 2014 static_assert(test(), ""); +#else // TEST_STD_VER >= 2014 + constexpr int x = 0; + constexpr int y = 1; + static_assert(&cuda::std::max(x, y, cuda::std::greater()) == &x, ""); #endif // TEST_STD_VER >= 2014 return 0; From 73df2b0eecab0f3d7b2693aff590c7ef139e51bf Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 14 Aug 2024 15:20:11 +0200 Subject: [PATCH 26/33] Fix ForEachCopyN for non-contiguous iterators (#2220) By falling back to a non-load-vectorizing code path. 
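A minimal usage sketch (not taken from the patch) of what this enables, mirroring the counting-iterator tests added below: passing a non-contiguous iterator such as cub::CountingInputIterator to cub::DeviceFor::ForEachCopyN now compiles and dispatches to the non-vectorized path instead of hitting the previous contiguous-iterator static_assert. The names mark_op, visit_all, flags, and n are illustrative placeholders.

    #include <thrust/device_vector.h>

    #include <cub/device/device_for.cuh>
    #include <cub/iterator/counting_input_iterator.cuh>

    // Device-side functor: marks every index it is invoked with.
    struct mark_op
    {
      int* flags;
      __device__ void operator()(int i) const
      {
        flags[i] = 1;
      }
    };

    cudaError_t visit_all(int n)
    {
      thrust::device_vector<int> flags(n, 0);
      cub::CountingInputIterator<int> first(0); // non-contiguous input iterator
      cudaError_t error =
        cub::DeviceFor::ForEachCopyN(first, n, mark_op{thrust::raw_pointer_cast(flags.data())});
      if (error != cudaSuccess)
      {
        return error;
      }
      // Synchronize before flags goes out of scope, since ForEachCopyN runs asynchronously.
      return cudaDeviceSynchronize();
    }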
Fixes: #2207 --- cub/cub/device/device_for.cuh | 28 +++++++------------------ cub/test/catch2_test_device_for.cu | 21 +++++++++++++++++++ cub/test/catch2_test_device_for_copy.cu | 21 +++++++++++++++++++ 3 files changed, 50 insertions(+), 20 deletions(-) diff --git a/cub/cub/device/device_for.cuh b/cub/cub/device/device_for.cuh index 5384748942..0e0bcaa36c 100644 --- a/cub/cub/device/device_for.cuh +++ b/cub/cub/device/device_for.cuh @@ -145,13 +145,12 @@ private: return detail::for_each::dispatch_t::dispatch(num_items, wrapped_op_t{first, op}, stream); } - template + template CUB_RUNTIME_FUNCTION static cudaError_t for_each_n( - RandomAccessIteratorT first, OffsetT num_items, OpT op, cudaStream_t stream, ::cuda::std::true_type /* vectorize */) + ContiguousIteratorT first, OffsetT num_items, OpT op, cudaStream_t stream, ::cuda::std::true_type /* vectorize */) { - auto unwrapped_first = THRUST_NS_QUALIFIER::raw_pointer_cast(&*first); - using wrapped_op_t = - detail::for_each::op_wrapper_vectorized_t>; + auto* unwrapped_first = THRUST_NS_QUALIFIER::unwrap_contiguous_iterator(first); + using wrapped_op_t = detail::for_each::op_wrapper_vectorized_t>; if (is_aligned(unwrapped_first)) { // Vectorize loads @@ -587,14 +586,12 @@ private: CUB_RUNTIME_FUNCTION static cudaError_t ForEachNNoNVTX(RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {}) { - using offset_t = NumItemsT; - using use_vectorization_t = ::cuda::std::integral_constant; - + using offset_t = NumItemsT; // Disable auto-vectorization for now: // constexpr bool use_vectorization = // detail::for_each::can_regain_copy_freedom, OpT>::value // && THRUST_NS_QUALIFIER::is_contiguous_iterator::value; - + using use_vectorization_t = ::cuda::std::bool_constant; return for_each_n(first, num_items, op, stream, use_vectorization_t{}); } @@ -717,12 +714,8 @@ private: CUB_RUNTIME_FUNCTION static cudaError_t ForEachCopyNNoNVTX(RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {}) { - static_assert(THRUST_NS_QUALIFIER::is_contiguous_iterator::value, - "Iterator must be contiguous"); - using offset_t = NumItemsT; - using use_vectorization_t = ::cuda::std::integral_constant; - + using use_vectorization_t = THRUST_NS_QUALIFIER::is_contiguous_iterator; return for_each_n(first, num_items, op, stream, use_vectorization_t{}); } @@ -837,13 +830,8 @@ public: ForEachCopy(RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {}) { CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceFor::ForEachCopy"); - static_assert(THRUST_NS_QUALIFIER::is_contiguous_iterator::value, - "Iterator must be contiguous"); - - using offset_t = typename THRUST_NS_QUALIFIER::iterator_traits::difference_type; - + using offset_t = typename THRUST_NS_QUALIFIER::iterator_traits::difference_type; const auto num_items = static_cast(THRUST_NS_QUALIFIER::distance(first, last)); - return ForEachCopyNNoNVTX(first, num_items, op, stream); } }; diff --git a/cub/test/catch2_test_device_for.cu b/cub/test/catch2_test_device_for.cu index 62ccfec02c..e54eb3ebc8 100644 --- a/cub/test/catch2_test_device_for.cu +++ b/cub/test/catch2_test_device_for.cu @@ -29,6 +29,7 @@ // above header needs to be included first #include +#include #include #include @@ -246,3 +247,23 @@ CUB_TEST("Device for each n works with unaligned vectors", "[for][device]", offs REQUIRE(num_of_once_marked_items == num_items); } + +CUB_TEST("Device for each works with couting iterator", "[for][device]") +{ + using offset_t = int; + constexpr offset_t 
max_items = 5000000; + constexpr offset_t min_items = 1; + const offset_t num_items = GENERATE_COPY( + take(3, random(min_items, max_items)), + values({ + min_items, + max_items, + })); + + const auto it = cub::CountingInputIterator{0}; + c2h::device_vector counts(num_items); + device_for_each(it, it + num_items, incrementer_t{thrust::raw_pointer_cast(counts.data())}); + + const auto num_of_once_marked_items = static_cast(thrust::count(counts.begin(), counts.end(), 1)); + REQUIRE(num_of_once_marked_items == num_items); +} diff --git a/cub/test/catch2_test_device_for_copy.cu b/cub/test/catch2_test_device_for_copy.cu index 2263b3987e..fdb117eff6 100644 --- a/cub/test/catch2_test_device_for_copy.cu +++ b/cub/test/catch2_test_device_for_copy.cu @@ -29,6 +29,7 @@ // above header needs to be included first #include +#include #include #include @@ -186,3 +187,23 @@ CUB_TEST("Device for each n works with unaligned vectors", "[for_copy][device]", REQUIRE(num_of_once_marked_items == num_items); } + +CUB_TEST("Device for each works with couting iterator", "[for][device]") +{ + using offset_t = int; + constexpr offset_t max_items = 5000000; + constexpr offset_t min_items = 1; + const offset_t num_items = GENERATE_COPY( + take(3, random(min_items, max_items)), + values({ + min_items, + max_items, + })); + + const auto it = cub::CountingInputIterator{0}; + c2h::device_vector counts(num_items); + device_for_each_copy(it, it + num_items, incrementer_t{thrust::raw_pointer_cast(counts.data())}); + + const auto num_of_once_marked_items = static_cast(thrust::count(counts.begin(), counts.end(), 1)); + REQUIRE(num_of_once_marked_items == num_items); +} From cbce14b74a25abd2fcee5581d52b655e6f75e24b Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 14 Aug 2024 18:44:48 +0200 Subject: [PATCH 27/33] Configure CUB/Thrust for C++17 by default (#2217) --- cub/cmake/CubBuildTargetList.cmake | 10 +++++----- thrust/cmake/ThrustMultiConfig.cmake | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cub/cmake/CubBuildTargetList.cmake b/cub/cmake/CubBuildTargetList.cmake index 5277f59e99..2a0827a894 100644 --- a/cub/cmake/CubBuildTargetList.cmake +++ b/cub/cmake/CubBuildTargetList.cmake @@ -40,12 +40,12 @@ set(CUB_CPP_DIALECT_OPTIONS ) define_property(TARGET PROPERTY _CUB_DIALECT - BRIEF_DOCS "A target's C++ dialect: 11, 14, or 17." - FULL_DOCS "A target's C++ dialect: 11, 14, or 17." + BRIEF_DOCS "A target's C++ dialect: 11, 14, 17 or 20." + FULL_DOCS "A target's C++ dialect: 11, 14, 17 or 20." ) define_property(TARGET PROPERTY _CUB_PREFIX - BRIEF_DOCS "A prefix describing the config, eg. 'cub.cpp14'." - FULL_DOCS "A prefix describing the config, eg. 'cub.cpp14'." + BRIEF_DOCS "A prefix describing the config, eg. 'cub.cpp17'." + FULL_DOCS "A prefix describing the config, eg. 'cub.cpp17'." 
) function(cub_set_target_properties target_name dialect prefix) @@ -134,7 +134,7 @@ function(cub_build_target_list) foreach (dialect IN LISTS CUB_CPP_DIALECT_OPTIONS) # Create CMake options: set(default_value OFF) - if (dialect EQUAL 14) # Default to just 14 on: + if (dialect EQUAL 17) # Default to just 17 on: set(default_value ON) endif() option(CUB_ENABLE_DIALECT_CPP${dialect} diff --git a/thrust/cmake/ThrustMultiConfig.cmake b/thrust/cmake/ThrustMultiConfig.cmake index aa9fc02266..46bffc761c 100644 --- a/thrust/cmake/ThrustMultiConfig.cmake +++ b/thrust/cmake/ThrustMultiConfig.cmake @@ -15,7 +15,7 @@ function(thrust_configure_multiconfig) # Handle dialect options: foreach (dialect IN LISTS THRUST_CPP_DIALECT_OPTIONS) set(default_value OFF) - if (dialect EQUAL 14) # Default to just 14 on: + if (dialect EQUAL 17) # Default to just 17 on: set(default_value ON) endif() option(THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${dialect} @@ -112,7 +112,7 @@ function(thrust_configure_multiconfig) set_property(CACHE THRUST_DEVICE_SYSTEM PROPERTY TYPE STRING) endif() - set(THRUST_CPP_DIALECT 14 + set(THRUST_CPP_DIALECT 17 CACHE STRING "The C++ standard to target: ${THRUST_CPP_DIALECT_OPTIONS}" ) set_property(CACHE THRUST_CPP_DIALECT From e42341248f5340bf46430e7af3987ec98011afe6 Mon Sep 17 00:00:00 2001 From: Stephen Nicholas Swatman Date: Thu, 15 Aug 2024 15:15:57 +0200 Subject: [PATCH 28/33] Allow installing components when downstream (#2096) In the @acts-project we adopt an (admittedly somewhat unconventional) build system in which software A depends on B, and B depends on CCCL. The setup is that we want to install B into a prefix, and then try to build A against B. The problem arises because we are using CMake to dynamically fetch CCCL using the so-called "FetchContent" mechanism, which downloads CCCL and then adds it as a subdirectory. The core problem is that installing software B which has included CCCL does not actually install CCCL in the same prefix, so software A cannot then load software B as CCCL is not installed. The reason this happens is that CMakeLists.txt:28 (at the time of writing) returns from the CMake configuration stage early, and leaves the CUB, Thrust, and libcudacxx directories unincluded (see lines 70 to 72). Although this is, again, an unconventional and rare scenario, it should be easy to add support for this kind of build, and I hope the CCCL devs would agree that it might be worth doing. In this commit, I remove the early return and replace it with additional if-statements. This commit should leave any existing workflows completely untouched, but should make it easier to use CCCL in the way we do in @acts-project. --- CMakeLists.txt | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d1ae64e2f5..198727dc5d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,17 +25,18 @@ endif() # Support adding CCCL to a parent project via add_subdirectory. if (NOT CCCL_TOPLEVEL_PROJECT) include(cmake/CCCLAddSubdir.cmake) - return() endif() # We require a higher cmake version for dev builds -cmake_minimum_required(VERSION 3.21) +if (CCCL_TOPLEVEL_PROJECT) + cmake_minimum_required(VERSION 3.21) +endif() -option(CCCL_ENABLE_LIBCUDACXX "Enable the libcu++ developer build." ON) -option(CCCL_ENABLE_CUB "Enable the CUB developer build." ON) -option(CCCL_ENABLE_THRUST "Enable the Thrust developer build." ON) -option(CCCL_ENABLE_TESTING "Enable CUDA C++ Core Library tests."
ON) -option(CCCL_ENABLE_EXAMPLES "Enable CUDA C++ Core Library examples." ON) +option(CCCL_ENABLE_LIBCUDACXX "Enable the libcu++ developer build." ${CCCL_TOPLEVEL_PROJECT}) +option(CCCL_ENABLE_CUB "Enable the CUB developer build." ${CCCL_TOPLEVEL_PROJECT}) +option(CCCL_ENABLE_THRUST "Enable the Thrust developer build." ${CCCL_TOPLEVEL_PROJECT}) +option(CCCL_ENABLE_TESTING "Enable CUDA C++ Core Library tests." ${CCCL_TOPLEVEL_PROJECT}) +option(CCCL_ENABLE_EXAMPLES "Enable CUDA C++ Core Library examples." ${CCCL_TOPLEVEL_PROJECT}) option(CCCL_ENABLE_BENCHMARKS "Enable CUDA C++ Core Library benchmarks." OFF) option(CCCL_ENABLE_UNSTABLE "Enable targets and developer build options for unstable projects." OFF) @@ -44,27 +45,28 @@ if (CCCL_ENABLE_UNSTABLE) option(CCCL_ENABLE_CUDAX "Enable the CUDA Experimental developer build." ON) endif() - include(CTest) enable_testing() -include(cmake/CCCLUtilities.cmake) # include this first -include(cmake/CCCLClangdCompileInfo.cmake) +if (CCCL_TOPLEVEL_PROJECT) + include(cmake/CCCLUtilities.cmake) # include this first + include(cmake/CCCLClangdCompileInfo.cmake) +endif() if (CCCL_ENABLE_LIBCUDACXX) - set(LIBCUDACXX_TOPLEVEL_PROJECT ON) + set(LIBCUDACXX_TOPLEVEL_PROJECT ${CCCL_TOPLEVEL_PROJECT}) endif() if (CCCL_ENABLE_CUB) - set(CUB_TOPLEVEL_PROJECT ON) + set(CUB_TOPLEVEL_PROJECT ${CCCL_TOPLEVEL_PROJECT}) endif() if (CCCL_ENABLE_THRUST) - set(THRUST_TOPLEVEL_PROJECT ON) + set(THRUST_TOPLEVEL_PROJECT ${CCCL_TOPLEVEL_PROJECT}) endif() if (CCCL_ENABLE_CUDAX) - set(cudax_TOPLEVEL_PROJECT ON) + set(cudax_TOPLEVEL_PROJECT ${CCCL_TOPLEVEL_PROJECT}) endif() add_subdirectory(libcudacxx) From 532ff47db0aeae4a3fd4a6e12514e89dc0550a31 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Thu, 15 Aug 2024 20:57:48 +0200 Subject: [PATCH 29/33] Rename the memory resources to drop the superfluous prefix `cuda_` (#2243) --- cudax/test/containers/uninitialized_buffer.cu | 4 +- ...ry_resource.h => device_memory_resource.h} | 69 +++++++++-------- ...y_resource.h => managed_memory_resource.h} | 77 +++++++++---------- ...ry_resource.h => pinned_memory_resource.h} | 77 +++++++++---------- libcudacxx/include/cuda/memory_resource | 6 +- .../allocate.pass.cpp | 2 +- .../equality.pass.cpp | 18 ++--- .../traits.pass.cpp | 2 +- .../allocate.pass.cpp | 2 +- .../equality.pass.cpp | 18 ++--- .../traits.pass.cpp | 2 +- .../allocate.pass.cpp | 2 +- .../equality.pass.cpp | 18 ++--- .../traits.pass.cpp | 2 +- 14 files changed, 150 insertions(+), 149 deletions(-) rename libcudacxx/include/cuda/__memory_resource/{cuda_memory_resource.h => device_memory_resource.h} (61%) rename libcudacxx/include/cuda/__memory_resource/{cuda_managed_memory_resource.h => managed_memory_resource.h} (58%) rename libcudacxx/include/cuda/__memory_resource/{cuda_pinned_memory_resource.h => pinned_memory_resource.h} (58%) rename libcudacxx/test/libcudacxx/cuda/memory_resource/{cuda_memory_resource => device_memory_resource}/allocate.pass.cpp (98%) rename libcudacxx/test/libcudacxx/cuda/memory_resource/{cuda_memory_resource => device_memory_resource}/equality.pass.cpp (88%) rename libcudacxx/test/libcudacxx/cuda/memory_resource/{cuda_memory_resource => device_memory_resource}/traits.pass.cpp (96%) rename libcudacxx/test/libcudacxx/cuda/memory_resource/{cuda_managed_memory_resource => managed_memory_resource}/allocate.pass.cpp (97%) rename libcudacxx/test/libcudacxx/cuda/memory_resource/{cuda_managed_memory_resource => managed_memory_resource}/equality.pass.cpp (84%) rename 
libcudacxx/test/libcudacxx/cuda/memory_resource/{cuda_pinned_memory_resource => managed_memory_resource}/traits.pass.cpp (95%) rename libcudacxx/test/libcudacxx/cuda/memory_resource/{cuda_pinned_memory_resource => pinned_memory_resource}/allocate.pass.cpp (97%) rename libcudacxx/test/libcudacxx/cuda/memory_resource/{cuda_pinned_memory_resource => pinned_memory_resource}/equality.pass.cpp (83%) rename libcudacxx/test/libcudacxx/cuda/memory_resource/{cuda_managed_memory_resource => pinned_memory_resource}/traits.pass.cpp (95%) diff --git a/cudax/test/containers/uninitialized_buffer.cu b/cudax/test/containers/uninitialized_buffer.cu index 73b2a93887..56872e0e54 100644 --- a/cudax/test/containers/uninitialized_buffer.cu +++ b/cudax/test/containers/uninitialized_buffer.cu @@ -61,7 +61,7 @@ TEMPLATE_TEST_CASE( static_assert(!cuda::std::is_copy_constructible::value, ""); static_assert(!cuda::std::is_copy_assignable::value, ""); - cuda::mr::cuda_memory_resource resource{}; + cuda::mr::device_memory_resource resource{}; SECTION("construction") { @@ -89,7 +89,7 @@ TEMPLATE_TEST_CASE( { static_assert(!cuda::std::is_copy_assignable::value, ""); { - cuda::mr::cuda_managed_memory_resource other_resource{}; + cuda::mr::managed_memory_resource other_resource{}; uninitialized_buffer input{other_resource, 42}; uninitialized_buffer buf{resource, 1337}; const auto* old_ptr = buf.data(); diff --git a/libcudacxx/include/cuda/__memory_resource/cuda_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/device_memory_resource.h similarity index 61% rename from libcudacxx/include/cuda/__memory_resource/cuda_memory_resource.h rename to libcudacxx/include/cuda/__memory_resource/device_memory_resource.h index 289dc8c8b3..02e367e041 100644 --- a/libcudacxx/include/cuda/__memory_resource/cuda_memory_resource.h +++ b/libcudacxx/include/cuda/__memory_resource/device_memory_resource.h @@ -39,20 +39,20 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_MR -//! @brief cuda_memory_resource uses `cudaMalloc` / `cudaFree` for allocation / deallocation. +//! @brief device_memory_resource uses `cudaMalloc` / `cudaFree` for allocation / deallocation. //! By default uses device 0 to allocate memory -class cuda_memory_resource +class device_memory_resource { private: int __device_id_{0}; public: - //! @brief default constructs a cuda_memory_resource allocating memory on device 0 - cuda_memory_resource() = default; + //! @brief default constructs a device_memory_resource allocating memory on device 0 + device_memory_resource() = default; - //! @brief default constructs a cuda_memory_resource allocating memory on device \p __device_id + //! @brief default constructs a device_memory_resource allocating memory on device \p __device_id //! @param __device_id The id of the device we are allocating memory on - constexpr cuda_memory_resource(const int __device_id) noexcept + constexpr device_memory_resource(const int __device_id) noexcept : __device_id_(__device_id) {} @@ -85,65 +85,65 @@ class cuda_memory_resource { // We need to ensure that the provided alignment matches the minimal provided alignment _LIBCUDACXX_ASSERT(__is_valid_alignment(__alignment), - "Invalid alignment passed to cuda_memory_resource::deallocate."); - _CCCL_ASSERT_CUDA_API(::cudaFree, "cuda_memory_resource::deallocate failed", __ptr); + "Invalid alignment passed to device_memory_resource::deallocate."); + _CCCL_ASSERT_CUDA_API(::cudaFree, "device_memory_resource::deallocate failed", __ptr); (void) __alignment; } - //! 
@brief Equality comparison with another \c cuda_memory_resource - //! @param __other The other \c cuda_memory_resource + //! @brief Equality comparison with another \c device_memory_resource + //! @param __other The other \c device_memory_resource //! @return true, if both resources hold the same device id - _CCCL_NODISCARD constexpr bool operator==(cuda_memory_resource const& __other) const noexcept + _CCCL_NODISCARD constexpr bool operator==(device_memory_resource const& __other) const noexcept { return __device_id_ == __other.__device_id_; } # if _CCCL_STD_VER <= 2017 - //! @brief Inequality comparison with another \c cuda_memory_resource - //! @param __other The other \c cuda_memory_resource + //! @brief Inequality comparison with another \c device_memory_resource + //! @param __other The other \c device_memory_resource //! @return true, if both resources hold different device id's - _CCCL_NODISCARD constexpr bool operator!=(cuda_memory_resource const& __other) const noexcept + _CCCL_NODISCARD constexpr bool operator!=(device_memory_resource const& __other) const noexcept { return __device_id_ != __other.__device_id_; } # endif // _CCCL_STD_VER <= 2017 - //! @brief Equality comparison between a \c cuda_memory_resource and another resource - //! @param __lhs The \c cuda_memory_resource + //! @brief Equality comparison between a \c device_memory_resource and another resource + //! @param __lhs The \c device_memory_resource //! @param __rhs The resource to compare to //! @return If the underlying types are equality comparable, returns the result of equality comparison of both //! resources. Otherwise, returns false. template - _CCCL_NODISCARD_FRIEND auto operator==(cuda_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) + _CCCL_NODISCARD_FRIEND auto operator==(device_memory_resource const& __lhs, _Resource const& __rhs) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) { - return resource_ref<>{const_cast(__lhs)} == resource_ref<>{const_cast<_Resource&>(__rhs)}; + return resource_ref<>{const_cast(__lhs)} == resource_ref<>{const_cast<_Resource&>(__rhs)}; } # if _CCCL_STD_VER <= 2017 - //! @copydoc cuda_memory_resource::operator==<_Resource>(cuda_memory_resource const&, _Resource const&) + //! @copydoc device_memory_resource::operator==<_Resource>(device_memory_resource const&, _Resource const&) template - _CCCL_NODISCARD_FRIEND auto operator==(_Resource const& __rhs, cuda_memory_resource const& __lhs) noexcept - _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) + _CCCL_NODISCARD_FRIEND auto operator==(_Resource const& __rhs, device_memory_resource const& __lhs) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) { - return resource_ref<>{const_cast(__lhs)} == resource_ref<>{const_cast<_Resource&>(__rhs)}; + return resource_ref<>{const_cast(__lhs)} == resource_ref<>{const_cast<_Resource&>(__rhs)}; } - //! @copydoc cuda_memory_resource::operator==<_Resource>(cuda_memory_resource const&, _Resource const&) + //! 
@copydoc device_memory_resource::operator==<_Resource>(device_memory_resource const&, _Resource const&) template - _CCCL_NODISCARD_FRIEND auto operator!=(cuda_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) + _CCCL_NODISCARD_FRIEND auto operator!=(device_memory_resource const& __lhs, _Resource const& __rhs) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) { - return resource_ref<>{const_cast(__lhs)} != resource_ref<>{const_cast<_Resource&>(__rhs)}; + return resource_ref<>{const_cast(__lhs)} != resource_ref<>{const_cast<_Resource&>(__rhs)}; } - //! @copydoc cuda_memory_resource::operator==<_Resource>(cuda_memory_resource const&, _Resource const&) + //! @copydoc device_memory_resource::operator==<_Resource>(device_memory_resource const&, _Resource const&) template - _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const& __rhs, cuda_memory_resource const& __lhs) noexcept - _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) + _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const& __rhs, device_memory_resource const& __lhs) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) { - return resource_ref<>{const_cast(__lhs)} != resource_ref<>{const_cast<_Resource&>(__rhs)}; + return resource_ref<>{const_cast(__lhs)} != resource_ref<>{const_cast<_Resource&>(__rhs)}; } # endif // _CCCL_STD_VER <= 2017 //! @brief Enables the \c device_accessible property - friend constexpr void get_property(cuda_memory_resource const&, device_accessible) noexcept {} + friend constexpr void get_property(device_memory_resource const&, device_accessible) noexcept {} //! @brief Checks whether the passed in alignment is valid static constexpr bool __is_valid_alignment(const size_t __alignment) noexcept @@ -151,7 +151,10 @@ class cuda_memory_resource return __alignment <= default_cuda_malloc_alignment && (default_cuda_malloc_alignment % __alignment == 0); } }; -static_assert(resource_with, ""); +static_assert(resource_with, ""); + +// For backward compatability +using cuda_memory_resource _LIBCUDACXX_DEPRECATED = device_memory_resource; _LIBCUDACXX_END_NAMESPACE_CUDA_MR diff --git a/libcudacxx/include/cuda/__memory_resource/cuda_managed_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/managed_memory_resource.h similarity index 58% rename from libcudacxx/include/cuda/__memory_resource/cuda_managed_memory_resource.h rename to libcudacxx/include/cuda/__memory_resource/managed_memory_resource.h index a8a42841de..d899ab95a2 100644 --- a/libcudacxx/include/cuda/__memory_resource/cuda_managed_memory_resource.h +++ b/libcudacxx/include/cuda/__memory_resource/managed_memory_resource.h @@ -38,8 +38,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_MR -//! @brief \c cuda_managed_memory_resource uses `cudaMallocManaged` / `cudaFree` for allocation / deallocation. -class cuda_managed_memory_resource +//! @brief \c managed_memory_resource uses `cudaMallocManaged` / `cudaFree` for allocation / deallocation. 
+class managed_memory_resource { private: unsigned int __flags_ = cudaMemAttachGlobal; @@ -47,10 +47,10 @@ class cuda_managed_memory_resource static constexpr unsigned int __available_flags = cudaMemAttachGlobal | cudaMemAttachHost; public: - constexpr cuda_managed_memory_resource(const unsigned int __flags = cudaMemAttachGlobal) noexcept + constexpr managed_memory_resource(const unsigned int __flags = cudaMemAttachGlobal) noexcept : __flags_(__flags & __available_flags) { - _LIBCUDACXX_ASSERT(__flags_ == __flags, "Unexpected flags passed to cuda_managed_memory_resource"); + _LIBCUDACXX_ASSERT(__flags_ == __flags, "Unexpected flags passed to managed_memory_resource"); } //! @brief Allocate CUDA unified memory of size at least \p __bytes. @@ -80,74 +80,70 @@ class cuda_managed_memory_resource { // We need to ensure that the provided alignment matches the minimal provided alignment _LIBCUDACXX_ASSERT(__is_valid_alignment(__alignment), - "Invalid alignment passed to cuda_managed_memory_resource::deallocate."); - _CCCL_ASSERT_CUDA_API(::cudaFree, "cuda_managed_memory_resource::deallocate failed", __ptr); + "Invalid alignment passed to managed_memory_resource::deallocate."); + _CCCL_ASSERT_CUDA_API(::cudaFree, "managed_memory_resource::deallocate failed", __ptr); (void) __alignment; } - //! @brief Equality comparison with another \c cuda_managed_memory_resource - //! @param __other The other \c cuda_managed_memory_resource - //! @return Whether both \c cuda_managed_memory_resource were constructed with the same flags - _CCCL_NODISCARD constexpr bool operator==(cuda_managed_memory_resource const& __other) const noexcept + //! @brief Equality comparison with another \c managed_memory_resource + //! @param __other The other \c managed_memory_resource + //! @return Whether both \c managed_memory_resource were constructed with the same flags + _CCCL_NODISCARD constexpr bool operator==(managed_memory_resource const& __other) const noexcept { return __flags_ == __other.__flags_; } # if _CCCL_STD_VER <= 2017 - //! @brief Inequality comparison with another \c cuda_managed_memory_resource - //! @param __other The other \c cuda_managed_memory_resource - //! @return Whether both \c cuda_managed_memory_resource were constructed with different flags - _CCCL_NODISCARD constexpr bool operator!=(cuda_managed_memory_resource const& __other) const noexcept + //! @brief Inequality comparison with another \c managed_memory_resource + //! @param __other The other \c managed_memory_resource + //! @return Whether both \c managed_memory_resource were constructed with different flags + _CCCL_NODISCARD constexpr bool operator!=(managed_memory_resource const& __other) const noexcept { return __flags_ != __other.__flags_; } # endif // _CCCL_STD_VER <= 2017 - //! @brief Equality comparison between a \c cuda_managed_memory_resource and another resource - //! @param __lhs The \c cuda_managed_memory_resource + //! @brief Equality comparison between a \c managed_memory_resource and another resource + //! @param __lhs The \c managed_memory_resource //! @param __rhs The resource to compare to //! @return If the underlying types are equality comparable, returns the result of equality comparison of both //! resources. Otherwise, returns false. 
template - _CCCL_NODISCARD_FRIEND auto operator==(cuda_managed_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) + _CCCL_NODISCARD_FRIEND auto operator==(managed_memory_resource const& __lhs, _Resource const& __rhs) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) { - return resource_ref<>{const_cast(__lhs)} - == resource_ref<>{const_cast<_Resource&>(__rhs)}; + return resource_ref<>{const_cast(__lhs)} == resource_ref<>{const_cast<_Resource&>(__rhs)}; } # if _CCCL_STD_VER <= 2017 - //! @copydoc cuda_managed_memory_resource::operator<_Resource>==(cuda_managed_memory_resource const&, _Resource + //! @copydoc managed_memory_resource::operator<_Resource>==(managed_memory_resource const&, _Resource //! const&) template - _CCCL_NODISCARD_FRIEND auto operator==(_Resource const& __rhs, cuda_managed_memory_resource const& __lhs) noexcept - _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) + _CCCL_NODISCARD_FRIEND auto operator==(_Resource const& __rhs, managed_memory_resource const& __lhs) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) { - return resource_ref<>{const_cast(__lhs)} - == resource_ref<>{const_cast<_Resource&>(__rhs)}; + return resource_ref<>{const_cast(__lhs)} == resource_ref<>{const_cast<_Resource&>(__rhs)}; } - //! @copydoc cuda_managed_memory_resource::operator<_Resource>==(cuda_managed_memory_resource const&, _Resource + //! @copydoc managed_memory_resource::operator<_Resource>==(managed_memory_resource const&, _Resource //! const&) template - _CCCL_NODISCARD_FRIEND auto operator!=(cuda_managed_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) + _CCCL_NODISCARD_FRIEND auto operator!=(managed_memory_resource const& __lhs, _Resource const& __rhs) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) { - return resource_ref<>{const_cast(__lhs)} - != resource_ref<>{const_cast<_Resource&>(__rhs)}; + return resource_ref<>{const_cast(__lhs)} != resource_ref<>{const_cast<_Resource&>(__rhs)}; } - //! @copydoc cuda_managed_memory_resource::operator<_Resource>==(cuda_managed_memory_resource const&, _Resource + //! @copydoc managed_memory_resource::operator<_Resource>==(managed_memory_resource const&, _Resource //! const&) template - _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const& __rhs, cuda_managed_memory_resource const& __lhs) noexcept - _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) + _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const& __rhs, managed_memory_resource const& __lhs) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) { - return resource_ref<>{const_cast(__lhs)} - != resource_ref<>{const_cast<_Resource&>(__rhs)}; + return resource_ref<>{const_cast(__lhs)} != resource_ref<>{const_cast<_Resource&>(__rhs)}; } # endif // _CCCL_STD_VER <= 2017 //! @brief Enables the \c device_accessible property - friend constexpr void get_property(cuda_managed_memory_resource const&, device_accessible) noexcept {} + friend constexpr void get_property(managed_memory_resource const&, device_accessible) noexcept {} //! @brief Enables the \c host_accessible property - friend constexpr void get_property(cuda_managed_memory_resource const&, host_accessible) noexcept {} + friend constexpr void get_property(managed_memory_resource const&, host_accessible) noexcept {} //! 
@brief Checks whether the passed in alignment is valid static constexpr bool __is_valid_alignment(const size_t __alignment) noexcept @@ -155,8 +151,11 @@ class cuda_managed_memory_resource return __alignment <= default_cuda_malloc_alignment && (default_cuda_malloc_alignment % __alignment == 0); } }; -static_assert(resource_with, ""); -static_assert(resource_with, ""); +static_assert(resource_with, ""); +static_assert(resource_with, ""); + +// For backward compatability +using cuda_managed_memory_resource _LIBCUDACXX_DEPRECATED = managed_memory_resource; _LIBCUDACXX_END_NAMESPACE_CUDA_MR diff --git a/libcudacxx/include/cuda/__memory_resource/cuda_pinned_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/pinned_memory_resource.h similarity index 58% rename from libcudacxx/include/cuda/__memory_resource/cuda_pinned_memory_resource.h rename to libcudacxx/include/cuda/__memory_resource/pinned_memory_resource.h index f8fc3a25ce..c33ad10235 100644 --- a/libcudacxx/include/cuda/__memory_resource/cuda_pinned_memory_resource.h +++ b/libcudacxx/include/cuda/__memory_resource/pinned_memory_resource.h @@ -39,8 +39,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_MR -//! @brief cuda_pinned_memory_resource uses `cudaMallocHost` / `cudaFreeHost` for allocation / deallocation. -class cuda_pinned_memory_resource +//! @brief pinned_memory_resource uses `cudaMallocHost` / `cudaFreeHost` for allocation / deallocation. +class pinned_memory_resource { private: unsigned int __flags_ = cudaHostAllocDefault; @@ -49,10 +49,10 @@ class cuda_pinned_memory_resource cudaHostAllocDefault | cudaHostAllocPortable | cudaHostAllocMapped | cudaHostAllocWriteCombined; public: - constexpr cuda_pinned_memory_resource(const unsigned int __flags = cudaHostAllocDefault) noexcept + constexpr pinned_memory_resource(const unsigned int __flags = cudaHostAllocDefault) noexcept : __flags_(__flags & __available_flags) { - _LIBCUDACXX_ASSERT(__flags_ == __flags, "Unexpected flags passed to cuda_pinned_memory_resource"); + _LIBCUDACXX_ASSERT(__flags_ == __flags, "Unexpected flags passed to pinned_memory_resource"); } //! @brief Allocate host memory of size at least \p __bytes. @@ -82,71 +82,67 @@ class cuda_pinned_memory_resource { // We need to ensure that the provided alignment matches the minimal provided alignment _LIBCUDACXX_ASSERT(__is_valid_alignment(__alignment), - "Invalid alignment passed to cuda_pinned_memory_resource::deallocate."); - _CCCL_ASSERT_CUDA_API(::cudaFreeHost, "cuda_pinned_memory_resource::deallocate failed", __ptr); + "Invalid alignment passed to pinned_memory_resource::deallocate."); + _CCCL_ASSERT_CUDA_API(::cudaFreeHost, "pinned_memory_resource::deallocate failed", __ptr); (void) __alignment; } - //! @brief Equality comparison with another \c cuda_pinned_memory_resource - //! @param __other The other \c cuda_pinned_memory_resource - //! @return Whether both \c cuda_pinned_memory_resource were constructed with the same flags - _CCCL_NODISCARD constexpr bool operator==(cuda_pinned_memory_resource const& __other) const noexcept + //! @brief Equality comparison with another \c pinned_memory_resource + //! @param __other The other \c pinned_memory_resource + //! @return Whether both \c pinned_memory_resource were constructed with the same flags + _CCCL_NODISCARD constexpr bool operator==(pinned_memory_resource const& __other) const noexcept { return __flags_ == __other.__flags_; } # if _CCCL_STD_VER <= 2017 - //! @brief Equality comparison with another \c cuda_pinned_memory_resource - //! 
@param __other The other \c cuda_pinned_memory_resource - //! @return Whether both \c cuda_pinned_memory_resource were constructed with different flags - _CCCL_NODISCARD constexpr bool operator!=(cuda_pinned_memory_resource const& __other) const noexcept + //! @brief Equality comparison with another \c pinned_memory_resource + //! @param __other The other \c pinned_memory_resource + //! @return Whether both \c pinned_memory_resource were constructed with different flags + _CCCL_NODISCARD constexpr bool operator!=(pinned_memory_resource const& __other) const noexcept { return __flags_ != __other.__flags_; } # endif // _CCCL_STD_VER <= 2017 - //! @brief Equality comparison between a \c cuda_pinned_memory_resource and another resource - //! @param __lhs The \c cuda_pinned_memory_resource + //! @brief Equality comparison between a \c pinned_memory_resource and another resource + //! @param __lhs The \c pinned_memory_resource //! @param __rhs The resource to compare to //! @return If the underlying types are equality comparable, returns the result of equality comparison of both //! resources. Otherwise, returns false. template - _CCCL_NODISCARD_FRIEND auto operator==(cuda_pinned_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) + _CCCL_NODISCARD_FRIEND auto operator==(pinned_memory_resource const& __lhs, _Resource const& __rhs) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) { - return resource_ref<>{const_cast(__lhs)} - == resource_ref<>{const_cast<_Resource&>(__rhs)}; + return resource_ref<>{const_cast(__lhs)} == resource_ref<>{const_cast<_Resource&>(__rhs)}; } # if _CCCL_STD_VER <= 2017 - //! @copydoc cuda_pinned_memory_resource::operator<_Resource>==(cuda_pinned_memory_resource const&, _Resource const&) + //! @copydoc pinned_memory_resource::operator<_Resource>==(pinned_memory_resource const&, _Resource const&) template - _CCCL_NODISCARD_FRIEND auto operator==(_Resource const& __rhs, cuda_pinned_memory_resource const& __lhs) noexcept - _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) + _CCCL_NODISCARD_FRIEND auto operator==(_Resource const& __rhs, pinned_memory_resource const& __lhs) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) { - return resource_ref<>{const_cast(__lhs)} - == resource_ref<>{const_cast<_Resource&>(__rhs)}; + return resource_ref<>{const_cast(__lhs)} == resource_ref<>{const_cast<_Resource&>(__rhs)}; } - //! @copydoc cuda_pinned_memory_resource::operator<_Resource>==(cuda_pinned_memory_resource const&, _Resource const&) + //! @copydoc pinned_memory_resource::operator<_Resource>==(pinned_memory_resource const&, _Resource const&) template - _CCCL_NODISCARD_FRIEND auto operator!=(cuda_pinned_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) + _CCCL_NODISCARD_FRIEND auto operator!=(pinned_memory_resource const& __lhs, _Resource const& __rhs) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) { - return resource_ref<>{const_cast(__lhs)} - != resource_ref<>{const_cast<_Resource&>(__rhs)}; + return resource_ref<>{const_cast(__lhs)} != resource_ref<>{const_cast<_Resource&>(__rhs)}; } - //! @copydoc cuda_pinned_memory_resource::operator<_Resource>==(cuda_pinned_memory_resource const&, _Resource const&) + //! 
@copydoc pinned_memory_resource::operator<_Resource>==(pinned_memory_resource const&, _Resource const&) template - _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const& __rhs, cuda_pinned_memory_resource const& __lhs) noexcept - _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) + _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const& __rhs, pinned_memory_resource const& __lhs) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) { - return resource_ref<>{const_cast(__lhs)} - != resource_ref<>{const_cast<_Resource&>(__rhs)}; + return resource_ref<>{const_cast(__lhs)} != resource_ref<>{const_cast<_Resource&>(__rhs)}; } # endif // _CCCL_STD_VER <= 2017 //! @brief Enables the \c device_accessible property - friend constexpr void get_property(cuda_pinned_memory_resource const&, device_accessible) noexcept {} + friend constexpr void get_property(pinned_memory_resource const&, device_accessible) noexcept {} //! @brief Enables the \c host_accessible property - friend constexpr void get_property(cuda_pinned_memory_resource const&, host_accessible) noexcept {} + friend constexpr void get_property(pinned_memory_resource const&, host_accessible) noexcept {} //! @brief Checks whether the passed in alignment is valid static constexpr bool __is_valid_alignment(const size_t __alignment) noexcept @@ -154,8 +150,11 @@ class cuda_pinned_memory_resource return __alignment <= default_cuda_malloc_host_alignment && (default_cuda_malloc_host_alignment % __alignment == 0); } }; -static_assert(resource_with, ""); -static_assert(resource_with, ""); +static_assert(resource_with, ""); +static_assert(resource_with, ""); + +// For backward compatability +using cuda_pinned_memory_resource _LIBCUDACXX_DEPRECATED = pinned_memory_resource; _LIBCUDACXX_END_NAMESPACE_CUDA_MR diff --git a/libcudacxx/include/cuda/memory_resource b/libcudacxx/include/cuda/memory_resource index a6aced1179..d3c1ae1f91 100644 --- a/libcudacxx/include/cuda/memory_resource +++ b/libcudacxx/include/cuda/memory_resource @@ -31,10 +31,10 @@ //! 
//!@endrst -#include -#include -#include +#include #include +#include +#include #include #include #include diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/allocate.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/allocate.pass.cpp similarity index 98% rename from libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/allocate.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/allocate.pass.cpp index 073de36074..51c4a5e830 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/allocate.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/allocate.pass.cpp @@ -30,7 +30,7 @@ void ensure_device_ptr(void* ptr) void test() { - cuda::mr::cuda_memory_resource res{}; + cuda::mr::device_memory_resource res{}; { // allocate / deallocate auto* ptr = res.allocate(42); diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/equality.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/equality.pass.cpp similarity index 88% rename from libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/equality.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/equality.pass.cpp index 50fd7476ba..770e0d71d7 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/equality.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/equality.pass.cpp @@ -67,9 +67,9 @@ static_assert(cuda::mr::async_resource_with, ""); @@ -77,15 +77,15 @@ static_assert(cuda::mr::resource, ""); void test() { - cuda::mr::cuda_memory_resource first{}; - { // comparison against a plain cuda_memory_resource - cuda::mr::cuda_memory_resource second{}; + cuda::mr::device_memory_resource first{}; + { // comparison against a plain device_memory_resource + cuda::mr::device_memory_resource second{}; assert(first == second); assert(!(first != second)); } - { // comparison against a cuda_memory_resource wrapped inside a resource_ref - cuda::mr::cuda_memory_resource second{}; + { // comparison against a device_memory_resource wrapped inside a resource_ref + cuda::mr::device_memory_resource second{}; cuda::mr::resource_ref second_ref{second}; assert(first == second_ref); assert(!(first != second_ref)); @@ -93,8 +93,8 @@ void test() assert(!(second_ref != first)); } - { // comparison against a cuda_memory_resource wrapped inside a resource_ref<> - cuda::mr::cuda_memory_resource second{}; + { // comparison against a device_memory_resource wrapped inside a resource_ref<> + cuda::mr::device_memory_resource second{}; cuda::mr::resource_ref<> second_ref{second}; assert(first == second_ref); assert(!(first != second_ref)); diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/traits.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/traits.pass.cpp similarity index 96% rename from libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/traits.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/traits.pass.cpp index a8ae126fce..d642b83bf0 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/traits.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/traits.pass.cpp @@ -15,7 +15,7 @@ #include #include -using resource = cuda::mr::cuda_memory_resource; +using 
resource = cuda::mr::device_memory_resource; static_assert(!cuda::std::is_trivial::value, ""); static_assert(!cuda::std::is_trivially_default_constructible::value, ""); static_assert(cuda::std::is_trivially_copy_constructible::value, ""); diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_managed_memory_resource/allocate.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/allocate.pass.cpp similarity index 97% rename from libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_managed_memory_resource/allocate.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/allocate.pass.cpp index 6e9fd76f8e..df0652d5a1 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_managed_memory_resource/allocate.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/allocate.pass.cpp @@ -30,7 +30,7 @@ void ensure_managed_ptr(void* ptr) void test(const unsigned int flag) { - cuda::mr::cuda_managed_memory_resource res{flag}; + cuda::mr::managed_memory_resource res{flag}; { // allocate / deallocate auto* ptr = res.allocate(42); diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_managed_memory_resource/equality.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/equality.pass.cpp similarity index 84% rename from libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_managed_memory_resource/equality.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/equality.pass.cpp index f2e14578f7..9acc1e3813 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_managed_memory_resource/equality.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/equality.pass.cpp @@ -57,29 +57,29 @@ static_assert(cuda::mr::async_resource>, static_assert(cuda::mr::async_resource>, ""); // test for cccl#2214: https://github.com/NVIDIA/cccl/issues/2214 -struct derived_managed_resource : cuda::mr::cuda_managed_memory_resource +struct derived_managed_resource : cuda::mr::managed_memory_resource { - using cuda::mr::cuda_managed_memory_resource::cuda_managed_memory_resource; + using cuda::mr::managed_memory_resource::managed_memory_resource; }; static_assert(cuda::mr::resource, ""); void test() { - cuda::mr::cuda_managed_memory_resource first{}; - { // comparison against a plain cuda_managed_memory_resource - cuda::mr::cuda_managed_memory_resource second{}; + cuda::mr::managed_memory_resource first{}; + { // comparison against a plain managed_memory_resource + cuda::mr::managed_memory_resource second{}; assert(first == second); assert(!(first != second)); } - { // comparison against a plain cuda_managed_memory_resource with a different flag set - cuda::mr::cuda_managed_memory_resource second{cudaMemAttachHost}; + { // comparison against a plain managed_memory_resource with a different flag set + cuda::mr::managed_memory_resource second{cudaMemAttachHost}; assert(!(first == second)); assert((first != second)); } - { // comparison against a cuda_managed_memory_resource wrapped inside a resource_ref<> - cuda::mr::cuda_managed_memory_resource second{}; + { // comparison against a managed_memory_resource wrapped inside a resource_ref<> + cuda::mr::managed_memory_resource second{}; assert(first == cuda::mr::resource_ref<>{second}); assert(!(first != cuda::mr::resource_ref<>{second})); assert(cuda::mr::resource_ref<>{second} == first); diff --git 
a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_pinned_memory_resource/traits.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/traits.pass.cpp similarity index 95% rename from libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_pinned_memory_resource/traits.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/traits.pass.cpp index 3909ac7238..02b9bd0294 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_pinned_memory_resource/traits.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/traits.pass.cpp @@ -15,7 +15,7 @@ #include #include -using resource = cuda::mr::cuda_pinned_memory_resource; +using resource = cuda::mr::managed_memory_resource; static_assert(!cuda::std::is_trivial::value, ""); static_assert(!cuda::std::is_trivially_default_constructible::value, ""); static_assert(cuda::std::is_trivially_copy_constructible::value, ""); diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_pinned_memory_resource/allocate.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/allocate.pass.cpp similarity index 97% rename from libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_pinned_memory_resource/allocate.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/allocate.pass.cpp index 7b9e374805..3ad0ae106b 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_pinned_memory_resource/allocate.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/allocate.pass.cpp @@ -30,7 +30,7 @@ void ensure_pinned_host_ptr(void* ptr) void test(const unsigned int flag) { - cuda::mr::cuda_pinned_memory_resource res{flag}; + cuda::mr::pinned_memory_resource res{flag}; { // allocate / deallocate auto* ptr = res.allocate(42); diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_pinned_memory_resource/equality.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/equality.pass.cpp similarity index 83% rename from libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_pinned_memory_resource/equality.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/equality.pass.cpp index dd480cc9f7..1d60ea1ecb 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_pinned_memory_resource/equality.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/equality.pass.cpp @@ -57,29 +57,29 @@ static_assert(cuda::mr::async_resource>, static_assert(cuda::mr::async_resource>, ""); // test for cccl#2214: https://github.com/NVIDIA/cccl/issues/2214 -struct derived_pinned_resource : cuda::mr::cuda_pinned_memory_resource +struct derived_pinned_resource : cuda::mr::pinned_memory_resource { - using cuda::mr::cuda_pinned_memory_resource::cuda_pinned_memory_resource; + using cuda::mr::pinned_memory_resource::pinned_memory_resource; }; static_assert(cuda::mr::resource, ""); void test() { - cuda::mr::cuda_pinned_memory_resource first{}; - { // comparison against a plain cuda_pinned_memory_resource - cuda::mr::cuda_pinned_memory_resource second{cudaHostAllocDefault}; + cuda::mr::pinned_memory_resource first{}; + { // comparison against a plain pinned_memory_resource + cuda::mr::pinned_memory_resource second{cudaHostAllocDefault}; assert(first == second); assert(!(first != second)); } - { // comparison against a plain cuda_pinned_memory_resource with a different flag set - 
cuda::mr::cuda_pinned_memory_resource second{cudaHostAllocPortable}; + { // comparison against a plain pinned_memory_resource with a different flag set + cuda::mr::pinned_memory_resource second{cudaHostAllocPortable}; assert(!(first == second)); assert((first != second)); } - { // comparison against a cuda_pinned_memory_resource wrapped inside a resource_ref<> - cuda::mr::cuda_pinned_memory_resource second{}; + { // comparison against a pinned_memory_resource wrapped inside a resource_ref<> + cuda::mr::pinned_memory_resource second{}; cuda::mr::resource_ref<> second_ref{second}; assert(first == second_ref); assert(!(first != second_ref)); diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_managed_memory_resource/traits.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/traits.pass.cpp similarity index 95% rename from libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_managed_memory_resource/traits.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/traits.pass.cpp index 299247ff2e..b0bbae9526 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_managed_memory_resource/traits.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/traits.pass.cpp @@ -15,7 +15,7 @@ #include #include -using resource = cuda::mr::cuda_managed_memory_resource; +using resource = cuda::mr::pinned_memory_resource; static_assert(!cuda::std::is_trivial::value, ""); static_assert(!cuda::std::is_trivially_default_constructible::value, ""); static_assert(cuda::std::is_trivially_copy_constructible::value, ""); From 16d4fd3c96225366c826f60e947ec1c472ef3082 Mon Sep 17 00:00:00 2001 From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com> Date: Fri, 16 Aug 2024 04:44:21 -0700 Subject: [PATCH 30/33] Fix and simplify (#2197) * Fix and simplify * Make logic for non-constant evaluation simpler in C++14 and greater in * Remove use of `std::` in `` Co-authored-by: Michael Schellenberger Costa * Change bitops tests to prevent constant folding of runtime checks * Move bit and split implementation details from main header * Remove volatile from tests in bitops * Make Windows happy by using `unsigned long` * Work around being unable to use {} in c++ constexpr functions * Add a 'default to constexpr' interpretation of is_constant_evaluated for internal use in bitops * Make windows happy by reusing the default to constexpr hack * Make bitops tests definitely actually do runtime * Move fallbacks into relevant headers * Fix fallbacks being guarded by MSVC ifdef. 
* Keep the license --------- Co-authored-by: Michael Schellenberger Costa --- libcudacxx/include/cuda/std/__bit/clz.h | 153 ++++ libcudacxx/include/cuda/std/__bit/ctz.h | 155 ++++ libcudacxx/include/cuda/std/__bit/popc.h | 118 +++ .../std/__type_traits/is_constant_evaluated.h | 10 + libcudacxx/include/cuda/std/bit | 348 +++++++- .../cuda/std/detail/libcxx/include/bit | 815 ------------------ .../bit/bitops.count/countl_one.pass.cpp | 77 +- .../bit/bitops.count/countl_zero.pass.cpp | 55 +- .../bit/bitops.count/countr_one.pass.cpp | 73 +- .../bit/bitops.count/countr_zero.pass.cpp | 75 +- .../bit/bitops.count/popcount.pass.cpp | 73 +- 11 files changed, 990 insertions(+), 962 deletions(-) create mode 100644 libcudacxx/include/cuda/std/__bit/clz.h create mode 100644 libcudacxx/include/cuda/std/__bit/ctz.h create mode 100644 libcudacxx/include/cuda/std/__bit/popc.h delete mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/bit diff --git a/libcudacxx/include/cuda/std/__bit/clz.h b/libcudacxx/include/cuda/std/__bit/clz.h new file mode 100644 index 0000000000..84dbcd686a --- /dev/null +++ b/libcudacxx/include/cuda/std/__bit/clz.h @@ -0,0 +1,153 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX__BIT_CLZ_H +#define _LIBCUDACXX__BIT_CLZ_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#if defined(_CCCL_COMPILER_MSVC) +# include +#endif + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __binary_clz2(uint64_t __x, int __c) +{ + return !!(~__x & 0x2) ^ __c; +} +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __binary_clz4(uint64_t __x, int __c) +{ + return __binary_clz2(__x >> 2 * !!(__x & 0xC), __c + 2 * !(__x & 0xC)); +} +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __binary_clz8(uint64_t __x, int __c) +{ + return __binary_clz4(__x >> 4 * !!(__x & 0xF0), __c + 4 * !(__x & 0xF0)); +} +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __binary_clz16(uint64_t __x, int __c) +{ + return __binary_clz8(__x >> 8 * !!(__x & 0xFF00), __c + 8 * !(__x & 0xFF00)); +} +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __binary_clz32(uint64_t __x, int __c) +{ + return __binary_clz16(__x >> 16 * !!(__x & 0xFFFF0000), __c + 16 * !(__x & 0xFFFF0000)); +} +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __binary_clz64(uint64_t __x) +{ + return __binary_clz32(__x >> 32 * !!(__x & 0xFFFFFFFF00000000), 32 * !(__x & 0xFFFFFFFF00000000)); +} + +#if !defined(_CCCL_COMPILER_MSVC) + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __constexpr_clz(uint32_t __x) noexcept +{ +# if defined(__CUDA_ARCH__) + return __binary_clz32(static_cast(__x), 0); // no device constexpr builtins +# else + return __builtin_clz(__x); +# endif +} + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __constexpr_clz(uint64_t __x) noexcept +{ +# if defined(__CUDA_ARCH__) + 
return __binary_clz64(__x); // no device constexpr builtins +# else + return __builtin_clzll(__x); +# endif +} + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_clz(uint32_t __x) noexcept +{ +# if _CCCL_STD_VER >= 2014 + if (!__libcpp_default_is_constant_evaluated()) + { + NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __clz(__x);), (return __builtin_clz(__x);)) + } +# endif + return __constexpr_clz(__x); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_clz(uint64_t __x) noexcept +{ +# if _CCCL_STD_VER >= 2014 + if (!__libcpp_default_is_constant_evaluated()) + { + NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __clzll(__x);), (return __builtin_clzll(__x);)) + } +# endif + return __constexpr_clz(__x); +} + +#else // defined(_CCCL_COMPILER_MSVC) + +// Precondition: __x != 0 +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_clz(uint32_t __x) +{ +# if !defined(__CUDA_ARCH__) + if (!__libcpp_default_is_constant_evaluated()) + { + unsigned long __where = 0; + if (_BitScanReverse(&__where, __x)) + { + return static_cast(31 - __where); + } + return 32; // Undefined Behavior. + } +# endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__) + + return __binary_clz32(static_cast(__x), 0); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_clz(uint64_t __x) +{ +# if !defined(__CUDA_ARCH__) + if (!__libcpp_default_is_constant_evaluated()) + { + unsigned long __where = 0; +# if defined(_LIBCUDACXX_HAS_BITSCAN64) + if (_BitScanReverse64(&__where, __x)) + { + return static_cast(63 - __where); + } +# else + // Win32 doesn't have _BitScanReverse64 so emulate it with two 32 bit calls. + if (_BitScanReverse(&__where, static_cast(__x >> 32))) + { + return static_cast(63 - (__where + 32)); + } + if (_BitScanReverse(&__where, static_cast(__x))) + { + return static_cast(63 - __where); + } +# endif + return 64; // Undefined Behavior. + } +# endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__) + + return __binary_clz64(static_cast(__x)); +} + +#endif + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX__BIT_CLZ_H diff --git a/libcudacxx/include/cuda/std/__bit/ctz.h b/libcudacxx/include/cuda/std/__bit/ctz.h new file mode 100644 index 0000000000..4715386921 --- /dev/null +++ b/libcudacxx/include/cuda/std/__bit/ctz.h @@ -0,0 +1,155 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX__BIT_CTZ_H +#define _LIBCUDACXX__BIT_CTZ_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#if defined(_CCCL_COMPILER_MSVC) +# include +#endif + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __binary_ctz2(uint64_t __x, int __c) noexcept +{ + return __c + !(__x & 0x1); +} +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __binary_ctz4(uint64_t __x, int __c) noexcept +{ + return __binary_ctz2(__x >> 2 * !(__x & 0x3), __c + 2 * !(__x & 0x3)); +} +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __binary_ctz8(uint64_t __x, int __c) noexcept +{ + return __binary_ctz4(__x >> 4 * !(__x & 0x0F), __c + 4 * !(__x & 0x0F)); +} +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __binary_ctz16(uint64_t __x, int __c) noexcept +{ + return __binary_ctz8(__x >> 8 * !(__x & 0x00FF), __c + 8 * !(__x & 0x00FF)); +} +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __binary_ctz32(uint64_t __x, int __c) noexcept +{ + return __binary_ctz16(__x >> 16 * !(__x & 0x0000FFFF), __c + 16 * !(__x & 0x0000FFFF)); +} +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __binary_ctz64(uint64_t __x) noexcept +{ + return __binary_ctz32(__x >> 32 * !(__x & 0x00000000FFFFFFFF), 32 * !(__x & 0x00000000FFFFFFFF)); +} + +#if !defined(_CCCL_COMPILER_MSVC) + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __constexpr_ctz(uint32_t __x) noexcept +{ +# if defined(__CUDA_ARCH__) + return __binary_ctz32(static_cast(__x), 0); // no device constexpr builtins +# else + return __builtin_ctz(__x); +# endif +} + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __constexpr_ctz(uint64_t __x) noexcept +{ +# if defined(__CUDA_ARCH__) + return __binary_ctz64(__x); // no device constexpr builtins +# else + return __builtin_ctzll(__x); +# endif +} + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_ctz(uint32_t __x) noexcept +{ +# if _CCCL_STD_VER >= 2014 + if (!__libcpp_default_is_constant_evaluated()) + { + NV_IF_ELSE_TARGET( + NV_IS_DEVICE, (return (!__x) ? (sizeof(uint32_t) * 8) : (__ffs(__x) - 1);), (return __builtin_ctz(__x);)) + } +# endif + return __constexpr_ctz(__x); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_ctz(uint64_t __x) noexcept +{ +# if _CCCL_STD_VER >= 2014 + if (!__libcpp_default_is_constant_evaluated()) + { + NV_IF_ELSE_TARGET( + NV_IS_DEVICE, (return (!__x) ? 
(sizeof(uint64_t) * 8) : (__ffsll(__x) - 1);), (return __builtin_ctzll(__x);)) + } +# endif + return __constexpr_ctz(__x); +} + +#else // defined(_CCCL_COMPILER_MSVC) + +// Precondition: __x != 0 +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_ctz(uint32_t __x) +{ +# if !defined(__CUDA_ARCH__) + if (!__libcpp_default_is_constant_evaluated()) + { + unsigned long __where = 0; + if (_BitScanForward(&__where, __x)) + { + return static_cast(__where); + } + return 32; + } +# endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__) + + return __binary_ctz32(static_cast(__x), 0); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_ctz(uint64_t __x) +{ +# if !defined(__CUDA_ARCH__) + if (!__libcpp_default_is_constant_evaluated()) + { + unsigned long __where = 0; +# if defined(_LIBCUDACXX_HAS_BITSCAN64) && (defined(_M_AMD64) || defined(__x86_64__)) + if (_BitScanForward64(&__where, __x)) + { + return static_cast(__where); + } +# else + // Win32 doesn't have _BitScanForward64 so emulate it with two 32 bit calls. + if (_BitScanForward(&__where, static_cast(__x))) + { + return static_cast(__where); + } + if (_BitScanForward(&__where, static_cast(__x >> 32))) + { + return static_cast(__where + 32); + } +# endif + return 64; + } +# endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__) + + return __binary_ctz64(__x); +} + +#endif + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX__BIT_CTZ_H diff --git a/libcudacxx/include/cuda/std/__bit/popc.h b/libcudacxx/include/cuda/std/__bit/popc.h new file mode 100644 index 0000000000..23b24a2bb0 --- /dev/null +++ b/libcudacxx/include/cuda/std/__bit/popc.h @@ -0,0 +1,118 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX__BIT_POPC_H +#define _LIBCUDACXX__BIT_POPC_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#if defined(_CCCL_COMPILER_MSVC) +# include +#endif + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __fallback_popc8(uint64_t __x) +{ + return static_cast((__x * 0x0101010101010101) >> 56); +} +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __fallback_popc16(uint64_t __x) +{ + return __fallback_popc8((__x + (__x >> 4)) & 0x0f0f0f0f0f0f0f0f); +} +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __fallback_popc32(uint64_t __x) +{ + return __fallback_popc16((__x & 0x3333333333333333) + ((__x >> 2) & 0x3333333333333333)); +} +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __fallback_popc64(uint64_t __x) +{ + return __fallback_popc32(__x - ((__x >> 1) & 0x5555555555555555)); +} + +#if !defined(_CCCL_COMPILER_MSVC) + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __constexpr_popcount(uint32_t __x) noexcept +{ +# if defined(__CUDA_ARCH__) + return __fallback_popc64(static_cast(__x)); // no device constexpr builtins +# else + return __builtin_popcount(__x); +# endif +} + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __constexpr_popcount(uint64_t __x) noexcept +{ +# if defined(__CUDA_ARCH__) + return __fallback_popc64(static_cast(__x)); // no device constexpr builtins +# else + return __builtin_popcountll(__x); +# endif +} + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_popc(uint32_t __x) noexcept +{ +# if _CCCL_STD_VER >= 2014 + if (!__libcpp_default_is_constant_evaluated()) + { + NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __popc(__x);), (return __builtin_popcount(__x);)) + } +# endif + return __constexpr_popcount(static_cast(__x)); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_popc(uint64_t __x) noexcept +{ +# if _CCCL_STD_VER >= 2014 + if (!__libcpp_default_is_constant_evaluated()) + { + NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __popcll(__x);), (return __builtin_popcountll(__x);)) + } +# endif + return __constexpr_popcount(static_cast(__x)); +} + +#else // defined(_CCCL_COMPILER_MSVC) + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_popc(uint32_t __x) +{ + if (!__libcpp_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_HOST, (return static_cast(__popcnt(__x));)) + } + + return __fallback_popc64(static_cast(__x)); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_popc(uint64_t __x) +{ + if (!__libcpp_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_HOST, (return static_cast(__popcnt64(__x));)) + } + + return __fallback_popc64(static_cast(__x)); +} + +#endif // MSVC + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX__BIT_POPC_H diff --git a/libcudacxx/include/cuda/std/__type_traits/is_constant_evaluated.h b/libcudacxx/include/cuda/std/__type_traits/is_constant_evaluated.h index 6d667ab45f..577561a6b2 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_constant_evaluated.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_constant_evaluated.h @@ -12,6 +12,8 @@ #include +#include "cuda/std/detail/libcxx/include/__config" + #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif 
defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -32,11 +34,19 @@ inline constexpr _LIBCUDACXX_INLINE_VISIBILITY bool __libcpp_is_constant_evaluat { return _LIBCUDACXX_IS_CONSTANT_EVALUATED(); } +inline constexpr _LIBCUDACXX_INLINE_VISIBILITY bool __libcpp_default_is_constant_evaluated() noexcept +{ + return _LIBCUDACXX_IS_CONSTANT_EVALUATED(); +} #else // ^^^ _LIBCUDACXX_IS_CONSTANT_EVALUATED ^^^ / vvv !_LIBCUDACXX_IS_CONSTANT_EVALUATED vvv inline constexpr _LIBCUDACXX_INLINE_VISIBILITY bool __libcpp_is_constant_evaluated() noexcept { return false; } +inline constexpr _LIBCUDACXX_INLINE_VISIBILITY bool __libcpp_default_is_constant_evaluated() noexcept +{ + return true; +} #endif // !_LIBCUDACXX_IS_CONSTANT_EVALUATED _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/bit b/libcudacxx/include/cuda/std/bit index 0460e078d7..9106fa588f 100644 --- a/libcudacxx/include/cuda/std/bit +++ b/libcudacxx/include/cuda/std/bit @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// @@ -21,9 +21,353 @@ # pragma system_header #endif // no system header +#include +#include +#include +#include +#include +#include +#include // all public C++ headers provide the assertion handler +#include +#include +#include + _CCCL_PUSH_MACROS -#include +#if defined(_CCCL_COMPILER_IBM) +# include +#endif // _CCCL_COMPILER_IBM + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr _Tp __rotl(_Tp __t, uint32_t __cnt) noexcept +{ + static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__rotl requires unsigned"); + using __nlt = numeric_limits<_Tp>; + + return ((__cnt % __nlt::digits) == 0) + ? __t + : (__t << (__cnt % __nlt::digits)) | (__t >> (__nlt::digits - (__cnt % __nlt::digits))); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr _Tp __rotr(_Tp __t, uint32_t __cnt) noexcept +{ + static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__rotr requires unsigned"); + using __nlt = numeric_limits<_Tp>; + + return ((__cnt % __nlt::digits) == 0) + ? __t + : (__t >> (__cnt % __nlt::digits)) | (__t << (__nlt::digits - (__cnt % __nlt::digits))); +} + +// Forward decl for recursive use in split word operations +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr int __countr_zero(_Tp __t) noexcept; + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t +__countr_zero_dispatch(_Tp __t) noexcept +{ + return __libcpp_ctz(static_cast(__t)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t +__countr_zero_dispatch(_Tp __t) noexcept +{ + return __libcpp_ctz(static_cast(__t)); +} + +template +struct __countr_zero_rsh_impl +{ + static _LIBCUDACXX_INLINE_VISIBILITY constexpr int __short_circuit(_Tp __t, int __cur, int __count) + { + // Stops processing early if non-zero + return (__cur == numeric_limits::digits) + ? 
__countr_zero_rsh_impl<_Tp, _St - 1>::__count(__t, __cur + __count) + : __cur + __count; + } + + static _LIBCUDACXX_INLINE_VISIBILITY constexpr int __count(_Tp __t, int __count) + { + return __short_circuit(__t >> numeric_limits::digits, __countr_zero(static_cast(__t)), __count); + } +}; + +template +struct __countr_zero_rsh_impl<_Tp, 1> +{ + static _LIBCUDACXX_INLINE_VISIBILITY constexpr int __count(_Tp __t, int __count) + { + return __count + __countr_zero(static_cast(__t)); + } +}; + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<(sizeof(_Tp) > sizeof(uint64_t)), int> +__countr_zero_dispatch(_Tp __t) noexcept +{ + return __countr_zero_rsh_impl<_Tp>::__count(__t, 0); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr int __countr_zero(_Tp __t) noexcept +{ + static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countr_zero requires unsigned"); + + return __t ? __countr_zero_dispatch(__t) : numeric_limits<_Tp>::digits; +} + +// Forward decl for recursive use in split word operations +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr int __countl_zero(_Tp __t) noexcept; + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t +__countl_zero_dispatch(_Tp __t) noexcept +{ + return __libcpp_clz(static_cast(__t)) - (numeric_limits::digits - numeric_limits<_Tp>::digits); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t +__countl_zero_dispatch(_Tp __t) noexcept +{ + return __libcpp_clz(static_cast(__t)) - (numeric_limits::digits - numeric_limits<_Tp>::digits); +} + +template +struct __countl_zero_rotl_impl +{ + static _LIBCUDACXX_INLINE_VISIBILITY constexpr int __short_circuit(_Tp __t, int __cur) + { + // This stops processing early if the current word is not empty + return (__cur == numeric_limits::digits) + ? __cur + __countl_zero_rotl_impl<_Tp, _St - 1>::__count(__t) + : __cur; + } + + static _LIBCUDACXX_INLINE_VISIBILITY constexpr int __countl_iter(_Tp __t) + { + // After rotating pass result of clz to another step for processing + return __short_circuit(__t, __countl_zero(static_cast(__t))); + } + + static _LIBCUDACXX_INLINE_VISIBILITY constexpr int __count(_Tp __t) + { + return __countl_iter(__rotl(__t, numeric_limits::digits)); + } +}; + +template +struct __countl_zero_rotl_impl<_Tp, 1> +{ + static _LIBCUDACXX_INLINE_VISIBILITY constexpr int __count(_Tp __t) + { + return __countl_zero(static_cast(__rotl(__t, numeric_limits::digits))); + } +}; + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<(sizeof(_Tp) > sizeof(uint64_t)), int> +__countl_zero_dispatch(_Tp __t) noexcept +{ + return __countl_zero_rotl_impl<_Tp>::__count(__t); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr int __countl_zero(_Tp __t) noexcept +{ + static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countl_zero requires unsigned"); + return __t ? __countl_zero_dispatch(__t) : numeric_limits<_Tp>::digits; +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr int __countl_one(_Tp __t) noexcept +{ + static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countl_one requires unsigned"); + return __t != numeric_limits<_Tp>::max() ? __countl_zero(static_cast<_Tp>(~__t)) : numeric_limits<_Tp>::digits; +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr int __countr_one(_Tp __t) noexcept +{ + static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countr_one requires unsigned"); + return __t != numeric_limits<_Tp>::max() ? 
__countr_zero(static_cast<_Tp>(~__t)) : numeric_limits<_Tp>::digits; +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t +__popcount_dispatch(_Tp __t) noexcept +{ + return __libcpp_popc(static_cast(__t)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t +__popcount_dispatch(_Tp __t) noexcept +{ + return __libcpp_popc(static_cast(__t)); +} + +template +struct __popcount_rsh_impl +{ + static _LIBCUDACXX_INLINE_VISIBILITY constexpr int __count(_Tp __t) + { + return __popcount_rsh_impl<_Tp, _St - 1>::__count(__t >> numeric_limits::digits) + + __libcpp_popc(static_cast(__t)); + } +}; + +template +struct __popcount_rsh_impl<_Tp, 1> +{ + static _LIBCUDACXX_INLINE_VISIBILITY constexpr int __count(_Tp __t) + { + return __libcpp_popc(static_cast(__t)); + } +}; + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<(sizeof(_Tp) > sizeof(uint64_t)), int> +__popcount_dispatch(_Tp __t) noexcept +{ + return __popcount_rsh_impl<_Tp>::__count(__t); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr int __popcount(_Tp __t) noexcept +{ + static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__libcpp_popcount requires unsigned"); + + return __popcount_dispatch(__t); +} + +// integral log base 2 +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr uint32_t __bit_log2(_Tp __t) noexcept +{ + static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__bit_log2 requires unsigned"); + return numeric_limits<_Tp>::digits - 1 - __countl_zero(__t); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr bool __has_single_bit(_Tp __t) noexcept +{ + static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__has_single_bit requires unsigned"); + return __t != 0 && (((__t & (__t - 1)) == 0)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t= sizeof(uint32_t), _Tp> __ceil2(_Tp __t) noexcept +{ + return _Tp{1} << (numeric_limits<_Tp>::digits - __countl_zero((_Tp) (__t - 1u))); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t __ceil2(_Tp __t) noexcept +{ + return (_Tp) ((1u << ((numeric_limits<_Tp>::digits - __countl_zero((_Tp) (__t - 1u))) + + (numeric_limits::digits - numeric_limits<_Tp>::digits))) + >> (numeric_limits::digits - numeric_limits<_Tp>::digits)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> +rotl(_Tp __t, uint32_t __cnt) noexcept +{ + return __rotl(__t, __cnt); +} + +// rotr +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> +rotr(_Tp __t, uint32_t __cnt) noexcept +{ + return __rotr(__t, __cnt); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> +countl_zero(_Tp __t) noexcept +{ + return __countl_zero(__t); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> +countl_one(_Tp __t) noexcept +{ + return __countl_one(__t); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> +countr_zero(_Tp __t) noexcept +{ + return __countr_zero(__t); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> +countr_one(_Tp __t) noexcept +{ + return __countr_one(__t); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> +popcount(_Tp __t) noexcept +{ + return __popcount(__t); +} + +template 
+_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, bool> +has_single_bit(_Tp __t) noexcept +{ + return __has_single_bit(__t); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> +bit_floor(_Tp __t) noexcept +{ + return __t == 0 ? 0 : static_cast<_Tp>(_Tp{1} << __bit_log2(__t)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> +bit_ceil(_Tp __t) noexcept +{ + return (__t < 2) ? 1 : static_cast<_Tp>(__ceil2(__t)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> +bit_width(_Tp __t) noexcept +{ + return __t == 0 ? 0 : static_cast<_Tp>(__bit_log2(__t) + 1); +} + +enum class endian +{ + little = 0xDEAD, + big = 0xFACE, +#if defined(_LIBCUDACXX_LITTLE_ENDIAN) + native = little +#elif defined(_LIBCUDACXX_BIG_ENDIAN) + native = big +#else + native = 0xCAFE +#endif +}; + +_LIBCUDACXX_END_NAMESPACE_STD _CCCL_POP_MACROS diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/bit b/libcudacxx/include/cuda/std/detail/libcxx/include/bit deleted file mode 100644 index 641a743832..0000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/bit +++ /dev/null @@ -1,815 +0,0 @@ -// -*- C++ -*- -//===------------------------------ bit ----------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===---------------------------------------------------------------------===// - -#ifndef _LIBCUDACXX_BIT -#define _LIBCUDACXX_BIT - -/* - bit synopsis - -namespace std { - - template - constexpr bool has_single_bit(T x) noexcept; // C++20 - template - constexpr T bit_ceil(T x); // C++20 - template - constexpr T bit_floor(T x) noexcept; // C++20 - template - constexpr T bit_width(T x) noexcept; // C++20 - - // 23.20.2, rotating - template - constexpr T rotl(T x, unsigned int s) noexcept; // C++20 - template - constexpr T rotr(T x, unsigned int s) noexcept; // C++20 - - // 23.20.3, counting - template - constexpr int countl_zero(T x) noexcept; // C++20 - template - constexpr int countl_one(T x) noexcept; // C++20 - template - constexpr int countr_zero(T x) noexcept; // C++20 - template - constexpr int countr_one(T x) noexcept; // C++20 - template - constexpr int popcount(T x) noexcept; // C++20 - - // 20.15.9, endian - enum class endian { - little = see below, // C++20 - big = see below, // C++20 - native = see below // C++20 -}; - -} // namespace std - -*/ - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include -#include // all public C++ headers provide the assertion handler -#include -#include -#include -#include - -_CCCL_PUSH_MACROS - -#if defined(_CCCL_COMPILER_MSVC) -# include -#endif // _CCCL_COMPILER_MSVC - -#if defined(_CCCL_COMPILER_IBM) -# include -#endif // _CCCL_COMPILER_IBM - -_LIBCUDACXX_BEGIN_NAMESPACE_STD - -#define _LIBCUDACXX_BIT_CONSTEXPR constexpr - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __binary_ctz2(uint64_t __x, int __c) noexcept -{ - return (__x & 0x1) ? 
__c : __c + 1; -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __binary_ctz4(uint64_t __x, int __c) noexcept -{ - return __binary_ctz2(__x >> 2 * !(__x & 0x3), __c + 2 * !(__x & 0x3)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __binary_ctz8(uint64_t __x, int __c) noexcept -{ - return __binary_ctz4(__x >> 4 * !(__x & 0x0F), __c + 4 * !(__x & 0x0F)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __binary_ctz16(uint64_t __x, int __c) noexcept -{ - return __binary_ctz8(__x >> 8 * !(__x & 0x00FF), __c + 8 * !(__x & 0x00FF)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __binary_ctz32(uint64_t __x, int __c) noexcept -{ - return __binary_ctz16(__x >> 16 * !(__x & 0x0000FFFF), __c + 16 * !(__x & 0x0000FFFF)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __binary_ctz64(uint64_t __x) noexcept -{ - return __binary_ctz32(__x >> 32 * !(__x & 0x00000000FFFFFFFF), 32 * !(__x & 0x00000000FFFFFFFF)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __binary_clz2(uint64_t __x, int __c) -{ - return !!(~__x & 0x2) ^ __c; -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __binary_clz4(uint64_t __x, int __c) -{ - return __binary_clz2(__x >> 2 * !!(__x & 0xC), __c + 2 * !(__x & 0xC)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __binary_clz8(uint64_t __x, int __c) -{ - return __binary_clz4(__x >> 4 * !!(__x & 0xF0), __c + 4 * !(__x & 0xF0)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __binary_clz16(uint64_t __x, int __c) -{ - return __binary_clz8(__x >> 8 * !!(__x & 0xFF00), __c + 8 * !(__x & 0xFF00)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __binary_clz32(uint64_t __x, int __c) -{ - return __binary_clz16(__x >> 16 * !!(__x & 0xFFFF0000), __c + 16 * !(__x & 0xFFFF0000)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __binary_clz64(uint64_t __x) -{ - return __binary_clz32(__x >> 32 * !!(__x & 0xFFFFFFFF00000000), 32 * !(__x & 0xFFFFFFFF00000000)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __fallback_popc8(uint64_t __x) -{ - return static_cast((__x * 0x0101010101010101) >> 56); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __fallback_popc16(uint64_t __x) -{ - return __fallback_popc8((__x + (__x >> 4)) & 0x0f0f0f0f0f0f0f0f); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __fallback_popc32(uint64_t __x) -{ - return __fallback_popc16((__x & 0x3333333333333333) + ((__x >> 2) & 0x3333333333333333)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __fallback_popc64(uint64_t __x) -{ - return __fallback_popc32(__x - ((__x >> 1) & 0x5555555555555555)); -} - -#ifndef _CCCL_COMPILER_MSVC - -inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_ctz(unsigned __x) noexcept -{ -# if defined(_CCCL_COMPILER_NVRTC) || (defined(_CCCL_CUDACC_BELOW_11_3)) -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - if (!__libcpp_is_constant_evaluated()) - { - NV_IF_ELSE_TARGET( - NV_IS_DEVICE, (return (!__x) ? 
sizeof(unsigned) * 8 : __ffs(__x) - 1;), (return __builtin_ctz(__x);)) - } -# endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - - return __binary_ctz32(static_cast(__x), 0); -# else // ^^^ _CCCL_COMPILER_NVRTC || nvcc < 11.3 ^^^ / vvv !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 vvv - return __builtin_ctz(__x); -# endif // !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __libcpp_ctz(unsigned long __x) noexcept -{ -# if defined(_CCCL_COMPILER_NVRTC) || (defined(_CCCL_CUDACC_BELOW_11_3)) -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - if (!__libcpp_is_constant_evaluated()) - { - NV_IF_ELSE_TARGET( - NV_IS_DEVICE, (return (!__x) ? sizeof(unsigned long) * 8 : __ffsll(__x) - 1;), (return __builtin_ctzl(__x);)) - } -# endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - - return __binary_ctz64(static_cast(__x)); -# else // ^^^ _CCCL_COMPILER_NVRTC || nvcc < 11.3 ^^^ / vvv !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 vvv - return __builtin_ctzl(__x); -# endif // !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __libcpp_ctz(unsigned long long __x) noexcept -{ -// For whatever reason __builtin_ctzll does not compile although it should -# if 1 // def _CCCL_COMPILER_NVRTC -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - if (!__libcpp_is_constant_evaluated()) - { - NV_IF_ELSE_TARGET(NV_IS_DEVICE, - (return (!__x) ? sizeof(unsigned long long) * 8 : __ffsll(__x) - 1;), - (return __builtin_ctzll(__x);)) - } -# endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - - return __binary_ctz64(static_cast(__x)); -# else // 0 - return __builtin_ctzll(__x); -# endif // !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __libcpp_clz(unsigned __x) noexcept -{ -# if defined(_CCCL_COMPILER_NVRTC) || (defined(_CCCL_CUDACC_BELOW_11_3)) -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - if (!__libcpp_is_constant_evaluated()) - { - NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __clz(__x);), (return __builtin_clz(__x);)) - } -# endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - return __binary_clz32(static_cast(__x), 0); -# else // ^^^ _CCCL_COMPILER_NVRTC || nvcc < 11.3 ^^^ / vvv !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 vvv - return __builtin_clz(__x); -# endif // !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __libcpp_clz(unsigned long __x) noexcept -{ -# if defined(_CCCL_COMPILER_NVRTC) || (defined(_CCCL_CUDACC_BELOW_11_3)) -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - if (!__libcpp_is_constant_evaluated()) - { - NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __clzll(__x);), (return __builtin_clzl(__x);)) - } -# endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - - return __binary_clz64(static_cast(__x)); -# else // ^^^ _CCCL_COMPILER_NVRTC || nvcc < 11.3 ^^^ / vvv !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 vvv - return __builtin_clzl(__x); -# endif // !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __libcpp_clz(unsigned long long __x) noexcept -{ -# if defined(_CCCL_COMPILER_NVRTC) || (defined(_CCCL_CUDACC_BELOW_11_3)) -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 
2014) - if (!__libcpp_is_constant_evaluated()) - { - NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __clzll(__x);), (return __builtin_clzll(__x);)) - } -# endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - - return __binary_clz64(static_cast(__x)); -# else // ^^^ _CCCL_COMPILER_NVRTC || nvcc < 11.3 ^^^ / vvv !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 vvv - return __builtin_clzll(__x); -# endif // !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __libcpp_popcount(unsigned __x) noexcept -{ -# if defined(_CCCL_COMPILER_NVRTC) || (defined(_CCCL_CUDACC_BELOW_11_3)) -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - if (!__libcpp_is_constant_evaluated()) - { - NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __popc(__x);), (return __builtin_popcount(__x);)) - } -# endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - - return __fallback_popc64(static_cast(__x)); -# else // ^^^ _CCCL_COMPILER_NVRTC || nvcc < 11.3 ^^^ / vvv !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 vvv - return __builtin_popcount(__x); -# endif // !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __libcpp_popcount(unsigned long __x) noexcept -{ -# if defined(_CCCL_COMPILER_NVRTC) || (defined(_CCCL_CUDACC_BELOW_11_3)) -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - if (!__libcpp_is_constant_evaluated()) - { - NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __popcll(__x);), (return __builtin_popcountl(__x);)) - } -# endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - - return __fallback_popc64(static_cast(__x)); -# else // ^^^ _CCCL_COMPILER_NVRTC || nvcc < 11.3 ^^^ / vvv !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 vvv - return __builtin_popcountl(__x); -# endif // !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __libcpp_popcount(unsigned long long __x) noexcept -{ -# if defined(_CCCL_COMPILER_NVRTC) || (defined(_CCCL_CUDACC_BELOW_11_3)) -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - if (!__libcpp_is_constant_evaluated()) - { - NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __popcll(__x);), (return __builtin_popcountll(__x);)) - } -# endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - - return __fallback_popc64(static_cast(__x)); -# else // ^^^ _CCCL_COMPILER_NVRTC || nvcc < 11.3 ^^^ / vvv !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 vvv - return __builtin_popcountll(__x); -# endif // !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 -} - -#else // _CCCL_COMPILER_MSVC - -// Precondition: __x != 0 -inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_ctz(unsigned __x) -{ - static_assert(sizeof(unsigned) == sizeof(unsigned long), ""); - static_assert(sizeof(unsigned long) == 4, ""); -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && !defined(__CUDA_ARCH__) - if (!__libcpp_is_constant_evaluated()) - { - unsigned long __where = 0; - if (_BitScanForward(&__where, __x)) - { - return static_cast(__where); - } - return 32; - } -# endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__) - - return __binary_ctz32(static_cast(__x), 0); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_ctz(unsigned long __x) -{ - static_assert(sizeof(unsigned long) == sizeof(unsigned), ""); - return __libcpp_ctz(static_cast(__x)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int 
__libcpp_ctz(unsigned long long __x) -{ -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && !defined(__CUDA_ARCH__) - if (!__libcpp_is_constant_evaluated()) - { - unsigned long __where = 0; -# if defined(_LIBCUDACXX_HAS_BITSCAN64) && (defined(_M_AMD64) || defined(__x86_64__)) - if (_BitScanForward64(&__where, __x)) - { - return static_cast(__where); - } -# else - // Win32 doesn't have _BitScanForward64 so emulate it with two 32 bit calls. - if (_BitScanForward(&__where, static_cast(__x))) - { - return static_cast(__where); - } - if (_BitScanForward(&__where, static_cast(__x >> 32))) - { - return static_cast(__where + 32); - } -# endif - return 64; - } -# endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__) - - return __binary_ctz64(__x); -} - -// Precondition: __x != 0 -inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_clz(unsigned __x) -{ - static_assert(sizeof(unsigned) == sizeof(unsigned long), ""); - static_assert(sizeof(unsigned long) == 4, ""); -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && !defined(__CUDA_ARCH__) - if (!__libcpp_is_constant_evaluated()) - { - unsigned long __where = 0; - if (_BitScanReverse(&__where, __x)) - { - return static_cast(31 - __where); - } - return 32; // Undefined Behavior. - } -# endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__) - - return __binary_clz32(static_cast(__x), 0); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_clz(unsigned long __x) -{ - static_assert(sizeof(unsigned) == sizeof(unsigned long), ""); - return __libcpp_clz(static_cast(__x)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_clz(unsigned long long __x) -{ -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && !defined(__CUDA_ARCH__) - if (!__libcpp_is_constant_evaluated()) - { - unsigned long __where = 0; -# if defined(_LIBCUDACXX_HAS_BITSCAN64) - if (_BitScanReverse64(&__where, __x)) - { - return static_cast(63 - __where); - } -# else - // Win32 doesn't have _BitScanReverse64 so emulate it with two 32 bit calls. - if (_BitScanReverse(&__where, static_cast(__x >> 32))) - { - return static_cast(63 - (__where + 32)); - } - if (_BitScanReverse(&__where, static_cast(__x))) - { - return static_cast(63 - __where); - } -# endif - return 64; // Undefined Behavior. 
- } -# endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__) - - return __binary_clz64(static_cast(__x)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_popcount(unsigned __x) -{ - static_assert(sizeof(unsigned) == 4, ""); -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && !defined(__CUDA_ARCH__) - if (!__libcpp_is_constant_evaluated()) - { - return static_cast(__popcnt(__x)); - } -# endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__) - - return __fallback_popc64(static_cast(__x)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_popcount(unsigned long __x) -{ - static_assert(sizeof(unsigned long) == 4, ""); -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && !defined(__CUDA_ARCH__) - if (!__libcpp_is_constant_evaluated()) - { - return static_cast(__popcnt(__x)); - } -# endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__) - - return __fallback_popc64(static_cast(__x)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_popcount(unsigned long long __x) -{ - static_assert(sizeof(unsigned long long) == 8, ""); -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && !defined(__CUDA_ARCH__) - if (!__libcpp_is_constant_evaluated()) - { - return static_cast(__popcnt64(__x)); - } -# endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__ - - return __fallback_popc64(static_cast(__x)); -} - -#endif // _CCCL_COMPILER_MSVC - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR _Tp __rotl(_Tp __t, unsigned int __cnt) noexcept -{ - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__rotl requires unsigned"); - using __nlt = numeric_limits<_Tp>; - - return ((__cnt % __nlt::digits) == 0) - ? __t - : (__t << (__cnt % __nlt::digits)) | (__t >> (__nlt::digits - (__cnt % __nlt::digits))); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR _Tp __rotr(_Tp __t, unsigned int __cnt) noexcept -{ - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__rotr requires unsigned"); - using __nlt = numeric_limits<_Tp>; - - return ((__cnt % __nlt::digits) == 0) - ? __t - : (__t >> (__cnt % __nlt::digits)) | (__t << (__nlt::digits - (__cnt % __nlt::digits))); -} - -// Forward decl for recursive use in split word operations -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __countr_zero(_Tp __t) noexcept; - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t -__countr_zero_dispatch(_Tp __t) noexcept -{ - return __libcpp_ctz(static_cast(__t)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t -__countr_zero_dispatch(_Tp __t) noexcept -{ - return __libcpp_ctz(static_cast(__t)); -} - -template -struct __countr_zero_rsh_impl -{ - static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __short_circuit(_Tp __t, int __cur, int __count) - { - // Stops processing early if non-zero - return (__cur == numeric_limits::digits) - ? 
__countr_zero_rsh_impl<_Tp, _St - 1>::__count(__t, __cur + __count) - : __cur + __count; - } - - static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __count(_Tp __t, int __count) - { - return __short_circuit( - __t >> numeric_limits::digits, __countr_zero(static_cast(__t)), __count); - } -}; - -template -struct __countr_zero_rsh_impl<_Tp, 1> -{ - static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __count(_Tp __t, int __count) - { - return __count + __countr_zero(static_cast(__t)); - } -}; - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t<(sizeof(_Tp) > sizeof(unsigned long long)), int> -__countr_zero_dispatch(_Tp __t) noexcept -{ - return __countr_zero_rsh_impl<_Tp>::__count(__t, 0); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __countr_zero(_Tp __t) noexcept -{ - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countr_zero requires unsigned"); - - return __t ? __countr_zero_dispatch(__t) : numeric_limits<_Tp>::digits; -} - -// Forward decl for recursive use in split word operations -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __countl_zero(_Tp __t) noexcept; - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t -__countl_zero_dispatch(_Tp __t) noexcept -{ - return __libcpp_clz(static_cast(__t)) - - (numeric_limits::digits - numeric_limits<_Tp>::digits); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t -__countl_zero_dispatch(_Tp __t) noexcept -{ - return __libcpp_clz(static_cast(__t)) - - (numeric_limits::digits - numeric_limits<_Tp>::digits); -} - -template -struct __countl_zero_rotl_impl -{ - static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __short_circuit(_Tp __t, int __cur) - { - // This stops processing early if the current word is not empty - return (__cur == numeric_limits::digits) - ? __cur + __countl_zero_rotl_impl<_Tp, _St - 1>::__count(__t) - : __cur; - } - - static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __countl_iter(_Tp __t) - { - // After rotating pass result of clz to another step for processing - return __short_circuit(__t, __countl_zero(static_cast(__t))); - } - - static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __count(_Tp __t) - { - return __countl_iter(__rotl(__t, numeric_limits::digits)); - } -}; - -template -struct __countl_zero_rotl_impl<_Tp, 1> -{ - static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __count(_Tp __t) - { - return __countl_zero(static_cast(__rotl(__t, numeric_limits::digits))); - } -}; - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t<(sizeof(_Tp) > sizeof(unsigned long long)), int> -__countl_zero_dispatch(_Tp __t) noexcept -{ - return __countl_zero_rotl_impl<_Tp>::__count(__t); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __countl_zero(_Tp __t) noexcept -{ - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countl_zero requires unsigned"); - return __t ? __countl_zero_dispatch(__t) : numeric_limits<_Tp>::digits; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __countl_one(_Tp __t) noexcept -{ - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countl_one requires unsigned"); - return __t != numeric_limits<_Tp>::max() ? 
__countl_zero(static_cast<_Tp>(~__t)) : numeric_limits<_Tp>::digits; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __countr_one(_Tp __t) noexcept -{ - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countr_one requires unsigned"); - return __t != numeric_limits<_Tp>::max() ? __countr_zero(static_cast<_Tp>(~__t)) : numeric_limits<_Tp>::digits; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t -__popcount_dispatch(_Tp __t) noexcept -{ - return __libcpp_popcount(static_cast(__t)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t -__popcount_dispatch(_Tp __t) noexcept -{ - return __libcpp_popcount(static_cast(__t)); -} - -template -struct __popcount_rsh_impl -{ - static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __count(_Tp __t) - { - return __popcount_rsh_impl<_Tp, _St - 1>::__count(__t >> numeric_limits::digits) - + __libcpp_popcount(static_cast(__t)); - } -}; - -template -struct __popcount_rsh_impl<_Tp, 1> -{ - static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __count(_Tp __t) - { - return __libcpp_popcount(static_cast(__t)); - } -}; - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t<(sizeof(_Tp) > sizeof(unsigned long long)), int> -__popcount_dispatch(_Tp __t) noexcept -{ - return __popcount_rsh_impl<_Tp>::__count(__t); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __popcount(_Tp __t) noexcept -{ - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__libcpp_popcount requires unsigned"); - - return __popcount_dispatch(__t); -} - -// integral log base 2 -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR unsigned __bit_log2(_Tp __t) noexcept -{ - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__bit_log2 requires unsigned"); - return std::numeric_limits<_Tp>::digits - 1 - __countl_zero(__t); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR bool __has_single_bit(_Tp __t) noexcept -{ - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__has_single_bit requires unsigned"); - return __t != 0 && (((__t & (__t - 1)) == 0)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t= sizeof(unsigned), _Tp> -__ceil2(_Tp __t) noexcept -{ - // const unsigned __n = numeric_limits<_Tp>::digits - countl_zero((_Tp)(__t - 1u)); - // _LIBCUDACXX_DEBUG_ASSERT(__libcpp_is_constant_evaluated() || __n != numeric_limits<_Tp>::digits, "Bad input to - // ceil2"); - return _Tp{1} << (numeric_limits<_Tp>::digits - __countl_zero((_Tp) (__t - 1u))); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t -__ceil2(_Tp __t) noexcept -{ - // const unsigned __n = numeric_limits<_Tp>::digits - countl_zero((_Tp)(__t - 1u)); - // _LIBCUDACXX_DEBUG_ASSERT(__libcpp_is_constant_evaluated() || __n != numeric_limits<_Tp>::digits, "Bad input to - // ceil2"); - - // const unsigned __extra = numeric_limits::digits - numeric_limits<_Tp>::digits; - // const unsigned __retVal = 1u << (__n + __extra); - return (_Tp) ((1u << ((numeric_limits<_Tp>::digits - __countl_zero((_Tp) (__t - 1u))) - + (numeric_limits::digits - numeric_limits<_Tp>::digits))) - >> (numeric_limits::digits - numeric_limits<_Tp>::digits)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> -rotl(_Tp __t, unsigned int __cnt) noexcept -{ - return __rotl(__t, __cnt); -} 
- -// rotr -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> -rotr(_Tp __t, unsigned int __cnt) noexcept -{ - return __rotr(__t, __cnt); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> -countl_zero(_Tp __t) noexcept -{ - return __countl_zero(__t); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> -countl_one(_Tp __t) noexcept -{ - return __countl_one(__t); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> -countr_zero(_Tp __t) noexcept -{ - return __countr_zero(__t); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> -countr_one(_Tp __t) noexcept -{ - return __countr_one(__t); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> -popcount(_Tp __t) noexcept -{ - return __popcount(__t); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, bool> -has_single_bit(_Tp __t) noexcept -{ - return __has_single_bit(__t); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> -bit_floor(_Tp __t) noexcept -{ - return __t == 0 ? 0 : static_cast<_Tp>(_Tp{1} << __bit_log2(__t)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> -bit_ceil(_Tp __t) noexcept -{ - return (__t < 2) ? 1 : static_cast<_Tp>(__ceil2(__t)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> -bit_width(_Tp __t) noexcept -{ - return __t == 0 ? 
0 : static_cast<_Tp>(__bit_log2(__t) + 1); -} - -enum class endian -{ - little = 0xDEAD, - big = 0xFACE, -#if defined(_LIBCUDACXX_LITTLE_ENDIAN) - native = little -#elif defined(_LIBCUDACXX_BIG_ENDIAN) - native = big -#else - native = 0xCAFE -#endif -}; - -_LIBCUDACXX_END_NAMESPACE_STD - -_CCCL_POP_MACROS - -#endif // _LIBCUDACXX_BIT diff --git a/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countl_one.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countl_one.pass.cpp index 111f6f0331..a2b9cdf2d0 100644 --- a/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countl_one.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countl_one.pass.cpp @@ -39,15 +39,26 @@ template __host__ __device__ constexpr bool constexpr_test() { using nl = cuda::std::numeric_limits; - return cuda::std::countl_one(nl::max()) == nl::digits && cuda::std::countl_one(T(nl::max() - 1)) == nl::digits - 1 - && cuda::std::countl_one(T(nl::max() - 2)) == nl::digits - 2 - && cuda::std::countl_one(T(nl::max() - 3)) == nl::digits - 2 - && cuda::std::countl_one(T(nl::max() - 4)) == nl::digits - 3 - && cuda::std::countl_one(T(nl::max() - 5)) == nl::digits - 3 - && cuda::std::countl_one(T(nl::max() - 6)) == nl::digits - 3 - && cuda::std::countl_one(T(nl::max() - 7)) == nl::digits - 3 - && cuda::std::countl_one(T(nl::max() - 8)) == nl::digits - 4 - && cuda::std::countl_one(T(nl::max() - 9)) == nl::digits - 4; + + static_assert(cuda::std::countl_one(nl::max()) == nl::digits, ""); + static_assert(cuda::std::countl_one(T(nl::max() - 1)) == nl::digits - 1, ""); + static_assert(cuda::std::countl_one(T(nl::max() - 2)) == nl::digits - 2, ""); + static_assert(cuda::std::countl_one(T(nl::max() - 3)) == nl::digits - 2, ""); + static_assert(cuda::std::countl_one(T(nl::max() - 4)) == nl::digits - 3, ""); + static_assert(cuda::std::countl_one(T(nl::max() - 5)) == nl::digits - 3, ""); + static_assert(cuda::std::countl_one(T(nl::max() - 6)) == nl::digits - 3, ""); + static_assert(cuda::std::countl_one(T(nl::max() - 7)) == nl::digits - 3, ""); + static_assert(cuda::std::countl_one(T(nl::max() - 8)) == nl::digits - 4, ""); + static_assert(cuda::std::countl_one(T(nl::max() - 9)) == nl::digits - 4, ""); + + return true; +} + +template +__host__ __device__ inline void assert_countl_one(T val, int expected) +{ + volatile auto v = val; + assert(cuda::std::countl_one(v) == expected); } template @@ -57,36 +68,36 @@ __host__ __device__ void runtime_test() ASSERT_NOEXCEPT(cuda::std::countl_one(T(0))); const int dig = cuda::std::numeric_limits::digits; - assert(cuda::std::countl_one(T(~121)) == dig - 7); - assert(cuda::std::countl_one(T(~122)) == dig - 7); - assert(cuda::std::countl_one(T(~123)) == dig - 7); - assert(cuda::std::countl_one(T(~124)) == dig - 7); - assert(cuda::std::countl_one(T(~125)) == dig - 7); - assert(cuda::std::countl_one(T(~126)) == dig - 7); - assert(cuda::std::countl_one(T(~127)) == dig - 7); - assert(cuda::std::countl_one(T(~128)) == dig - 8); - assert(cuda::std::countl_one(T(~129)) == dig - 8); - assert(cuda::std::countl_one(T(~130)) == dig - 8); + assert_countl_one(T(~121), dig - 7); + assert_countl_one(T(~122), dig - 7); + assert_countl_one(T(~123), dig - 7); + assert_countl_one(T(~124), dig - 7); + assert_countl_one(T(~125), dig - 7); + assert_countl_one(T(~126), dig - 7); + assert_countl_one(T(~127), dig - 7); + assert_countl_one(T(~128), dig - 8); + assert_countl_one(T(~129), dig - 8); + assert_countl_one(T(~130), dig - 8); } int main(int, char**) { - 
static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); #ifndef _LIBCUDACXX_HAS_NO_INT128 - static_assert(constexpr_test<__uint128_t>(), ""); + constexpr_test<__uint128_t>(); #endif runtime_test(); diff --git a/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countl_zero.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countl_zero.pass.cpp index 5f2bab54d6..929d5c3d69 100644 --- a/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countl_zero.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countl_zero.pass.cpp @@ -53,6 +53,13 @@ __host__ __device__ constexpr bool constexpr_test() return true; } +template +__host__ __device__ inline void assert_countl_zero(T val, int expected) +{ + volatile auto v = val; + assert(cuda::std::countl_zero(v) == expected); +} + template __host__ __device__ void runtime_test() { @@ -60,36 +67,36 @@ __host__ __device__ void runtime_test() ASSERT_NOEXCEPT(cuda::std::countl_zero(T(0))); const int dig = cuda::std::numeric_limits::digits; - assert(cuda::std::countl_zero(T(121)) == dig - 7); - assert(cuda::std::countl_zero(T(122)) == dig - 7); - assert(cuda::std::countl_zero(T(123)) == dig - 7); - assert(cuda::std::countl_zero(T(124)) == dig - 7); - assert(cuda::std::countl_zero(T(125)) == dig - 7); - assert(cuda::std::countl_zero(T(126)) == dig - 7); - assert(cuda::std::countl_zero(T(127)) == dig - 7); - assert(cuda::std::countl_zero(T(128)) == dig - 8); - assert(cuda::std::countl_zero(T(129)) == dig - 8); - assert(cuda::std::countl_zero(T(130)) == dig - 8); + assert_countl_zero(T(121), dig - 7); + assert_countl_zero(T(122), dig - 7); + assert_countl_zero(T(123), dig - 7); + assert_countl_zero(T(124), dig - 7); + assert_countl_zero(T(125), dig - 7); + assert_countl_zero(T(126), dig - 7); + assert_countl_zero(T(127), dig - 7); + assert_countl_zero(T(128), dig - 8); + assert_countl_zero(T(129), dig - 8); + assert_countl_zero(T(130), dig - 8); } int main(int, char**) { - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); #ifndef _LIBCUDACXX_HAS_NO_INT128 - static_assert(constexpr_test<__uint128_t>(), ""); + constexpr_test<__uint128_t>(); #endif runtime_test(); diff --git 
a/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countr_one.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countr_one.pass.cpp index cf80a8a5a7..74e81bb119 100644 --- a/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countr_one.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countr_one.pass.cpp @@ -38,11 +38,26 @@ enum class E2 : unsigned char template __host__ __device__ constexpr bool constexpr_test() { - return cuda::std::countr_one(T(0)) == 0 && cuda::std::countr_one(T(1)) == 1 && cuda::std::countr_one(T(2)) == 0 - && cuda::std::countr_one(T(3)) == 2 && cuda::std::countr_one(T(4)) == 0 && cuda::std::countr_one(T(5)) == 1 - && cuda::std::countr_one(T(6)) == 0 && cuda::std::countr_one(T(7)) == 3 && cuda::std::countr_one(T(8)) == 0 - && cuda::std::countr_one(T(9)) == 1 - && cuda::std::countr_one(cuda::std::numeric_limits::max()) == cuda::std::numeric_limits::digits; + static_assert(cuda::std::countr_one(T(2)) == 0, ""); + static_assert(cuda::std::countr_one(T(3)) == 2, ""); + static_assert(cuda::std::countr_one(T(4)) == 0, ""); + static_assert(cuda::std::countr_one(T(5)) == 1, ""); + static_assert(cuda::std::countr_one(T(6)) == 0, ""); + static_assert(cuda::std::countr_one(T(7)) == 3, ""); + static_assert(cuda::std::countr_one(T(8)) == 0, ""); + static_assert(cuda::std::countr_one(T(9)) == 1, ""); + static_assert(cuda::std::countr_one(T(0)) == 0, ""); + static_assert(cuda::std::countr_one(T(1)) == 1, ""); + static_assert(cuda::std::countr_one(cuda::std::numeric_limits::max()) == cuda::std::numeric_limits::digits, ""); + + return true; +} + +template +__host__ __device__ inline void assert_countr_one(T val, int expected) +{ + volatile auto v = val; + assert(cuda::std::countr_one(v) == expected); } template @@ -51,36 +66,36 @@ __host__ __device__ void runtime_test() ASSERT_SAME_TYPE(int, decltype(cuda::std::countr_one(T(0)))); ASSERT_NOEXCEPT(cuda::std::countr_one(T(0))); - assert(cuda::std::countr_one(T(121)) == 1); - assert(cuda::std::countr_one(T(122)) == 0); - assert(cuda::std::countr_one(T(123)) == 2); - assert(cuda::std::countr_one(T(124)) == 0); - assert(cuda::std::countr_one(T(125)) == 1); - assert(cuda::std::countr_one(T(126)) == 0); - assert(cuda::std::countr_one(T(127)) == 7); - assert(cuda::std::countr_one(T(128)) == 0); - assert(cuda::std::countr_one(T(129)) == 1); - assert(cuda::std::countr_one(T(130)) == 0); + assert_countr_one(T(121), 1); + assert_countr_one(T(122), 0); + assert_countr_one(T(123), 2); + assert_countr_one(T(124), 0); + assert_countr_one(T(125), 1); + assert_countr_one(T(126), 0); + assert_countr_one(T(127), 7); + assert_countr_one(T(128), 0); + assert_countr_one(T(129), 1); + assert_countr_one(T(130), 0); } int main(int, char**) { - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); #ifndef _LIBCUDACXX_HAS_NO_INT128 - 
static_assert(constexpr_test<__uint128_t>(), ""); + constexpr_test<__uint128_t>(); #endif runtime_test(); diff --git a/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countr_zero.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countr_zero.pass.cpp index 4c4da8cead..75a552ccf9 100644 --- a/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countr_zero.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countr_zero.pass.cpp @@ -11,7 +11,7 @@ // template // constexpr int countr_zero(T x) noexcept; -// Returns: The number of consecutive 0 bits, starting from the most significant bit. +// Returns: The number of consecutive 0 bits, starting from the least significant bit. // [ Note: Returns N if x == 0. ] // // Remarks: This function shall not participate in overload resolution unless @@ -38,11 +38,26 @@ enum class E2 : unsigned char template __host__ __device__ constexpr bool constexpr_test() { - return cuda::std::countr_zero(T(0)) == cuda::std::numeric_limits::digits && cuda::std::countr_zero(T(1)) == 0 - && cuda::std::countr_zero(T(2)) == 1 && cuda::std::countr_zero(T(3)) == 0 && cuda::std::countr_zero(T(4)) == 2 - && cuda::std::countr_zero(T(5)) == 0 && cuda::std::countr_zero(T(6)) == 1 && cuda::std::countr_zero(T(7)) == 0 - && cuda::std::countr_zero(T(8)) == 3 && cuda::std::countr_zero(T(9)) == 0 - && cuda::std::countr_zero(cuda::std::numeric_limits::max()) == 0; + static_assert(cuda::std::countr_zero(T(1)) == 0, ""); + static_assert(cuda::std::countr_zero(T(2)) == 1, ""); + static_assert(cuda::std::countr_zero(T(3)) == 0, ""); + static_assert(cuda::std::countr_zero(T(4)) == 2, ""); + static_assert(cuda::std::countr_zero(T(5)) == 0, ""); + static_assert(cuda::std::countr_zero(T(6)) == 1, ""); + static_assert(cuda::std::countr_zero(T(7)) == 0, ""); + static_assert(cuda::std::countr_zero(T(8)) == 3, ""); + static_assert(cuda::std::countr_zero(T(9)) == 0, ""); + static_assert(cuda::std::countr_zero(T(0)) == cuda::std::numeric_limits::digits, ""); + static_assert(cuda::std::countr_zero(cuda::std::numeric_limits::max()) == 0, ""); + + return true; +} + +template +__host__ __device__ inline void assert_countr_zero(T val, int expected) +{ + volatile auto v = val; + assert(cuda::std::countr_zero(v) == expected); } template @@ -51,36 +66,36 @@ __host__ __device__ void runtime_test() ASSERT_SAME_TYPE(int, decltype(cuda::std::countr_zero(T(0)))); ASSERT_NOEXCEPT(cuda::std::countr_zero(T(0))); - assert(cuda::std::countr_zero(T(121)) == 0); - assert(cuda::std::countr_zero(T(122)) == 1); - assert(cuda::std::countr_zero(T(123)) == 0); - assert(cuda::std::countr_zero(T(124)) == 2); - assert(cuda::std::countr_zero(T(125)) == 0); - assert(cuda::std::countr_zero(T(126)) == 1); - assert(cuda::std::countr_zero(T(127)) == 0); - assert(cuda::std::countr_zero(T(128)) == 7); - assert(cuda::std::countr_zero(T(129)) == 0); - assert(cuda::std::countr_zero(T(130)) == 1); + assert_countr_zero(T(121), 0); + assert_countr_zero(T(122), 1); + assert_countr_zero(T(123), 0); + assert_countr_zero(T(124), 2); + assert_countr_zero(T(125), 0); + assert_countr_zero(T(126), 1); + assert_countr_zero(T(127), 0); + assert_countr_zero(T(128), 7); + assert_countr_zero(T(129), 0); + assert_countr_zero(T(130), 1); } int main(int, char**) { - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - - static_assert(constexpr_test(), ""); - 
static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); #ifndef _LIBCUDACXX_HAS_NO_INT128 - static_assert(constexpr_test<__uint128_t>(), ""); + constexpr_test<__uint128_t>(); #endif runtime_test(); diff --git a/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/popcount.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/popcount.pass.cpp index 8e70c9ae56..393b9d31bd 100644 --- a/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/popcount.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/popcount.pass.cpp @@ -42,11 +42,26 @@ enum class E2 : unsigned char template __host__ __device__ constexpr bool constexpr_test() { - return cuda::std::popcount(T(0)) == 0 && cuda::std::popcount(T(1)) == 1 && cuda::std::popcount(T(2)) == 1 - && cuda::std::popcount(T(3)) == 2 && cuda::std::popcount(T(4)) == 1 && cuda::std::popcount(T(5)) == 2 - && cuda::std::popcount(T(6)) == 2 && cuda::std::popcount(T(7)) == 3 && cuda::std::popcount(T(8)) == 1 - && cuda::std::popcount(T(9)) == 2 - && cuda::std::popcount(cuda::std::numeric_limits::max()) == cuda::std::numeric_limits::digits; + static_assert(cuda::std::popcount(T(0)) == 0, ""); + static_assert(cuda::std::popcount(T(1)) == 1, ""); + static_assert(cuda::std::popcount(T(2)) == 1, ""); + static_assert(cuda::std::popcount(T(3)) == 2, ""); + static_assert(cuda::std::popcount(T(4)) == 1, ""); + static_assert(cuda::std::popcount(T(5)) == 2, ""); + static_assert(cuda::std::popcount(T(6)) == 2, ""); + static_assert(cuda::std::popcount(T(7)) == 3, ""); + static_assert(cuda::std::popcount(T(8)) == 1, ""); + static_assert(cuda::std::popcount(T(9)) == 2, ""); + static_assert(cuda::std::popcount(cuda::std::numeric_limits::max()) == cuda::std::numeric_limits::digits, ""); + + return true; +} + +template +__host__ __device__ inline void assert_popcount(T val, int expected) +{ + volatile auto v = val; + assert(cuda::std::popcount(v) == expected); } template @@ -55,36 +70,36 @@ __host__ __device__ void runtime_test() ASSERT_SAME_TYPE(int, decltype(cuda::std::popcount(T(0)))); ASSERT_NOEXCEPT(cuda::std::popcount(T(0))); - assert(cuda::std::popcount(T(121)) == 5); - assert(cuda::std::popcount(T(122)) == 5); - assert(cuda::std::popcount(T(123)) == 6); - assert(cuda::std::popcount(T(124)) == 5); - assert(cuda::std::popcount(T(125)) == 6); - assert(cuda::std::popcount(T(126)) == 6); - assert(cuda::std::popcount(T(127)) == 7); - assert(cuda::std::popcount(T(128)) == 1); - assert(cuda::std::popcount(T(129)) == 2); - assert(cuda::std::popcount(T(130)) == 2); + assert_popcount(T(121), 5); + assert_popcount(T(122), 5); + assert_popcount(T(123), 6); + assert_popcount(T(124), 5); + assert_popcount(T(125), 6); + assert_popcount(T(126), 6); + assert_popcount(T(127), 7); + assert_popcount(T(128), 1); + assert_popcount(T(129), 2); + assert_popcount(T(130), 2); } int main(int, char**) { - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - - static_assert(constexpr_test(), ""); - 
static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); #ifndef _LIBCUDACXX_HAS_NO_INT128 - static_assert(constexpr_test<__uint128_t>(), ""); + constexpr_test<__uint128_t>(); #endif runtime_test(); From fed3ec1abe2d603e22ee12fa9a61010ae9b9b553 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 16 Aug 2024 16:41:27 +0200 Subject: [PATCH 31/33] Proclaim pair and tuple trivially relocatable (#2010) --- .../cuda/std/detail/libcxx/include/complex | 3 + thrust/testing/type_traits.cu | 59 +++++++++++++++++++ thrust/thrust/pair.h | 8 +++ thrust/thrust/tuple.h | 6 ++ .../type_traits/is_trivially_relocatable.h | 16 +++++ 5 files changed, 92 insertions(+) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/complex b/libcudacxx/include/cuda/std/detail/libcxx/include/complex index dc596bd65b..b03b7d9ee6 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/complex +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/complex @@ -1467,6 +1467,9 @@ inline namespace literals inline namespace complex_literals { # ifdef _LIBCUDACXX_HAS_COMPLEX_LONG_DOUBLE +// NOTE: if you get a warning from GCC <7 here that "literal operator suffixes not preceded by ‘_’ are reserved for +// future standardization" then we are sorry. The warning was implemented before GCC 7, but can only be disabled since +// GCC 7. See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69523 _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr complex operator""il(long double __im) { return {0.0l, __im}; diff --git a/thrust/testing/type_traits.cu b/thrust/testing/type_traits.cu index bab73c76c4..f4ba3d0896 100644 --- a/thrust/testing/type_traits.cu +++ b/thrust/testing/type_traits.cu @@ -5,8 +5,17 @@ #include #include #include +#include +#include #include +#if defined(THRUST_GCC_VERSION) && THRUST_GCC_VERSION >= 70000 +// This header pulls in an unsuppressable warning on GCC 6 +# include +#endif // defined(THRUST_GCC_VERSION) && THRUST_GCC_VERSION >= 70000 +#include +#include + #include void TestIsContiguousIterator() @@ -146,3 +155,53 @@ void TestIsCommutative() } } DECLARE_UNITTEST(TestIsCommutative); + +struct NonTriviallyCopyable +{ + NonTriviallyCopyable(const NonTriviallyCopyable&) {} +}; +THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(NonTriviallyCopyable); + +static_assert(!::cuda::std::is_trivially_copyable::value, ""); +static_assert(thrust::is_trivially_relocatable::value, ""); + +void TestTriviallyRelocatable() +{ + static_assert(thrust::is_trivially_relocatable::value, ""); +#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA + static_assert(thrust::is_trivially_relocatable<__half>::value, ""); + static_assert(thrust::is_trivially_relocatable::value, ""); + static_assert(thrust::is_trivially_relocatable::value, ""); + static_assert(thrust::is_trivially_relocatable::value, ""); + static_assert(thrust::is_trivially_relocatable::value, ""); +# ifndef _LIBCUDACXX_HAS_NO_INT128 + static_assert(thrust::is_trivially_relocatable<__int128>::value, ""); +# endif // _LIBCUDACXX_HAS_NO_INT128 +#endif // THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA +#if defined(THRUST_GCC_VERSION) && 
THRUST_GCC_VERSION >= 70000 + static_assert(thrust::is_trivially_relocatable>::value, ""); + static_assert(thrust::is_trivially_relocatable<::cuda::std::complex>::value, ""); + static_assert(thrust::is_trivially_relocatable>>::value, ""); + static_assert(thrust::is_trivially_relocatable<::cuda::std::pair>>::value, ""); + static_assert(thrust::is_trivially_relocatable, char>>::value, ""); + static_assert(thrust::is_trivially_relocatable<::cuda::std::tuple, char>>::value, + ""); +#endif // defined(THRUST_GCC_VERSION) && THRUST_GCC_VERSION >= 70000 + static_assert(thrust::is_trivially_relocatable< + ::cuda::std::tuple>>, + thrust::tuple<::cuda::std::pair>, int>>>::value, + ""); + + static_assert(!thrust::is_trivially_relocatable>::value, ""); + static_assert(!thrust::is_trivially_relocatable<::cuda::std::pair>::value, ""); + static_assert(!thrust::is_trivially_relocatable>::value, ""); + static_assert(!thrust::is_trivially_relocatable<::cuda::std::tuple>::value, ""); + + // test propagation of relocatability through pair and tuple + static_assert(thrust::is_trivially_relocatable::value, ""); + static_assert(thrust::is_trivially_relocatable>::value, ""); + static_assert(thrust::is_trivially_relocatable<::cuda::std::pair>::value, ""); + static_assert(thrust::is_trivially_relocatable>::value, ""); + static_assert(thrust::is_trivially_relocatable<::cuda::std::tuple>::value, ""); +}; +DECLARE_UNITTEST(TestTriviallyRelocatable); diff --git a/thrust/thrust/pair.h b/thrust/thrust/pair.h index 0e567a35b6..def1aeaf17 100644 --- a/thrust/thrust/pair.h +++ b/thrust/thrust/pair.h @@ -30,6 +30,9 @@ # pragma system_header #endif // no system header +#include + +#include #include THRUST_NAMESPACE_BEGIN @@ -117,6 +120,11 @@ make_pair(T1&& t1, T2&& t2) using _CUDA_VSTD::get; +template +struct proclaim_trivially_relocatable> + : ::cuda::std::conjunction, is_trivially_relocatable> +{}; + /*! \endcond */ diff --git a/thrust/thrust/tuple.h b/thrust/thrust/tuple.h index 1ff1031804..3e7cd87dae 100644 --- a/thrust/thrust/tuple.h +++ b/thrust/thrust/tuple.h @@ -39,6 +39,8 @@ # pragma system_header #endif // no system header +#include + #include #include #include @@ -232,6 +234,10 @@ inline _CCCL_HOST_DEVICE tuple tie(Ts&... ts) noexcept using _CUDA_VSTD::get; +template +struct proclaim_trivially_relocatable> : ::cuda::std::conjunction...> +{}; + /*! \endcond */ diff --git a/thrust/thrust/type_traits/is_trivially_relocatable.h b/thrust/thrust/type_traits/is_trivially_relocatable.h index 7732d7c6ab..8566a51057 100644 --- a/thrust/thrust/type_traits/is_trivially_relocatable.h +++ b/thrust/thrust/type_traits/is_trivially_relocatable.h @@ -36,6 +36,10 @@ #include #include +#include +#include +#include + #include THRUST_NAMESPACE_BEGIN @@ -285,6 +289,18 @@ THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(double3) THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(double4) #endif +THRUST_NAMESPACE_BEGIN +template +struct proclaim_trivially_relocatable<::cuda::std::pair> + : ::cuda::std::conjunction, is_trivially_relocatable> +{}; + +template +struct proclaim_trivially_relocatable<::cuda::std::tuple> + : ::cuda::std::conjunction...> +{}; +THRUST_NAMESPACE_END + /*! 
\endcond */ From 4a5dcc4f9e3ebaddcc05f2fb1b243d852f7cea99 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 16 Aug 2024 18:55:37 +0200 Subject: [PATCH 32/33] Make `cuda::std::min` constexpr in C++11 (#2249) This should fix our rmm builds --- libcudacxx/include/cuda/std/__algorithm/min.h | 8 ++------ .../std/algorithms/alg.sorting/alg.min.max/min.pass.cpp | 4 ++++ .../algorithms/alg.sorting/alg.min.max/min_comp.pass.cpp | 4 ++++ 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/libcudacxx/include/cuda/std/__algorithm/min.h b/libcudacxx/include/cuda/std/__algorithm/min.h index 5d1d826038..047d3eb294 100644 --- a/libcudacxx/include/cuda/std/__algorithm/min.h +++ b/libcudacxx/include/cuda/std/__algorithm/min.h @@ -30,20 +30,18 @@ _CCCL_PUSH_MACROS _LIBCUDACXX_BEGIN_NAMESPACE_STD template -_CCCL_NODISCARD inline _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 const _Tp& +_CCCL_NODISCARD inline _LIBCUDACXX_INLINE_VISIBILITY constexpr const _Tp& min(const _Tp& __a, const _Tp& __b, _Compare __comp) { return __comp(__b, __a) ? __b : __a; } template -_CCCL_NODISCARD inline _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 const _Tp& min(const _Tp& __a, const _Tp& __b) +_CCCL_NODISCARD inline _LIBCUDACXX_INLINE_VISIBILITY constexpr const _Tp& min(const _Tp& __a, const _Tp& __b) { return _CUDA_VSTD::min(__a, __b, __less{}); } -#ifndef _LIBCUDACXX_CXX03_LANG - template _CCCL_NODISCARD inline _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 _Tp min(initializer_list<_Tp> __t, _Compare __comp) @@ -57,8 +55,6 @@ _CCCL_NODISCARD inline _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 _Tp m return *_CUDA_VSTD::min_element(__t.begin(), __t.end(), __less{}); } -#endif // _LIBCUDACXX_CXX03_LANG - _LIBCUDACXX_END_NAMESPACE_STD _CCCL_POP_MACROS diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/min.pass.cpp b/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/min.pass.cpp index 9f66fc5468..8d35bf42af 100644 --- a/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/min.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/min.pass.cpp @@ -54,6 +54,10 @@ int main(int, char**) test(); #if TEST_STD_VER >= 2014 static_assert(test(), ""); +#else // TEST_STD_VER >= 2014 + constexpr int x = 0; + constexpr int y = 1; + static_assert(&cuda::std::min(x, y) == &x, ""); #endif // TEST_STD_VER >= 2014 return 0; diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/min_comp.pass.cpp b/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/min_comp.pass.cpp index aac001f744..b08c1948a2 100644 --- a/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/min_comp.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/min_comp.pass.cpp @@ -56,6 +56,10 @@ int main(int, char**) test(); #if TEST_STD_VER >= 2014 static_assert(test(), ""); +#else // TEST_STD_VER >= 2014 + constexpr int x = 0; + constexpr int y = 1; + static_assert(&cuda::std::min(x, y, cuda::std::greater()) == &y, ""); #endif // TEST_STD_VER >= 2014 return 0; From ba9e9bbc20dca1ac49b5da6b4d1716d85b4f495e Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 16 Aug 2024 19:33:47 +0200 Subject: [PATCH 33/33] Add `CCCL_DISABLE_NVTX` macro (#2173) Fixes: #2172 --- cub/cub/detail/nvtx.cuh | 15 +++++++++++---- cub/test/test_nvtx_disabled.cu | 19 +++++++++++++++++++ 2 files changed, 30 insertions(+), 4 deletions(-) create mode 100644 cub/test/test_nvtx_disabled.cu 
diff --git a/cub/cub/detail/nvtx.cuh b/cub/cub/detail/nvtx.cuh index d570df3adc..a8422263fa 100644 --- a/cub/cub/detail/nvtx.cuh +++ b/cub/cub/detail/nvtx.cuh @@ -37,11 +37,16 @@ # pragma system_header #endif // no system header +#ifdef DOXYGEN_SHOULD_SKIP_THIS // Only parse this during doxygen passes: +//! When this macro is defined, no NVTX ranges are emitted by CCCL +# define CCCL_DISABLE_NVTX +#endif // DOXYGEN_SHOULD_SKIP_THIS + // Enable the functionality of this header if: // * The NVTX3 C API is available in CTK -// * NVTX is not explicitly disabled +// * NVTX is not explicitly disabled (via CCCL_DISABLE_NVTX or NVTX_DISABLE) // * C++14 is availabl for cuda::std::optional -#if __has_include( ) && !defined(NVTX_DISABLE) && _CCCL_STD_VER >= 2014 +#if __has_include( ) && !defined(CCCL_DISABLE_NVTX) && !defined(NVTX_DISABLE) && _CCCL_STD_VER >= 2014 // Include our NVTX3 C++ wrapper if not available from the CTK # if __has_include() // TODO(bgruber): replace by a check for the first CTK version shipping the header # include @@ -96,7 +101,9 @@ CUB_NAMESPACE_END # define CUB_DETAIL_NVTX_RANGE_SCOPE_IF(condition, name) # define CUB_DETAIL_NVTX_RANGE_SCOPE(name) # endif // NVTX3_CPP_DEFINITIONS_V1_0 -#else // __has_include( ) && !defined(NVTX_DISABLE) && _CCCL_STD_VER >= 2014 +#else // __has_include( ) && !defined(CCCL_DISABLE_NVTX) && !defined(NVTX_DISABLE) && _CCCL_STD_VER + // >= 2014 # define CUB_DETAIL_NVTX_RANGE_SCOPE_IF(condition, name) # define CUB_DETAIL_NVTX_RANGE_SCOPE(name) -#endif // __has_include( ) && !defined(NVTX_DISABLE) && _CCCL_STD_VER >= 2014 +#endif // __has_include( ) && !defined(CCCL_DISABLE_NVTX) && !defined(NVTX_DISABLE) && _CCCL_STD_VER + // >= 2014 diff --git a/cub/test/test_nvtx_disabled.cu b/cub/test/test_nvtx_disabled.cu new file mode 100644 index 0000000000..c6eba196b1 --- /dev/null +++ b/cub/test/test_nvtx_disabled.cu @@ -0,0 +1,19 @@ +#define CUB_DETAIL_BEFORE_NVTX_RANGE_SCOPE(name) static_assert(false, ""); +#define CCCL_DISABLE_NVTX + +#include + +#include + +#include + +#if defined(CCCL_DISABLE_NVTX) && defined(NVTX_VERSION) +# error "NVTX was included somewhere even though it is turned off via CCCL_DISABLE_NVTX" +#endif // defined(CCCL_DISABLE_NVTX) && defined(NVTX_VERSION) + +int main() +{ + thrust::counting_iterator it{0}; + cub::DeviceFor::ForEach(it, it + 16, ::cuda::std::negate{}); + cudaDeviceSynchronize(); +}
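
Usage sketch (illustrative only, not part of the patch series; the include paths and the `int` element type are assumptions based on the new `cub/test/test_nvtx_disabled.cu` test above): a consuming translation unit can turn off every NVTX range emitted by CCCL by defining `CCCL_DISABLE_NVTX` before the first CCCL include, or equivalently by passing `-DCCCL_DISABLE_NVTX` on the compiler command line:

// Hypothetical consumer TU (compiled with nvcc as a .cu file).
// CCCL_DISABLE_NVTX must be defined before any CCCL header is included.
#define CCCL_DISABLE_NVTX

#include <cub/device/device_for.cuh>            // assumed path for cub::DeviceFor::ForEach
#include <thrust/iterator/counting_iterator.h>  // assumed path for thrust::counting_iterator
#include <cuda/std/functional>                  // assumed path for cuda::std::negate

int main()
{
  thrust::counting_iterator<int> it{0};
  // With CCCL_DISABLE_NVTX defined, CUB_DETAIL_NVTX_RANGE_SCOPE expands to nothing,
  // so this algorithm invocation emits no NVTX range and pulls in no NVTX headers.
  cub::DeviceFor::ForEach(it, it + 16, ::cuda::std::negate<int>{});
  cudaDeviceSynchronize();
  return 0;
}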