From fe27d99255f43eacef77ccf9d308234d3532eafd Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Mon, 5 Aug 2024 16:27:15 +0200 Subject: [PATCH 01/33] Fix the `clang-format` path in the devcontainers (#2194) In the devcontainers `clang-format` is now installed into `/usr/bin/clang-format` --- .devcontainer/cuda11.1-gcc6/devcontainer.json | 2 +- .devcontainer/cuda11.1-gcc7/devcontainer.json | 2 +- .devcontainer/cuda11.1-gcc8/devcontainer.json | 2 +- .devcontainer/cuda11.1-gcc9/devcontainer.json | 2 +- .devcontainer/cuda11.1-llvm9/devcontainer.json | 2 +- .devcontainer/cuda11.8-gcc11/devcontainer.json | 2 +- .devcontainer/cuda12.0-gcc10/devcontainer.json | 2 +- .devcontainer/cuda12.0-gcc11/devcontainer.json | 2 +- .devcontainer/cuda12.0-gcc12/devcontainer.json | 2 +- .devcontainer/cuda12.0-gcc9/devcontainer.json | 2 +- .devcontainer/cuda12.0-llvm10/devcontainer.json | 2 +- .devcontainer/cuda12.0-llvm11/devcontainer.json | 2 +- .devcontainer/cuda12.0-llvm12/devcontainer.json | 2 +- .devcontainer/cuda12.0-llvm13/devcontainer.json | 2 +- .devcontainer/cuda12.0-llvm14/devcontainer.json | 2 +- .devcontainer/cuda12.0-llvm9/devcontainer.json | 2 +- .devcontainer/cuda12.5-gcc10/devcontainer.json | 2 +- .devcontainer/cuda12.5-gcc11/devcontainer.json | 2 +- .devcontainer/cuda12.5-gcc12/devcontainer.json | 2 +- .devcontainer/cuda12.5-gcc13/devcontainer.json | 2 +- .devcontainer/cuda12.5-gcc7/devcontainer.json | 2 +- .devcontainer/cuda12.5-gcc8/devcontainer.json | 2 +- .devcontainer/cuda12.5-gcc9/devcontainer.json | 2 +- .devcontainer/cuda12.5-llvm10/devcontainer.json | 2 +- .devcontainer/cuda12.5-llvm11/devcontainer.json | 2 +- .devcontainer/cuda12.5-llvm12/devcontainer.json | 2 +- .devcontainer/cuda12.5-llvm13/devcontainer.json | 2 +- .devcontainer/cuda12.5-llvm14/devcontainer.json | 2 +- .devcontainer/cuda12.5-llvm15/devcontainer.json | 2 +- .devcontainer/cuda12.5-llvm16/devcontainer.json | 2 +- .devcontainer/cuda12.5-llvm17/devcontainer.json | 2 +- .devcontainer/cuda12.5-llvm9/devcontainer.json | 2 +- .devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json | 2 +- .devcontainer/devcontainer.json | 2 +- 34 files changed, 34 insertions(+), 34 deletions(-) diff --git a/.devcontainer/cuda11.1-gcc6/devcontainer.json b/.devcontainer/cuda11.1-gcc6/devcontainer.json index ed345016ec..cd810c13dd 100644 --- a/.devcontainer/cuda11.1-gcc6/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc6/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda11.1-gcc7/devcontainer.json b/.devcontainer/cuda11.1-gcc7/devcontainer.json index b1ff078547..9db4454383 100644 --- a/.devcontainer/cuda11.1-gcc7/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc7/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda11.1-gcc8/devcontainer.json b/.devcontainer/cuda11.1-gcc8/devcontainer.json index f480d0003a..143b42abdf 100644 --- a/.devcontainer/cuda11.1-gcc8/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc8/devcontainer.json @@ -38,7 +38,7 @@ "settings": { 
"editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda11.1-gcc9/devcontainer.json b/.devcontainer/cuda11.1-gcc9/devcontainer.json index a622e14519..e5aaa70339 100644 --- a/.devcontainer/cuda11.1-gcc9/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc9/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda11.1-llvm9/devcontainer.json b/.devcontainer/cuda11.1-llvm9/devcontainer.json index 3eaa29a8b8..ccf1bd9a81 100644 --- a/.devcontainer/cuda11.1-llvm9/devcontainer.json +++ b/.devcontainer/cuda11.1-llvm9/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda11.8-gcc11/devcontainer.json b/.devcontainer/cuda11.8-gcc11/devcontainer.json index 4d03dc2de0..e8d2c3d94e 100644 --- a/.devcontainer/cuda11.8-gcc11/devcontainer.json +++ b/.devcontainer/cuda11.8-gcc11/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.0-gcc10/devcontainer.json b/.devcontainer/cuda12.0-gcc10/devcontainer.json index 1371a181a9..9c1c1c3328 100644 --- a/.devcontainer/cuda12.0-gcc10/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc10/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.0-gcc11/devcontainer.json b/.devcontainer/cuda12.0-gcc11/devcontainer.json index 2096821c11..c86d5cba2d 100644 --- a/.devcontainer/cuda12.0-gcc11/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc11/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.0-gcc12/devcontainer.json b/.devcontainer/cuda12.0-gcc12/devcontainer.json index e99c8debae..af192d3938 100644 --- a/.devcontainer/cuda12.0-gcc12/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc12/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git 
a/.devcontainer/cuda12.0-gcc9/devcontainer.json b/.devcontainer/cuda12.0-gcc9/devcontainer.json index 3154808232..434b1b69f2 100644 --- a/.devcontainer/cuda12.0-gcc9/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc9/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.0-llvm10/devcontainer.json b/.devcontainer/cuda12.0-llvm10/devcontainer.json index b4bf89b341..15f4c622f9 100644 --- a/.devcontainer/cuda12.0-llvm10/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm10/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.0-llvm11/devcontainer.json b/.devcontainer/cuda12.0-llvm11/devcontainer.json index b87d457cb7..f3bc6a558a 100644 --- a/.devcontainer/cuda12.0-llvm11/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm11/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.0-llvm12/devcontainer.json b/.devcontainer/cuda12.0-llvm12/devcontainer.json index 829ec1cb2e..032c783fef 100644 --- a/.devcontainer/cuda12.0-llvm12/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm12/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.0-llvm13/devcontainer.json b/.devcontainer/cuda12.0-llvm13/devcontainer.json index 60abc033be..eb700e0615 100644 --- a/.devcontainer/cuda12.0-llvm13/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm13/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.0-llvm14/devcontainer.json b/.devcontainer/cuda12.0-llvm14/devcontainer.json index a48b0bcd0c..935275ed5b 100644 --- a/.devcontainer/cuda12.0-llvm14/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm14/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.0-llvm9/devcontainer.json b/.devcontainer/cuda12.0-llvm9/devcontainer.json index 465478e431..7127d6c2db 100644 --- a/.devcontainer/cuda12.0-llvm9/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm9/devcontainer.json @@ -38,7 +38,7 @@ "settings": { 
"editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-gcc10/devcontainer.json b/.devcontainer/cuda12.5-gcc10/devcontainer.json index 5a59153bf3..b16f5b5d4d 100644 --- a/.devcontainer/cuda12.5-gcc10/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc10/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-gcc11/devcontainer.json b/.devcontainer/cuda12.5-gcc11/devcontainer.json index 42b668abf1..c3c5ca3199 100644 --- a/.devcontainer/cuda12.5-gcc11/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc11/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-gcc12/devcontainer.json b/.devcontainer/cuda12.5-gcc12/devcontainer.json index d807d4cd30..f3996dac8e 100644 --- a/.devcontainer/cuda12.5-gcc12/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc12/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-gcc13/devcontainer.json b/.devcontainer/cuda12.5-gcc13/devcontainer.json index 01364fdbc2..74031d3657 100644 --- a/.devcontainer/cuda12.5-gcc13/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc13/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-gcc7/devcontainer.json b/.devcontainer/cuda12.5-gcc7/devcontainer.json index a632769505..88f0060a87 100644 --- a/.devcontainer/cuda12.5-gcc7/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc7/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-gcc8/devcontainer.json b/.devcontainer/cuda12.5-gcc8/devcontainer.json index f0aff7ba7b..9f8b6020c5 100644 --- a/.devcontainer/cuda12.5-gcc8/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc8/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git 
a/.devcontainer/cuda12.5-gcc9/devcontainer.json b/.devcontainer/cuda12.5-gcc9/devcontainer.json index e050d23303..422a20c62b 100644 --- a/.devcontainer/cuda12.5-gcc9/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc9/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-llvm10/devcontainer.json b/.devcontainer/cuda12.5-llvm10/devcontainer.json index 0cda7b0a66..028509f6ef 100644 --- a/.devcontainer/cuda12.5-llvm10/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm10/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-llvm11/devcontainer.json b/.devcontainer/cuda12.5-llvm11/devcontainer.json index 1a513873f1..5f4d3f4c1d 100644 --- a/.devcontainer/cuda12.5-llvm11/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm11/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-llvm12/devcontainer.json b/.devcontainer/cuda12.5-llvm12/devcontainer.json index a11a351e30..2b9ecc320b 100644 --- a/.devcontainer/cuda12.5-llvm12/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm12/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-llvm13/devcontainer.json b/.devcontainer/cuda12.5-llvm13/devcontainer.json index 0136655f0c..933ad59af7 100644 --- a/.devcontainer/cuda12.5-llvm13/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm13/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-llvm14/devcontainer.json b/.devcontainer/cuda12.5-llvm14/devcontainer.json index dd9d6a62f0..72e7e0275d 100644 --- a/.devcontainer/cuda12.5-llvm14/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm14/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-llvm15/devcontainer.json b/.devcontainer/cuda12.5-llvm15/devcontainer.json index 51fd6a1466..c3086986e9 100644 --- a/.devcontainer/cuda12.5-llvm15/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm15/devcontainer.json @@ -38,7 +38,7 @@ "settings": 
{ "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-llvm16/devcontainer.json b/.devcontainer/cuda12.5-llvm16/devcontainer.json index 882025ddaf..2db6386576 100644 --- a/.devcontainer/cuda12.5-llvm16/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm16/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-llvm17/devcontainer.json b/.devcontainer/cuda12.5-llvm17/devcontainer.json index 55fa86ff53..44fb4cbec7 100644 --- a/.devcontainer/cuda12.5-llvm17/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm17/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-llvm9/devcontainer.json b/.devcontainer/cuda12.5-llvm9/devcontainer.json index 3b2a328c2e..f95daf28f0 100644 --- a/.devcontainer/cuda12.5-llvm9/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm9/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json b/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json index 5e4b04e19b..92e692ae14 100644 --- a/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json +++ b/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 01364fdbc2..74031d3657 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -38,7 +38,7 @@ "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, - "clang-format.executable": "/usr/local/bin/clang-format", + "clang-format.executable": "/usr/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], From d1e7c1cc82df61d3a59569c8995fbc652c3f1f7c Mon Sep 17 00:00:00 2001 From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com> Date: Mon, 5 Aug 2024 11:11:29 -0700 Subject: [PATCH 02/33] Mount a build directory for CCCL projects if WSL is detected (#2035) Co-authored-by: Michael Schellenberger Costa --- .devcontainer/cuda11.1-gcc6/devcontainer.json | 6 ++++-- .devcontainer/cuda11.1-gcc7/devcontainer.json | 6 ++++-- .devcontainer/cuda11.1-gcc8/devcontainer.json | 6 ++++-- .devcontainer/cuda11.1-gcc9/devcontainer.json | 6 ++++-- .devcontainer/cuda11.1-llvm9/devcontainer.json | 
6 ++++-- .devcontainer/cuda11.8-gcc11/devcontainer.json | 6 ++++-- .devcontainer/cuda12.0-gcc10/devcontainer.json | 6 ++++-- .devcontainer/cuda12.0-gcc11/devcontainer.json | 6 ++++-- .devcontainer/cuda12.0-gcc12/devcontainer.json | 6 ++++-- .devcontainer/cuda12.0-gcc9/devcontainer.json | 6 ++++-- .devcontainer/cuda12.0-llvm10/devcontainer.json | 6 ++++-- .devcontainer/cuda12.0-llvm11/devcontainer.json | 6 ++++-- .devcontainer/cuda12.0-llvm12/devcontainer.json | 6 ++++-- .devcontainer/cuda12.0-llvm13/devcontainer.json | 6 ++++-- .devcontainer/cuda12.0-llvm14/devcontainer.json | 6 ++++-- .devcontainer/cuda12.0-llvm9/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-gcc10/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-gcc11/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-gcc12/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-gcc13/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-gcc7/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-gcc8/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-gcc9/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-llvm10/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-llvm11/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-llvm12/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-llvm13/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-llvm14/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-llvm15/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-llvm16/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-llvm17/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-llvm9/devcontainer.json | 6 ++++-- .devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json | 6 ++++-- .devcontainer/devcontainer.json | 6 ++++-- 34 files changed, 136 insertions(+), 68 deletions(-) diff --git a/.devcontainer/cuda11.1-gcc6/devcontainer.json b/.devcontainer/cuda11.1-gcc6/devcontainer.json index cd810c13dd..401a33ba59 100644 --- a/.devcontainer/cuda11.1-gcc6/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc6/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda11.1-gcc7/devcontainer.json b/.devcontainer/cuda11.1-gcc7/devcontainer.json index 9db4454383..5298b39143 100644 --- a/.devcontainer/cuda11.1-gcc7/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc7/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; 
else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda11.1-gcc8/devcontainer.json b/.devcontainer/cuda11.1-gcc8/devcontainer.json index 143b42abdf..27ca3c28a0 100644 --- a/.devcontainer/cuda11.1-gcc8/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc8/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda11.1-gcc9/devcontainer.json b/.devcontainer/cuda11.1-gcc9/devcontainer.json index e5aaa70339..ff592b79f5 100644 --- a/.devcontainer/cuda11.1-gcc9/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc9/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda11.1-llvm9/devcontainer.json b/.devcontainer/cuda11.1-llvm9/devcontainer.json index ccf1bd9a81..e8a167e1c9 100644 --- a/.devcontainer/cuda11.1-llvm9/devcontainer.json +++ b/.devcontainer/cuda11.1-llvm9/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p 
${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda11.8-gcc11/devcontainer.json b/.devcontainer/cuda11.8-gcc11/devcontainer.json index e8d2c3d94e..cdb8a4250b 100644 --- a/.devcontainer/cuda11.8-gcc11/devcontainer.json +++ b/.devcontainer/cuda11.8-gcc11/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.0-gcc10/devcontainer.json b/.devcontainer/cuda12.0-gcc10/devcontainer.json index 9c1c1c3328..4d081b3125 100644 --- a/.devcontainer/cuda12.0-gcc10/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc10/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.0-gcc11/devcontainer.json b/.devcontainer/cuda12.0-gcc11/devcontainer.json 
index c86d5cba2d..4dd297c412 100644 --- a/.devcontainer/cuda12.0-gcc11/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc11/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.0-gcc12/devcontainer.json b/.devcontainer/cuda12.0-gcc12/devcontainer.json index af192d3938..660e98109f 100644 --- a/.devcontainer/cuda12.0-gcc12/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc12/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.0-gcc9/devcontainer.json b/.devcontainer/cuda12.0-gcc9/devcontainer.json index 434b1b69f2..1f781d5852 100644 --- a/.devcontainer/cuda12.0-gcc9/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc9/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + 
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.0-llvm10/devcontainer.json b/.devcontainer/cuda12.0-llvm10/devcontainer.json index 15f4c622f9..e716b5eb6c 100644 --- a/.devcontainer/cuda12.0-llvm10/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm10/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.0-llvm11/devcontainer.json b/.devcontainer/cuda12.0-llvm11/devcontainer.json index f3bc6a558a..399b306075 100644 --- a/.devcontainer/cuda12.0-llvm11/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm11/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.0-llvm12/devcontainer.json b/.devcontainer/cuda12.0-llvm12/devcontainer.json index 032c783fef..d977c15ff3 100644 --- a/.devcontainer/cuda12.0-llvm12/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm12/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ 
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.0-llvm13/devcontainer.json b/.devcontainer/cuda12.0-llvm13/devcontainer.json index eb700e0615..016695d62c 100644 --- a/.devcontainer/cuda12.0-llvm13/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm13/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.0-llvm14/devcontainer.json b/.devcontainer/cuda12.0-llvm14/devcontainer.json index 935275ed5b..1fc144da6f 100644 --- a/.devcontainer/cuda12.0-llvm14/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm14/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.0-llvm9/devcontainer.json b/.devcontainer/cuda12.0-llvm9/devcontainer.json index 7127d6c2db..8bd0756dd4 100644 --- a/.devcontainer/cuda12.0-llvm9/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm9/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume 
create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-gcc10/devcontainer.json b/.devcontainer/cuda12.5-gcc10/devcontainer.json index b16f5b5d4d..61459a25fc 100644 --- a/.devcontainer/cuda12.5-gcc10/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc10/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-gcc11/devcontainer.json b/.devcontainer/cuda12.5-gcc11/devcontainer.json index c3c5ca3199..184de8734c 100644 --- a/.devcontainer/cuda12.5-gcc11/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc11/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-gcc12/devcontainer.json b/.devcontainer/cuda12.5-gcc12/devcontainer.json index f3996dac8e..1d16b6aa61 100644 --- a/.devcontainer/cuda12.5-gcc12/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc12/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir 
-m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-gcc13/devcontainer.json b/.devcontainer/cuda12.5-gcc13/devcontainer.json index 74031d3657..0f3fbb36f5 100644 --- a/.devcontainer/cuda12.5-gcc13/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc13/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-gcc7/devcontainer.json b/.devcontainer/cuda12.5-gcc7/devcontainer.json index 88f0060a87..9d5d356ad5 100644 --- a/.devcontainer/cuda12.5-gcc7/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc7/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-gcc8/devcontainer.json 
b/.devcontainer/cuda12.5-gcc8/devcontainer.json index 9f8b6020c5..10b44d31f1 100644 --- a/.devcontainer/cuda12.5-gcc8/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc8/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-gcc9/devcontainer.json b/.devcontainer/cuda12.5-gcc9/devcontainer.json index 422a20c62b..333c11b3cc 100644 --- a/.devcontainer/cuda12.5-gcc9/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc9/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-llvm10/devcontainer.json b/.devcontainer/cuda12.5-llvm10/devcontainer.json index 028509f6ef..8e3e19d4fc 100644 --- a/.devcontainer/cuda12.5-llvm10/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm10/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + 
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-llvm11/devcontainer.json b/.devcontainer/cuda12.5-llvm11/devcontainer.json index 5f4d3f4c1d..a216720e5d 100644 --- a/.devcontainer/cuda12.5-llvm11/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm11/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-llvm12/devcontainer.json b/.devcontainer/cuda12.5-llvm12/devcontainer.json index 2b9ecc320b..e1cbc4ecb7 100644 --- a/.devcontainer/cuda12.5-llvm12/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm12/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-llvm13/devcontainer.json b/.devcontainer/cuda12.5-llvm13/devcontainer.json index 933ad59af7..6fbbf56b79 100644 --- a/.devcontainer/cuda12.5-llvm13/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm13/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ 
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-llvm14/devcontainer.json b/.devcontainer/cuda12.5-llvm14/devcontainer.json index 72e7e0275d..b8528e989f 100644 --- a/.devcontainer/cuda12.5-llvm14/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm14/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-llvm15/devcontainer.json b/.devcontainer/cuda12.5-llvm15/devcontainer.json index c3086986e9..768d3163ee 100644 --- a/.devcontainer/cuda12.5-llvm15/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm15/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-llvm16/devcontainer.json b/.devcontainer/cuda12.5-llvm16/devcontainer.json index 2db6386576..8ba700fa4e 100644 --- a/.devcontainer/cuda12.5-llvm16/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm16/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker 
volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-llvm17/devcontainer.json b/.devcontainer/cuda12.5-llvm17/devcontainer.json index 44fb4cbec7..0de5689fdc 100644 --- a/.devcontainer/cuda12.5-llvm17/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm17/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-llvm9/devcontainer.json b/.devcontainer/cuda12.5-llvm9/devcontainer.json index f95daf28f0..d34ae01844 100644 --- a/.devcontainer/cuda12.5-llvm9/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm9/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json b/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json index 92e692ae14..a530527cac 100644 --- a/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json +++ b/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json @@ -7,7 +7,8 @@ 
"initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 74031d3657..0f3fbb36f5 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -7,7 +7,8 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", @@ -25,7 +26,8 @@ "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" ], "customizations": { "vscode": { From 75929cb688b244c2089a4c2ddf1b406d086c2ad9 Mon Sep 17 00:00:00 2001 From: pciolkosz Date: Mon, 5 Aug 2024 23:51:34 -0700 Subject: [PATCH 03/33] 2118 [CUDAX] Change the RAII device swapper to use driver API and add it in places where it was missing (#2192) * Change __scoped_device to use driver API * Switch to use driver API based dev setter * Remove constexpr from operator device() * Fix comments and includes * Fallback to non-versioned get entry point pre 12.5 We need to use versioned version to get correct cuStreamGetCtx. 
There is v2 version of it in 12.5, fortunatelly the versioned get entry point is available there too * Fix unused local variable * Fix warnings in ensure_current_device test * Move ensure current device out of detail * Add LIBCUDACXX_ENABLE_EXCEPTIONS to tests cmake --- .../cuda/experimental/__device/device.cuh | 32 ++++- .../cuda/experimental/__device/device_ref.cuh | 64 --------- .../cuda/experimental/__event/event.cuh | 10 +- .../cuda/experimental/__event/event_ref.cuh | 5 +- .../cuda/experimental/__event/timed_event.cuh | 2 +- .../cuda/experimental/__launch/launch.cuh | 7 + .../cuda/experimental/__stream/stream.cuh | 17 ++- .../experimental/__utility/driver_api.cuh | 70 ++++++++- .../__utility/ensure_current_device.cuh | 80 +++++++++++ cudax/test/CMakeLists.txt | 2 + cudax/test/common/utility.cuh | 28 ++++ cudax/test/device/device_smoke.cu | 7 +- cudax/test/launch/configuration.cu | 1 - cudax/test/launch/launch_smoke.cu | 1 - cudax/test/stream/get_stream.cu | 1 - cudax/test/stream/stream_smoke.cu | 1 - cudax/test/utility/driver_api.cu | 49 +++++-- cudax/test/utility/ensure_current_device.cu | 135 ++++++++++++++++++ 18 files changed, 412 insertions(+), 100 deletions(-) create mode 100644 cudax/include/cuda/experimental/__utility/ensure_current_device.cuh create mode 100644 cudax/test/utility/ensure_current_device.cu diff --git a/cudax/include/cuda/experimental/__device/device.cuh b/cudax/include/cuda/experimental/__device/device.cuh index f91b0089d5..35e0cfe2d4 100644 --- a/cudax/include/cuda/experimental/__device/device.cuh +++ b/cudax/include/cuda/experimental/__device/device.cuh @@ -21,7 +21,13 @@ # pragma system_header #endif // no system header +#include + #include +#include + +#include +#include namespace cuda::experimental { @@ -33,7 +39,7 @@ struct __emplace_device { int __id_; - _CCCL_NODISCARD constexpr operator device() const noexcept; + _CCCL_NODISCARD operator device() const noexcept; _CCCL_NODISCARD constexpr const __emplace_device* operator->() const noexcept; }; @@ -56,6 +62,24 @@ public: # endif #endif + CUcontext primary_context() const + { + ::std::call_once(__init_once, [this]() { + __device = detail::driver::deviceGet(__id_); + __primary_ctx = detail::driver::primaryCtxRetain(__device); + }); + assert(__primary_ctx != nullptr); + return __primary_ctx; + } + + ~device() + { + if (__primary_ctx) + { + detail::driver::primaryCtxRelease(__device); + } + } + private: // TODO: put a mutable thread-safe (or thread_local) cache of device // properties here. @@ -63,6 +87,10 @@ private: friend class device_ref; friend struct detail::__emplace_device; + mutable CUcontext __primary_ctx = nullptr; + mutable CUdevice __device{}; + mutable ::std::once_flag __init_once; + explicit constexpr device(int __id) noexcept : device_ref(__id) {} @@ -76,7 +104,7 @@ private: namespace detail { -_CCCL_NODISCARD inline constexpr __emplace_device::operator device() const noexcept +_CCCL_NODISCARD inline __emplace_device::operator device() const noexcept { return device(__id_); } diff --git a/cudax/include/cuda/experimental/__device/device_ref.cuh b/cudax/include/cuda/experimental/__device/device_ref.cuh index f5945914da..7f2635611f 100644 --- a/cudax/include/cuda/experimental/__device/device_ref.cuh +++ b/cudax/include/cuda/experimental/__device/device_ref.cuh @@ -22,7 +22,6 @@ #endif // no system header #include -#include namespace cuda::experimental { @@ -103,69 +102,6 @@ public: } }; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -//! 
@brief RAII helper which saves the current device and switches to the -//! specified device on construction and switches to the saved device on -//! destruction. -//! -struct __scoped_device -{ -private: - // The original device ordinal, or -1 if the device was not changed. - int const __old_device; - - //! @brief Returns the current device ordinal. - //! - //! @throws cuda_error if the device query fails. - static int __current_device() - { - int device = -1; - _CCCL_TRY_CUDA_API(cudaGetDevice, "failed to get the current device", &device); - return device; - } - - explicit __scoped_device(int new_device, int old_device) noexcept - : __old_device(new_device == old_device ? -1 : old_device) - {} - -public: - //! @brief Construct a new `__scoped_device` object and switch to the specified - //! device. - //! - //! @param new_device The device to switch to - //! - //! @throws cuda_error if the device switch fails - explicit __scoped_device(device_ref new_device) - : __scoped_device(new_device.get(), __current_device()) - { - if (__old_device != -1) - { - _CCCL_TRY_CUDA_API(cudaSetDevice, "failed to set the current device", new_device.get()); - } - } - - __scoped_device(__scoped_device&&) = delete; - __scoped_device(__scoped_device const&) = delete; - __scoped_device& operator=(__scoped_device&&) = delete; - __scoped_device& operator=(__scoped_device const&) = delete; - - //! @brief Destroy the `__scoped_device` object and switch back to the original - //! device. - //! - //! @throws cuda_error if the device switch fails. If the destructor is called - //! during stack unwinding, the program is automatically terminated. - ~__scoped_device() noexcept(false) - { - if (__old_device != -1) - { - _CCCL_TRY_CUDA_API(cudaSetDevice, "failed to restore the current device", __old_device); - } - } -}; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - } // namespace cuda::experimental #endif // _CUDAX__DEVICE_DEVICE_REF diff --git a/cudax/include/cuda/experimental/__event/event.cuh b/cudax/include/cuda/experimental/__event/event.cuh index 0b6b7802b2..3ce997c55c 100644 --- a/cudax/include/cuda/experimental/__event/event.cuh +++ b/cudax/include/cuda/experimental/__event/event.cuh @@ -30,6 +30,7 @@ #include #include +#include namespace cuda::experimental { @@ -54,7 +55,7 @@ public: //! //! @throws cuda_error if the event creation fails. 
explicit event(stream_ref __stream, flags __flags = flags::none) - : event(static_cast(__flags) | cudaEventDisableTiming) + : event(__stream, static_cast(__flags) | cudaEventDisableTiming) { record(__stream); } @@ -85,7 +86,9 @@ public: { if (__event_ != nullptr) { - [[maybe_unused]] auto __status = ::cudaEventDestroy(__event_); + // Needs to call driver API in case current device is not set, runtime version would set dev 0 current + // Alternative would be to store the device and push/pop here + [[maybe_unused]] auto __status = detail::driver::eventDestroy(__event_); } } @@ -144,9 +147,10 @@ private: : event_ref(__evnt) {} - explicit event(unsigned int __flags) + explicit event(stream_ref __stream, unsigned int __flags) : event_ref(::cudaEvent_t{}) { + [[maybe_unused]] __ensure_current_device __dev_setter(__stream); _CCCL_TRY_CUDA_API( ::cudaEventCreateWithFlags, "Failed to create CUDA event", &__event_, static_cast(__flags)); } diff --git a/cudax/include/cuda/experimental/__event/event_ref.cuh b/cudax/include/cuda/experimental/__event/event_ref.cuh index b795d46a77..3b0ccc6dbc 100644 --- a/cudax/include/cuda/experimental/__event/event_ref.cuh +++ b/cudax/include/cuda/experimental/__event/event_ref.cuh @@ -30,6 +30,8 @@ #include #include +#include + namespace cuda::experimental { class event; @@ -74,7 +76,8 @@ public: { assert(__event_ != nullptr); assert(__stream.get() != nullptr); - _CCCL_TRY_CUDA_API(::cudaEventRecord, "Failed to record CUDA event", __event_, __stream.get()); + // Need to use driver API, cudaEventRecord will push dev 0 if stack is empty + detail::driver::eventRecord(__event_, __stream.get()); } //! @brief Waits until all the work in the stream prior to the record of the diff --git a/cudax/include/cuda/experimental/__event/timed_event.cuh b/cudax/include/cuda/experimental/__event/timed_event.cuh index debcbcd26e..48b9b0f1a5 100644 --- a/cudax/include/cuda/experimental/__event/timed_event.cuh +++ b/cudax/include/cuda/experimental/__event/timed_event.cuh @@ -42,7 +42,7 @@ public: //! //! @throws cuda_error if the event creation fails. explicit timed_event(stream_ref __stream, flags __flags = flags::none) - : event(static_cast(__flags)) + : event(__stream, static_cast(__flags)) { record(__stream); } diff --git a/cudax/include/cuda/experimental/__launch/launch.cuh b/cudax/include/cuda/experimental/__launch/launch.cuh index 790af2a9d5..1a49cafa40 100644 --- a/cudax/include/cuda/experimental/__launch/launch.cuh +++ b/cudax/include/cuda/experimental/__launch/launch.cuh @@ -16,6 +16,7 @@ #include #include +#include #if _CCCL_STD_VER >= 2017 namespace cuda::experimental @@ -119,6 +120,7 @@ template & conf, const Kernel& kernel, Args... args) { + [[maybe_unused]] __ensure_current_device __dev_setter(stream); cudaError_t status; if constexpr (::cuda::std::is_invocable_v, Args...>) { @@ -181,6 +183,7 @@ void launch( template void launch(::cuda::stream_ref stream, const hierarchy_dimensions& dims, const Kernel& kernel, Args... args) { + [[maybe_unused]] __ensure_current_device __dev_setter(stream); cudaError_t status; if constexpr (::cuda::std::is_invocable_v, Args...>) { @@ -245,6 +248,7 @@ void launch(::cuda::stream_ref stream, void (*kernel)(kernel_config, ExpArgs...), ActArgs&&... args) { + [[maybe_unused]] __ensure_current_device __dev_setter(stream); cudaError_t status = [&](ExpArgs... 
args) { return detail::launch_impl(stream, conf, kernel, conf, args...); }(std::forward(args)...); @@ -299,6 +303,7 @@ void launch(::cuda::stream_ref stream, void (*kernel)(hierarchy_dimensions, ExpArgs...), ActArgs&&... args) { + [[maybe_unused]] __ensure_current_device __dev_setter(stream); cudaError_t status = [&](ExpArgs... args) { return detail::launch_impl(stream, kernel_config(dims), kernel, dims, args...); }(std::forward(args)...); @@ -354,6 +359,7 @@ void launch(::cuda::stream_ref stream, void (*kernel)(ExpArgs...), ActArgs&&... args) { + [[maybe_unused]] __ensure_current_device __dev_setter(stream); cudaError_t status = [&](ExpArgs... args) { return detail::launch_impl(stream, conf, kernel, args...); }(std::forward(args)...); @@ -406,6 +412,7 @@ template void launch( ::cuda::stream_ref stream, const hierarchy_dimensions& dims, void (*kernel)(ExpArgs...), ActArgs&&... args) { + [[maybe_unused]] __ensure_current_device __dev_setter(stream); cudaError_t status = [&](ExpArgs... args) { return detail::launch_impl(stream, kernel_config(dims), kernel, args...); }(std::forward(args)...); diff --git a/cudax/include/cuda/experimental/__stream/stream.cuh b/cudax/include/cuda/experimental/__stream/stream.cuh index 4859e9fabc..0ba125269b 100644 --- a/cudax/include/cuda/experimental/__stream/stream.cuh +++ b/cudax/include/cuda/experimental/__stream/stream.cuh @@ -27,6 +27,7 @@ #include #include +#include namespace cuda::experimental { @@ -51,7 +52,7 @@ struct stream : stream_ref //! @throws cuda_error if stream creation fails explicit stream(device_ref __dev, int __priority = default_priority) { - __scoped_device dev_setter(__dev); + [[maybe_unused]] __ensure_current_device __dev_setter(__dev); _CCCL_TRY_CUDA_API( ::cudaStreamCreateWithPriority, "Failed to create a stream", &__stream, cudaStreamDefault, __priority); } @@ -89,7 +90,9 @@ struct stream : stream_ref { if (__stream != detail::invalid_stream) { - [[maybe_unused]] auto status = ::cudaStreamDestroy(__stream); + // Needs to call driver API in case current device is not set, runtime version would set dev 0 current + // Alternative would be to store the device and push/pop here + [[maybe_unused]] auto status = detail::driver::streamDestroy(__stream); } } @@ -139,18 +142,20 @@ struct stream : stream_ref void wait(event_ref __ev) const { assert(__ev.get() != nullptr); - _CCCL_TRY_CUDA_API(::cudaStreamWaitEvent, "Failed to make a stream wait for an event", get(), __ev.get()); + // Need to use driver API, cudaStreamWaitEvent would push dev 0 if stack was empty + detail::driver::streamWaitEvent(get(), __ev.get()); } - //! @brief Make all future work submitted into this stream depend on completion of all work from the specified stream + //! @brief Make all future work submitted into this stream depend on completion of all work from the specified + //! stream //! //! @param __other Stream that this stream should wait for //! //! 
@throws cuda_error if inserting the dependency fails void wait(stream_ref __other) const { - // TODO consider an optimization to not create an event every time and instead have one persistent event or one per - // stream + // TODO consider an optimization to not create an event every time and instead have one persistent event or one + // per stream assert(__stream != detail::invalid_stream); event __tmp(__other); wait(__tmp); diff --git a/cudax/include/cuda/experimental/__utility/driver_api.cuh b/cudax/include/cuda/experimental/__utility/driver_api.cuh index 21b8c4d742..8a52dd89fc 100644 --- a/cudax/include/cuda/experimental/__utility/driver_api.cuh +++ b/cudax/include/cuda/experimental/__utility/driver_api.cuh @@ -25,7 +25,13 @@ inline void* get_driver_entry_point(const char* name) { void* fn; cudaDriverEntryPointQueryResult result; +#if CUDART_VERSION >= 12050 + // For minor version compatibility request the 12.0 version of everything for now + cudaGetDriverEntryPointByVersion(name, &fn, 12000, cudaEnableDefault, &result); +#else + // Versioned get entry point not available before 12.5, but we don't need anything versioned before that cudaGetDriverEntryPoint(name, &fn, cudaEnableDefault, &result); +#endif if (result != cudaDriverEntryPointSuccess) { if (result == cudaDriverEntryPointVersionNotSufficent) @@ -56,11 +62,12 @@ inline void ctxPush(CUcontext ctx) call_driver_fn(driver_fn, "Failed to push context", ctx); } -inline void ctxPop() +inline CUcontext ctxPop() { static auto driver_fn = CUDAX_GET_DRIVER_FUNCTION(cuCtxPopCurrent); - CUcontext dummy; - call_driver_fn(driver_fn, "Failed to pop context", &dummy); + CUcontext result; + call_driver_fn(driver_fn, "Failed to pop context", &result); + return result; } inline CUcontext ctxGetCurrent() @@ -71,6 +78,38 @@ inline CUcontext ctxGetCurrent() return result; } +inline CUdevice deviceGet(int ordinal) +{ + static auto driver_fn = CUDAX_GET_DRIVER_FUNCTION(cuDeviceGet); + CUdevice result; + call_driver_fn(driver_fn, "Failed to get device", &result, ordinal); + return result; +} + +inline CUcontext primaryCtxRetain(CUdevice dev) +{ + static auto driver_fn = CUDAX_GET_DRIVER_FUNCTION(cuDevicePrimaryCtxRetain); + CUcontext result; + call_driver_fn(driver_fn, "Failed to retain context for a device", &result, dev); + return result; +} + +inline void primaryCtxRelease(CUdevice dev) +{ + static auto driver_fn = CUDAX_GET_DRIVER_FUNCTION(cuDevicePrimaryCtxRelease); + // TODO we might need to ignore failure here + call_driver_fn(driver_fn, "Failed to release context for a device", dev); +} + +inline bool isPrimaryCtxActive(CUdevice dev) +{ + static auto driver_fn = CUDAX_GET_DRIVER_FUNCTION(cuDevicePrimaryCtxGetState); + int result; + unsigned int dummy; + call_driver_fn(driver_fn, "Failed to check the primary ctx state", dev, &dummy, &result); + return result == 1; +} + inline CUcontext streamGetCtx(CUstream stream) { static auto driver_fn = CUDAX_GET_DRIVER_FUNCTION(cuStreamGetCtx); @@ -78,6 +117,31 @@ inline CUcontext streamGetCtx(CUstream stream) call_driver_fn(driver_fn, "Failed to get context from a stream", stream, &result); return result; } + +inline void streamWaitEvent(CUstream stream, CUevent event) +{ + static auto driver_fn = CUDAX_GET_DRIVER_FUNCTION(cuStreamWaitEvent); + call_driver_fn(driver_fn, "Failed to make a stream wait for an event", stream, event, CU_EVENT_WAIT_DEFAULT); +} + +inline void eventRecord(CUevent event, CUstream stream) +{ + static auto driver_fn = CUDAX_GET_DRIVER_FUNCTION(cuEventRecord); + 
call_driver_fn(driver_fn, "Failed to record CUDA event", event, stream); +} + +// Destroy calls return error codes to let the calling code decide if the error should be ignored +inline cudaError_t streamDestroy(CUstream stream) +{ + static auto driver_fn = CUDAX_GET_DRIVER_FUNCTION(cuStreamDestroy); + return static_cast(driver_fn(stream)); +} + +inline cudaError_t eventDestroy(CUevent event) +{ + static auto driver_fn = CUDAX_GET_DRIVER_FUNCTION(cuEventDestroy); + return static_cast(driver_fn(event)); +} } // namespace cuda::experimental::detail::driver #undef CUDAX_GET_DRIVER_FUNCTION diff --git a/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh b/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh new file mode 100644 index 0000000000..2431d02818 --- /dev/null +++ b/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh @@ -0,0 +1,80 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX__UTILITY_ENSURE_CURRENT_DEVICE +#define _CUDAX__UTILITY_ENSURE_CURRENT_DEVICE + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include +#include + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +namespace cuda::experimental +{ +//! @brief RAII helper which on construction sets the current device to the specified one or one a +//! stream was created under. It sets the state back on destruction. +//! +struct __ensure_current_device +{ + //! @brief Construct a new `__ensure_current_device` object and switch to the specified + //! device. + //! + //! @param new_device The device to switch to + //! + //! @throws cuda_error if the device switch fails + explicit __ensure_current_device(device_ref new_device) + { + auto ctx = devices[new_device.get()].primary_context(); + detail::driver::ctxPush(ctx); + } + + //! @brief Construct a new `__ensure_current_device` object and switch to the device + //! under which the specified stream was created. + //! + //! @param stream Stream indicating the device to switch to + //! + //! @throws cuda_error if the device switch fails + explicit __ensure_current_device(stream_ref stream) + { + auto ctx = detail::driver::streamGetCtx(stream.get()); + detail::driver::ctxPush(ctx); + } + + __ensure_current_device(__ensure_current_device&&) = delete; + __ensure_current_device(__ensure_current_device const&) = delete; + __ensure_current_device& operator=(__ensure_current_device&&) = delete; + __ensure_current_device& operator=(__ensure_current_device const&) = delete; + + //! @brief Destroy the `__ensure_current_device` object and switch back to the original + //! device. + //! + //! @throws cuda_error if the device switch fails. If the destructor is called + //! during stack unwinding, the program is automatically terminated. 
+ ~__ensure_current_device() noexcept(false) + { + // TODO would it make sense to assert here that we pushed and popped the same thing? + detail::driver::ctxPop(); + } +}; +} // namespace cuda::experimental +#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CUDAX__UTILITY_ENSURE_CURRENT_DEVICE diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index bb8a7d7c54..4752f8b964 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -29,6 +29,7 @@ function(cudax_add_catch2_test target_name_var test_name cn_target) # ARGN=test target_link_libraries(${test_target} PRIVATE ${cn_target} Catch2::Catch2 catch2_main) target_link_libraries(${test_target} PRIVATE ${cn_target} cudax::Thrust) target_compile_options(${test_target} PRIVATE "-DLIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE") + target_compile_options(${test_target} PRIVATE "-DLIBCUDACXX_ENABLE_EXCEPTIONS") target_compile_options(${test_target} PRIVATE $<$:--extended-lambda>) cudax_clone_target_properties(${test_target} ${cn_target}) set_target_properties(${test_target} PROPERTIES @@ -80,6 +81,7 @@ foreach(cn_target IN LISTS cudax_TARGETS) cudax_add_catch2_test(test_target misc_tests ${cn_target} utility/driver_api.cu + utility/ensure_current_device.cu ) cudax_add_catch2_test(test_target containers ${cn_target} diff --git a/cudax/test/common/utility.cuh b/cudax/test/common/utility.cuh index 2d7254c069..64a54e1b48 100644 --- a/cudax/test/common/utility.cuh +++ b/cudax/test/common/utility.cuh @@ -137,6 +137,11 @@ struct spin_until_80 } }; +struct empty_kernel +{ + __device__ void operator()() const noexcept {} +}; + /// A kernel that takes a callable object and invokes it with a set of arguments template __global__ void invokernel(Fn fn, Args... args) @@ -144,5 +149,28 @@ __global__ void invokernel(Fn fn, Args... 
args) fn(args...); } +inline int count_driver_stack() +{ + if (cudax::detail::driver::ctxGetCurrent() != nullptr) + { + auto ctx = cudax::detail::driver::ctxPop(); + auto result = 1 + count_driver_stack(); + cudax::detail::driver::ctxPush(ctx); + return result; + } + else + { + return 0; + } +} + +inline void empty_driver_stack() +{ + while (cudax::detail::driver::ctxGetCurrent() != nullptr) + { + cudax::detail::driver::ctxPop(); + } +} + } // namespace test } // namespace diff --git a/cudax/test/device/device_smoke.cu b/cudax/test/device/device_smoke.cu index 86c9625e21..6f772de08a 100644 --- a/cudax/test/device/device_smoke.cu +++ b/cudax/test/device/device_smoke.cu @@ -8,7 +8,6 @@ // //===----------------------------------------------------------------------===// -#define LIBCUDACXX_ENABLE_EXCEPTIONS #include #include "../hierarchy/testing_common.cuh" @@ -260,9 +259,9 @@ TEST_CASE("global devices vector", "[device]") CUDAX_REQUIRE(1 == std::next(cudax::devices.begin())->get()); CUDAX_REQUIRE(1 == cudax::devices.begin()[1].get()); - CUDAX_REQUIRE(0 == (*std::prev(cudax::devices.end())).get()); - CUDAX_REQUIRE(0 == std::prev(cudax::devices.end())->get()); - CUDAX_REQUIRE(0 == cudax::devices.end()[-1].get()); + CUDAX_REQUIRE(cudax::devices.size() - 1 == (*std::prev(cudax::devices.end())).get()); + CUDAX_REQUIRE(cudax::devices.size() - 1 == std::prev(cudax::devices.end())->get()); + CUDAX_REQUIRE(cudax::devices.size() - 1 == cudax::devices.end()[-1].get()); } try diff --git a/cudax/test/launch/configuration.cu b/cudax/test/launch/configuration.cu index a47eea2590..9e7f98df1b 100644 --- a/cudax/test/launch/configuration.cu +++ b/cudax/test/launch/configuration.cu @@ -8,7 +8,6 @@ // //===----------------------------------------------------------------------===// -#define LIBCUDACXX_ENABLE_EXCEPTIONS // Test translation of launch function arguments to cudaLaunchConfig_t sent to cudaLaunchKernelEx internally // We replace cudaLaunchKernelEx with a test function here through a macro to intercept the cudaLaunchConfig_t #define cudaLaunchKernelEx cudaLaunchKernelExTestReplacement diff --git a/cudax/test/launch/launch_smoke.cu b/cudax/test/launch/launch_smoke.cu index 554cabd015..810e65c390 100644 --- a/cudax/test/launch/launch_smoke.cu +++ b/cudax/test/launch/launch_smoke.cu @@ -7,7 +7,6 @@ // SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
// //===----------------------------------------------------------------------===// -#define LIBCUDACXX_ENABLE_EXCEPTIONS #include #include diff --git a/cudax/test/stream/get_stream.cu b/cudax/test/stream/get_stream.cu index 277a10246a..0654c3be39 100644 --- a/cudax/test/stream/get_stream.cu +++ b/cudax/test/stream/get_stream.cu @@ -8,7 +8,6 @@ // //===----------------------------------------------------------------------===// -#define LIBCUDACXX_ENABLE_EXCEPTIONS #include #include "../common/utility.cuh" diff --git a/cudax/test/stream/stream_smoke.cu b/cudax/test/stream/stream_smoke.cu index e6b86ccf16..cbee352080 100644 --- a/cudax/test/stream/stream_smoke.cu +++ b/cudax/test/stream/stream_smoke.cu @@ -8,7 +8,6 @@ // //===----------------------------------------------------------------------===// -#define LIBCUDACXX_ENABLE_EXCEPTIONS #include #include diff --git a/cudax/test/utility/driver_api.cu b/cudax/test/utility/driver_api.cu index 513d6476eb..e5fd64d14f 100644 --- a/cudax/test/utility/driver_api.cu +++ b/cudax/test/utility/driver_api.cu @@ -7,14 +7,14 @@ // SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// -#define LIBCUDACXX_ENABLE_EXCEPTIONS #include #include "../hierarchy/testing_common.cuh" -TEST_CASE("Call each one", "[driver api]") +TEST_CASE("Call each driver api", "[utility]") { + namespace driver = cuda::experimental::detail::driver; cudaStream_t stream; // Assumes the ctx stack was empty or had one ctx, should be the case unless some other // test leaves 2+ ctxs on the stack @@ -22,23 +22,48 @@ TEST_CASE("Call each one", "[driver api]") // Pushes the primary context if the stack is empty CUDART(cudaStreamCreate(&stream)); - auto ctx = cuda::experimental::detail::driver::ctxGetCurrent(); + auto ctx = driver::ctxGetCurrent(); CUDAX_REQUIRE(ctx != nullptr); - cuda::experimental::detail::driver::ctxPop(); - CUDAX_REQUIRE(cuda::experimental::detail::driver::ctxGetCurrent() == nullptr); + // Confirm pop will leave the stack empty + driver::ctxPop(); + CUDAX_REQUIRE(driver::ctxGetCurrent() == nullptr); - cuda::experimental::detail::driver::ctxPush(ctx); - CUDAX_REQUIRE(cuda::experimental::detail::driver::ctxGetCurrent() == ctx); + // Confirm we can push multiple times + driver::ctxPush(ctx); + CUDAX_REQUIRE(driver::ctxGetCurrent() == ctx); - cuda::experimental::detail::driver::ctxPush(ctx); - CUDAX_REQUIRE(cuda::experimental::detail::driver::ctxGetCurrent() == ctx); + driver::ctxPush(ctx); + CUDAX_REQUIRE(driver::ctxGetCurrent() == ctx); - cuda::experimental::detail::driver::ctxPop(); - CUDAX_REQUIRE(cuda::experimental::detail::driver::ctxGetCurrent() == ctx); + driver::ctxPop(); + CUDAX_REQUIRE(driver::ctxGetCurrent() == ctx); - auto stream_ctx = cuda::experimental::detail::driver::streamGetCtx(stream); + // Confirm stream ctx match + auto stream_ctx = driver::streamGetCtx(stream); CUDAX_REQUIRE(ctx == stream_ctx); CUDART(cudaStreamDestroy(stream)); + + CUDAX_REQUIRE(driver::deviceGet(0) == 0); + + // Confirm we can retain the primary ctx that cudart retained first + auto primary_ctx = driver::primaryCtxRetain(0); + CUDAX_REQUIRE(ctx == primary_ctx); + + driver::ctxPop(); + CUDAX_REQUIRE(driver::ctxGetCurrent() == nullptr); + + CUDAX_REQUIRE(driver::isPrimaryCtxActive(0)); + // Confirm we can reset the primary context with double release + driver::primaryCtxRelease(0); + driver::primaryCtxRelease(0); + + CUDAX_REQUIRE(!driver::isPrimaryCtxActive(0)); + + // Confirm 
cudart can recover + CUDART(cudaStreamCreate(&stream)); + CUDAX_REQUIRE(driver::ctxGetCurrent() == ctx); + + CUDART(driver::streamDestroy(stream)); } diff --git a/cudax/test/utility/ensure_current_device.cu b/cudax/test/utility/ensure_current_device.cu new file mode 100644 index 0000000000..89efc7d4f6 --- /dev/null +++ b/cudax/test/utility/ensure_current_device.cu @@ -0,0 +1,135 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include "../common/utility.cuh" + +namespace driver = cuda::experimental::detail::driver; + +void recursive_check_device_setter(int id) +{ + int cudart_id; + cudax::__ensure_current_device setter(cudax::device_ref{id}); + CUDAX_REQUIRE(test::count_driver_stack() == cudax::devices.size() - id); + auto ctx = driver::ctxGetCurrent(); + CUDART(cudaGetDevice(&cudart_id)); + CUDAX_REQUIRE(cudart_id == id); + + if (id != 0) + { + recursive_check_device_setter(id - 1); + + CUDAX_REQUIRE(test::count_driver_stack() == cudax::devices.size() - id); + CUDAX_REQUIRE(ctx == driver::ctxGetCurrent()); + CUDART(cudaGetDevice(&cudart_id)); + CUDAX_REQUIRE(cudart_id == id); + } +} + +TEST_CASE("ensure current device", "[device]") +{ + test::empty_driver_stack(); + // If possible use something different than CUDART default 0 + int target_device = static_cast(cudax::devices.size() - 1); + int dev_id = 0; + + SECTION("device setter") + { + recursive_check_device_setter(target_device); + + CUDAX_REQUIRE(test::count_driver_stack() == 0); + } + + SECTION("stream interactions with driver stack") + { + { + cudax::stream stream(target_device); + CUDAX_REQUIRE(test::count_driver_stack() == 0); + { + cudax::__ensure_current_device setter(cudax::device_ref{target_device}); + CUDAX_REQUIRE(driver::ctxGetCurrent() == driver::streamGetCtx(stream.get())); + } + { + auto ev = stream.record_event(); + CUDAX_REQUIRE(test::count_driver_stack() == 0); + } + CUDAX_REQUIRE(test::count_driver_stack() == 0); + { + auto ev = stream.record_timed_event(); + CUDAX_REQUIRE(test::count_driver_stack() == 0); + } + { + auto lambda = [&](int dev_id) { + cudax::stream another_stream(dev_id); + CUDAX_REQUIRE(test::count_driver_stack() == 0); + stream.wait(another_stream); + CUDAX_REQUIRE(test::count_driver_stack() == 0); + another_stream.wait(stream); + CUDAX_REQUIRE(test::count_driver_stack() == 0); + }; + lambda(target_device); + if (cudax::devices.size() > 1) + { + lambda(0); + } + } + + cudax::__ensure_current_device setter(stream); + CUDAX_REQUIRE(test::count_driver_stack() == 1); + CUDART(cudaGetDevice(&dev_id)); + CUDAX_REQUIRE(dev_id == target_device); + CUDAX_REQUIRE(driver::ctxGetCurrent() == driver::streamGetCtx(stream.get())); + } + + CHECK(test::count_driver_stack() == 0); + + { + // Check NULL stream ref is handled ok + cudax::__ensure_current_device setter1(cudax::device_ref{target_device}); + cudaStream_t null_stream = nullptr; + auto ref = cuda::stream_ref(null_stream); + auto ctx = driver::ctxGetCurrent(); + CUDAX_REQUIRE(test::count_driver_stack() == 1); + + cudax::__ensure_current_device setter2(ref); + 
CUDAX_REQUIRE(test::count_driver_stack() == 2); + CUDAX_REQUIRE(ctx == driver::ctxGetCurrent()); + CUDART(cudaGetDevice(&dev_id)); + CUDAX_REQUIRE(dev_id == target_device); + } + } + + SECTION("event interactions with driver stack") + { + { + cudax::stream stream(target_device); + CUDAX_REQUIRE(test::count_driver_stack() == 0); + + cudax::event event(stream); + CUDAX_REQUIRE(test::count_driver_stack() == 0); + + event.record(stream); + CUDAX_REQUIRE(test::count_driver_stack() == 0); + } + CUDAX_REQUIRE(test::count_driver_stack() == 0); + } + + SECTION("launch interactions with driver stack") + { + cudax::stream stream(target_device); + CUDAX_REQUIRE(test::count_driver_stack() == 0); + cudax::launch(stream, cudax::make_hierarchy(cudax::block_dims<1>(), cudax::grid_dims<1>()), test::empty_kernel{}); + CUDAX_REQUIRE(test::count_driver_stack() == 0); + } +} From 1b6dbd40509f96e16e9f34749bbbf9068f9ea9e1 Mon Sep 17 00:00:00 2001 From: Bryce Adelstein Lelbach aka wash Date: Tue, 6 Aug 2024 12:56:41 -0400 Subject: [PATCH 04/33] Fix singular vs plural typo in thread scope documentation. (#2198) * Fix singular vs plural typo in thread scope documentation. * Better grammar fix. --- docs/libcudacxx/extended_api/memory_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/libcudacxx/extended_api/memory_model.rst b/docs/libcudacxx/extended_api/memory_model.rst index ff9f9ef44c..dfb6ed6789 100644 --- a/docs/libcudacxx/extended_api/memory_model.rst +++ b/docs/libcudacxx/extended_api/memory_model.rst @@ -17,7 +17,7 @@ semantics of standard C++ by default. Thread Scopes ------------- -A **thread scope** specifies the kind of threads that can synchronize with each other using synchronization primitive such +A **thread scope** specifies the kind of threads that can synchronize with each other using a synchronization primitive such as :ref:`atomic ` or :ref:`barrier `. 
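As a quick illustration of the thread-scope sentence fixed above, here is a minimal sketch. It is not taken from these patches; the kernel name, launch shape, and output buffer are made up for illustration, while the libcu++ <cuda/atomic> header, cuda::atomic, and cuda::thread_scope_block are the real primitives the documentation refers to.

#include <cuda/atomic>

// Each thread of a block bumps a counter whose scope is the block; no wider
// scope is needed because only threads of this one block synchronize through it.
__global__ void block_scoped_count(int* out)
{
  __shared__ cuda::atomic<int, cuda::thread_scope_block> counter;
  if (threadIdx.x == 0)
  {
    counter = 0; // shared memory is uninitialized, so set the counter before use
  }
  __syncthreads();

  counter.fetch_add(1); // atomic with respect to the other threads of this block

  __syncthreads();
  if (threadIdx.x == 0)
  {
    out[blockIdx.x] = counter.load(); // equals blockDim.x
  }
}

// Hypothetical launch: block_scoped_count<<<1, 128>>>(d_out);

A wider scope such as cuda::thread_scope_device would also be correct here, but it is stronger than this kernel needs; picking the narrowest sufficient scope is exactly the choice the thread-scope documentation describes.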
From 2db4fa7232e3250bdd0539a2afee4f1d32a7ab30 Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Tue, 6 Aug 2024 11:13:17 -0700 Subject: [PATCH 05/33] [CUDAX] fixing some minor issues with device attribute queries (#2183) * [cudax] give the `cudaDevAttrMemoryPoolSupportedHandleTypes` attribute the correct type * move attribute definitions from `device_ref` to `device` --- .../cuda/experimental/__device/attributes.cuh | 414 ++++++++++-------- .../cuda/experimental/__device/device.cuh | 17 + .../cuda/experimental/__device/device_ref.cuh | 48 +- cudax/test/device/device_smoke.cu | 26 +- 4 files changed, 274 insertions(+), 231 deletions(-) diff --git a/cudax/include/cuda/experimental/__device/attributes.cuh b/cudax/include/cuda/experimental/__device/attributes.cuh index 1c02cc19c9..5a873f6ebb 100644 --- a/cudax/include/cuda/experimental/__device/attributes.cuh +++ b/cudax/include/cuda/experimental/__device/attributes.cuh @@ -24,668 +24,694 @@ #include #include -#include +#include namespace cuda::experimental { namespace detail { +template <::cudaDeviceAttr _Attr> +struct __dev_attr +{ + using type = int; + + _CCCL_NODISCARD constexpr operator ::cudaDeviceAttr() const noexcept + { + return _Attr; + } + + _CCCL_NODISCARD type operator()(device_ref __dev_id) const + { + return __dev_id.attr<_Attr>(); + } +}; + template <::cudaDeviceAttr _Attr, typename _Type> -struct __attr_with_type +struct __dev_attr_with_type { using type = _Type; - constexpr operator ::cudaDeviceAttr() const noexcept + _CCCL_NODISCARD constexpr operator ::cudaDeviceAttr() const noexcept { return _Attr; } - _CCCL_NODISCARD type operator()(device_ref __dev) const + _CCCL_NODISCARD type operator()(device_ref __dev_id) const { - return __dev.attr<_Attr>(); + return __dev_id.attr<_Attr>(); } }; -} // namespace detail // TODO: give this a strong type for kilohertz template <> -struct device_ref::__attr<::cudaDevAttrClockRate> // - : detail::__attr_with_type<::cudaDevAttrClockRate, int> +struct __dev_attr<::cudaDevAttrClockRate> // + : __dev_attr_with_type<::cudaDevAttrClockRate, int> {}; template <> -struct device_ref::__attr<::cudaDevAttrGpuOverlap> // - : detail::__attr_with_type<::cudaDevAttrGpuOverlap, bool> +struct __dev_attr<::cudaDevAttrGpuOverlap> // + : __dev_attr_with_type<::cudaDevAttrGpuOverlap, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrKernelExecTimeout> // - : detail::__attr_with_type<::cudaDevAttrKernelExecTimeout, bool> +struct __dev_attr<::cudaDevAttrKernelExecTimeout> // + : __dev_attr_with_type<::cudaDevAttrKernelExecTimeout, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrIntegrated> // - : detail::__attr_with_type<::cudaDevAttrIntegrated, bool> +struct __dev_attr<::cudaDevAttrIntegrated> // + : __dev_attr_with_type<::cudaDevAttrIntegrated, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrCanMapHostMemory> // - : detail::__attr_with_type<::cudaDevAttrCanMapHostMemory, bool> +struct __dev_attr<::cudaDevAttrCanMapHostMemory> // + : __dev_attr_with_type<::cudaDevAttrCanMapHostMemory, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrComputeMode> // - : detail::__attr_with_type<::cudaDevAttrComputeMode, ::cudaComputeMode> +struct __dev_attr<::cudaDevAttrComputeMode> // + : __dev_attr_with_type<::cudaDevAttrComputeMode, ::cudaComputeMode> { static constexpr type default_mode = cudaComputeModeDefault; static constexpr type prohibited_mode = cudaComputeModeProhibited; static constexpr type exclusive_process_mode = cudaComputeModeExclusiveProcess; }; template <> 
-struct device_ref::__attr<::cudaDevAttrConcurrentKernels> // - : detail::__attr_with_type<::cudaDevAttrConcurrentKernels, bool> +struct __dev_attr<::cudaDevAttrConcurrentKernels> // + : __dev_attr_with_type<::cudaDevAttrConcurrentKernels, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrEccEnabled> // - : detail::__attr_with_type<::cudaDevAttrEccEnabled, bool> +struct __dev_attr<::cudaDevAttrEccEnabled> // + : __dev_attr_with_type<::cudaDevAttrEccEnabled, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrTccDriver> // - : detail::__attr_with_type<::cudaDevAttrTccDriver, bool> +struct __dev_attr<::cudaDevAttrTccDriver> // + : __dev_attr_with_type<::cudaDevAttrTccDriver, bool> {}; // TODO: give this a strong type for kilohertz template <> -struct device_ref::__attr<::cudaDevAttrMemoryClockRate> // - : detail::__attr_with_type<::cudaDevAttrMemoryClockRate, int> +struct __dev_attr<::cudaDevAttrMemoryClockRate> // + : __dev_attr_with_type<::cudaDevAttrMemoryClockRate, int> {}; // TODO: give this a strong type for bits template <> -struct device_ref::__attr<::cudaDevAttrGlobalMemoryBusWidth> // - : detail::__attr_with_type<::cudaDevAttrGlobalMemoryBusWidth, int> +struct __dev_attr<::cudaDevAttrGlobalMemoryBusWidth> // + : __dev_attr_with_type<::cudaDevAttrGlobalMemoryBusWidth, int> {}; // TODO: give this a strong type for bytes template <> -struct device_ref::__attr<::cudaDevAttrL2CacheSize> // - : detail::__attr_with_type<::cudaDevAttrL2CacheSize, int> +struct __dev_attr<::cudaDevAttrL2CacheSize> // + : __dev_attr_with_type<::cudaDevAttrL2CacheSize, int> {}; template <> -struct device_ref::__attr<::cudaDevAttrUnifiedAddressing> // - : detail::__attr_with_type<::cudaDevAttrUnifiedAddressing, bool> +struct __dev_attr<::cudaDevAttrUnifiedAddressing> // + : __dev_attr_with_type<::cudaDevAttrUnifiedAddressing, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrStreamPrioritiesSupported> // - : detail::__attr_with_type<::cudaDevAttrStreamPrioritiesSupported, bool> +struct __dev_attr<::cudaDevAttrStreamPrioritiesSupported> // + : __dev_attr_with_type<::cudaDevAttrStreamPrioritiesSupported, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrGlobalL1CacheSupported> // - : detail::__attr_with_type<::cudaDevAttrGlobalL1CacheSupported, bool> +struct __dev_attr<::cudaDevAttrGlobalL1CacheSupported> // + : __dev_attr_with_type<::cudaDevAttrGlobalL1CacheSupported, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrLocalL1CacheSupported> // - : detail::__attr_with_type<::cudaDevAttrLocalL1CacheSupported, bool> +struct __dev_attr<::cudaDevAttrLocalL1CacheSupported> // + : __dev_attr_with_type<::cudaDevAttrLocalL1CacheSupported, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrManagedMemory> // - : detail::__attr_with_type<::cudaDevAttrManagedMemory, bool> +struct __dev_attr<::cudaDevAttrManagedMemory> // + : __dev_attr_with_type<::cudaDevAttrManagedMemory, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrIsMultiGpuBoard> // - : detail::__attr_with_type<::cudaDevAttrIsMultiGpuBoard, bool> +struct __dev_attr<::cudaDevAttrIsMultiGpuBoard> // + : __dev_attr_with_type<::cudaDevAttrIsMultiGpuBoard, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrHostNativeAtomicSupported> // - : detail::__attr_with_type<::cudaDevAttrHostNativeAtomicSupported, bool> +struct __dev_attr<::cudaDevAttrHostNativeAtomicSupported> // + : __dev_attr_with_type<::cudaDevAttrHostNativeAtomicSupported, bool> {}; template <> -struct 
device_ref::__attr<::cudaDevAttrPageableMemoryAccess> // - : detail::__attr_with_type<::cudaDevAttrPageableMemoryAccess, bool> +struct __dev_attr<::cudaDevAttrPageableMemoryAccess> // + : __dev_attr_with_type<::cudaDevAttrPageableMemoryAccess, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrConcurrentManagedAccess> // - : detail::__attr_with_type<::cudaDevAttrConcurrentManagedAccess, bool> +struct __dev_attr<::cudaDevAttrConcurrentManagedAccess> // + : __dev_attr_with_type<::cudaDevAttrConcurrentManagedAccess, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrComputePreemptionSupported> // - : detail::__attr_with_type<::cudaDevAttrComputePreemptionSupported, bool> +struct __dev_attr<::cudaDevAttrComputePreemptionSupported> // + : __dev_attr_with_type<::cudaDevAttrComputePreemptionSupported, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrCanUseHostPointerForRegisteredMem> // - : detail::__attr_with_type<::cudaDevAttrCanUseHostPointerForRegisteredMem, bool> +struct __dev_attr<::cudaDevAttrCanUseHostPointerForRegisteredMem> // + : __dev_attr_with_type<::cudaDevAttrCanUseHostPointerForRegisteredMem, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrCooperativeLaunch> // - : detail::__attr_with_type<::cudaDevAttrCooperativeLaunch, bool> +struct __dev_attr<::cudaDevAttrCooperativeLaunch> // + : __dev_attr_with_type<::cudaDevAttrCooperativeLaunch, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrCooperativeMultiDeviceLaunch> // - : detail::__attr_with_type<::cudaDevAttrCooperativeMultiDeviceLaunch, bool> +struct __dev_attr<::cudaDevAttrCooperativeMultiDeviceLaunch> // + : __dev_attr_with_type<::cudaDevAttrCooperativeMultiDeviceLaunch, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrCanFlushRemoteWrites> // - : detail::__attr_with_type<::cudaDevAttrCanFlushRemoteWrites, bool> +struct __dev_attr<::cudaDevAttrCanFlushRemoteWrites> // + : __dev_attr_with_type<::cudaDevAttrCanFlushRemoteWrites, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrHostRegisterSupported> // - : detail::__attr_with_type<::cudaDevAttrHostRegisterSupported, bool> +struct __dev_attr<::cudaDevAttrHostRegisterSupported> // + : __dev_attr_with_type<::cudaDevAttrHostRegisterSupported, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrDirectManagedMemAccessFromHost> // - : detail::__attr_with_type<::cudaDevAttrDirectManagedMemAccessFromHost, bool> +struct __dev_attr<::cudaDevAttrDirectManagedMemAccessFromHost> // + : __dev_attr_with_type<::cudaDevAttrDirectManagedMemAccessFromHost, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrSparseCudaArraySupported> // - : detail::__attr_with_type<::cudaDevAttrSparseCudaArraySupported, bool> +struct __dev_attr<::cudaDevAttrSparseCudaArraySupported> // + : __dev_attr_with_type<::cudaDevAttrSparseCudaArraySupported, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrMemoryPoolsSupported> // - : detail::__attr_with_type<::cudaDevAttrMemoryPoolsSupported, bool> +struct __dev_attr<::cudaDevAttrMemoryPoolsSupported> // + : __dev_attr_with_type<::cudaDevAttrMemoryPoolsSupported, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrGPUDirectRDMASupported> // - : detail::__attr_with_type<::cudaDevAttrGPUDirectRDMASupported, bool> +struct __dev_attr<::cudaDevAttrGPUDirectRDMASupported> // + : __dev_attr_with_type<::cudaDevAttrGPUDirectRDMASupported, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrDeferredMappingCudaArraySupported> // - : 
detail::__attr_with_type<::cudaDevAttrDeferredMappingCudaArraySupported, bool> +struct __dev_attr<::cudaDevAttrDeferredMappingCudaArraySupported> // + : __dev_attr_with_type<::cudaDevAttrDeferredMappingCudaArraySupported, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrIpcEventSupport> // - : detail::__attr_with_type<::cudaDevAttrIpcEventSupport, bool> +struct __dev_attr<::cudaDevAttrIpcEventSupport> // + : __dev_attr_with_type<::cudaDevAttrIpcEventSupport, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrPageableMemoryAccessUsesHostPageTables> - : detail::__attr_with_type<::cudaDevAttrPageableMemoryAccessUsesHostPageTables, bool> +struct __dev_attr<::cudaDevAttrPageableMemoryAccessUsesHostPageTables> + : __dev_attr_with_type<::cudaDevAttrPageableMemoryAccessUsesHostPageTables, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrHostRegisterReadOnlySupported> // - : detail::__attr_with_type<::cudaDevAttrHostRegisterReadOnlySupported, bool> +struct __dev_attr<::cudaDevAttrHostRegisterReadOnlySupported> // + : __dev_attr_with_type<::cudaDevAttrHostRegisterReadOnlySupported, bool> {}; template <> -struct device_ref::__attr<::cudaDevAttrGPUDirectRDMAFlushWritesOptions> // - : detail::__attr_with_type<::cudaDevAttrGPUDirectRDMAFlushWritesOptions, ::cudaFlushGPUDirectRDMAWritesOptions> +struct __dev_attr<::cudaDevAttrGPUDirectRDMAFlushWritesOptions> // + : __dev_attr_with_type<::cudaDevAttrGPUDirectRDMAFlushWritesOptions, ::cudaFlushGPUDirectRDMAWritesOptions> { static constexpr type host = ::cudaFlushGPUDirectRDMAWritesOptionHost; static constexpr type mem_ops = ::cudaFlushGPUDirectRDMAWritesOptionMemOps; }; template <> -struct device_ref::__attr<::cudaDevAttrGPUDirectRDMAWritesOrdering> // - : detail::__attr_with_type<::cudaDevAttrGPUDirectRDMAWritesOrdering, ::cudaGPUDirectRDMAWritesOrdering> +struct __dev_attr<::cudaDevAttrGPUDirectRDMAWritesOrdering> // + : __dev_attr_with_type<::cudaDevAttrGPUDirectRDMAWritesOrdering, ::cudaGPUDirectRDMAWritesOrdering> { static constexpr type none = ::cudaGPUDirectRDMAWritesOrderingNone; static constexpr type owner = ::cudaGPUDirectRDMAWritesOrderingOwner; static constexpr type all_devices = ::cudaGPUDirectRDMAWritesOrderingAllDevices; }; -// TODO: This is a bitmask. What are the possible values? 
template <> -struct device_ref::__attr<::cudaDevAttrMemoryPoolSupportedHandleTypes> // - : detail::__attr_with_type<::cudaDevAttrMemoryPoolSupportedHandleTypes, unsigned int> -{}; +struct __dev_attr<::cudaDevAttrMemoryPoolSupportedHandleTypes> // + : __dev_attr_with_type<::cudaDevAttrMemoryPoolSupportedHandleTypes, ::cudaMemAllocationHandleType> +{ + static constexpr type none = ::cudaMemHandleTypeNone; + static constexpr type posix_file_descriptor = ::cudaMemHandleTypePosixFileDescriptor; + static constexpr type win32 = ::cudaMemHandleTypeWin32; + static constexpr type win32_kmt = ::cudaMemHandleTypeWin32Kmt; +#if CUDART_VERSION >= 12040 + static constexpr type fabric = ::cudaMemHandleTypeFabric; +#else + static constexpr type fabric = static_cast<::cudaMemAllocationHandleType>(0x8); +#endif +}; #if CUDART_VERSION >= 12020 template <> -struct device_ref::__attr<::cudaDevAttrNumaConfig> // - : detail::__attr_with_type<::cudaDevAttrNumaConfig, ::cudaDeviceNumaConfig> +struct __dev_attr<::cudaDevAttrNumaConfig> // + : __dev_attr_with_type<::cudaDevAttrNumaConfig, ::cudaDeviceNumaConfig> { static constexpr type none = ::cudaDeviceNumaConfigNone; static constexpr type numa_node = ::cudaDeviceNumaConfigNumaNode; }; #endif +} // namespace detail -struct device_ref::attrs +struct device::attrs { // Maximum number of threads per block - using max_threads_per_block_t = __attr<::cudaDevAttrMaxThreadsPerBlock>; + using max_threads_per_block_t = detail::__dev_attr<::cudaDevAttrMaxThreadsPerBlock>; static constexpr max_threads_per_block_t max_threads_per_block{}; // Maximum x-dimension of a block - using max_block_dim_x_t = __attr<::cudaDevAttrMaxBlockDimX>; + using max_block_dim_x_t = detail::__dev_attr<::cudaDevAttrMaxBlockDimX>; static constexpr max_block_dim_x_t max_block_dim_x{}; // Maximum y-dimension of a block - using max_block_dim_y_t = __attr<::cudaDevAttrMaxBlockDimY>; + using max_block_dim_y_t = detail::__dev_attr<::cudaDevAttrMaxBlockDimY>; static constexpr max_block_dim_y_t max_block_dim_y{}; // Maximum z-dimension of a block - using max_block_dim_z_t = __attr<::cudaDevAttrMaxBlockDimZ>; + using max_block_dim_z_t = detail::__dev_attr<::cudaDevAttrMaxBlockDimZ>; static constexpr max_block_dim_z_t max_block_dim_z{}; // Maximum x-dimension of a grid - using max_grid_dim_x_t = __attr<::cudaDevAttrMaxGridDimX>; + using max_grid_dim_x_t = detail::__dev_attr<::cudaDevAttrMaxGridDimX>; static constexpr max_grid_dim_x_t max_grid_dim_x{}; // Maximum y-dimension of a grid - using max_grid_dim_y_t = __attr<::cudaDevAttrMaxGridDimY>; + using max_grid_dim_y_t = detail::__dev_attr<::cudaDevAttrMaxGridDimY>; static constexpr max_grid_dim_y_t max_grid_dim_y{}; // Maximum z-dimension of a grid - using max_grid_dim_z_t = __attr<::cudaDevAttrMaxGridDimZ>; + using max_grid_dim_z_t = detail::__dev_attr<::cudaDevAttrMaxGridDimZ>; static constexpr max_grid_dim_z_t max_grid_dim_z{}; // Maximum amount of shared memory available to a thread block in bytes - using max_shared_memory_per_block_t = __attr<::cudaDevAttrMaxSharedMemoryPerBlock>; + using max_shared_memory_per_block_t = detail::__dev_attr<::cudaDevAttrMaxSharedMemoryPerBlock>; static constexpr max_shared_memory_per_block_t max_shared_memory_per_block{}; // Memory available on device for __constant__ variables in a CUDA C kernel in bytes - using total_constant_memory_t = __attr<::cudaDevAttrTotalConstantMemory>; + using total_constant_memory_t = detail::__dev_attr<::cudaDevAttrTotalConstantMemory>; static constexpr total_constant_memory_t 
total_constant_memory{}; // Warp size in threads - using warp_size_t = __attr<::cudaDevAttrWarpSize>; + using warp_size_t = detail::__dev_attr<::cudaDevAttrWarpSize>; static constexpr warp_size_t warp_size{}; // Maximum pitch in bytes allowed by the memory copy functions that involve // memory regions allocated through cudaMallocPitch() - using max_pitch_t = __attr<::cudaDevAttrMaxPitch>; + using max_pitch_t = detail::__dev_attr<::cudaDevAttrMaxPitch>; static constexpr max_pitch_t max_pitch{}; // Maximum 1D texture width - using max_texture_1d_width_t = __attr<::cudaDevAttrMaxTexture1DWidth>; + using max_texture_1d_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture1DWidth>; static constexpr max_texture_1d_width_t max_texture_1d_width{}; // Maximum width for a 1D texture bound to linear memory - using max_texture_1d_linear_width_t = __attr<::cudaDevAttrMaxTexture1DLinearWidth>; + using max_texture_1d_linear_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture1DLinearWidth>; static constexpr max_texture_1d_linear_width_t max_texture_1d_linear_width{}; // Maximum mipmapped 1D texture width - using max_texture_1d_mipmapped_width_t = __attr<::cudaDevAttrMaxTexture1DMipmappedWidth>; + using max_texture_1d_mipmapped_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture1DMipmappedWidth>; static constexpr max_texture_1d_mipmapped_width_t max_texture_1d_mipmapped_width{}; // Maximum 2D texture width - using max_texture_2d_width_t = __attr<::cudaDevAttrMaxTexture2DWidth>; + using max_texture_2d_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DWidth>; static constexpr max_texture_2d_width_t max_texture_2d_width{}; // Maximum 2D texture height - using max_texture_2d_height_t = __attr<::cudaDevAttrMaxTexture2DHeight>; + using max_texture_2d_height_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DHeight>; static constexpr max_texture_2d_height_t max_texture_2d_height{}; // Maximum width for a 2D texture bound to linear memory - using max_texture_2d_linear_width_t = __attr<::cudaDevAttrMaxTexture2DLinearWidth>; + using max_texture_2d_linear_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DLinearWidth>; static constexpr max_texture_2d_linear_width_t max_texture_2d_linear_width{}; // Maximum height for a 2D texture bound to linear memory - using max_texture_2d_linear_height_t = __attr<::cudaDevAttrMaxTexture2DLinearHeight>; + using max_texture_2d_linear_height_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DLinearHeight>; static constexpr max_texture_2d_linear_height_t max_texture_2d_linear_height{}; // Maximum pitch in bytes for a 2D texture bound to linear memory - using max_texture_2d_linear_pitch_t = __attr<::cudaDevAttrMaxTexture2DLinearPitch>; + using max_texture_2d_linear_pitch_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DLinearPitch>; static constexpr max_texture_2d_linear_pitch_t max_texture_2d_linear_pitch{}; // Maximum mipmapped 2D texture width - using max_texture_2d_mipmapped_width_t = __attr<::cudaDevAttrMaxTexture2DMipmappedWidth>; + using max_texture_2d_mipmapped_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DMipmappedWidth>; static constexpr max_texture_2d_mipmapped_width_t max_texture_2d_mipmapped_width{}; // Maximum mipmapped 2D texture height - using max_texture_2d_mipmapped_height_t = __attr<::cudaDevAttrMaxTexture2DMipmappedHeight>; + using max_texture_2d_mipmapped_height_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DMipmappedHeight>; static constexpr max_texture_2d_mipmapped_height_t max_texture_2d_mipmapped_height{}; // Maximum 3D texture width - using 
max_texture_3d_width_t = __attr<::cudaDevAttrMaxTexture3DWidth>; + using max_texture_3d_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture3DWidth>; static constexpr max_texture_3d_width_t max_texture_3d_width{}; // Maximum 3D texture height - using max_texture_3d_height_t = __attr<::cudaDevAttrMaxTexture3DHeight>; + using max_texture_3d_height_t = detail::__dev_attr<::cudaDevAttrMaxTexture3DHeight>; static constexpr max_texture_3d_height_t max_texture_3d_height{}; // Maximum 3D texture depth - using max_texture_3d_depth_t = __attr<::cudaDevAttrMaxTexture3DDepth>; + using max_texture_3d_depth_t = detail::__dev_attr<::cudaDevAttrMaxTexture3DDepth>; static constexpr max_texture_3d_depth_t max_texture_3d_depth{}; // Alternate maximum 3D texture width, 0 if no alternate maximum 3D texture size is supported - using max_texture_3d_width_alt_t = __attr<::cudaDevAttrMaxTexture3DWidthAlt>; + using max_texture_3d_width_alt_t = detail::__dev_attr<::cudaDevAttrMaxTexture3DWidthAlt>; static constexpr max_texture_3d_width_alt_t max_texture_3d_width_alt{}; // Alternate maximum 3D texture height, 0 if no alternate maximum 3D texture size is supported - using max_texture_3d_height_alt_t = __attr<::cudaDevAttrMaxTexture3DHeightAlt>; + using max_texture_3d_height_alt_t = detail::__dev_attr<::cudaDevAttrMaxTexture3DHeightAlt>; static constexpr max_texture_3d_height_alt_t max_texture_3d_height_alt{}; // Alternate maximum 3D texture depth, 0 if no alternate maximum 3D texture size is supported - using max_texture_3d_depth_alt_t = __attr<::cudaDevAttrMaxTexture3DDepthAlt>; + using max_texture_3d_depth_alt_t = detail::__dev_attr<::cudaDevAttrMaxTexture3DDepthAlt>; static constexpr max_texture_3d_depth_alt_t max_texture_3d_depth_alt{}; // Maximum cubemap texture width or height - using max_texture_cubemap_width_t = __attr<::cudaDevAttrMaxTextureCubemapWidth>; + using max_texture_cubemap_width_t = detail::__dev_attr<::cudaDevAttrMaxTextureCubemapWidth>; static constexpr max_texture_cubemap_width_t max_texture_cubemap_width{}; // Maximum 1D layered texture width - using max_texture_1d_layered_width_t = __attr<::cudaDevAttrMaxTexture1DLayeredWidth>; + using max_texture_1d_layered_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture1DLayeredWidth>; static constexpr max_texture_1d_layered_width_t max_texture_1d_layered_width{}; // Maximum layers in a 1D layered texture - using max_texture_1d_layered_layers_t = __attr<::cudaDevAttrMaxTexture1DLayeredLayers>; + using max_texture_1d_layered_layers_t = detail::__dev_attr<::cudaDevAttrMaxTexture1DLayeredLayers>; static constexpr max_texture_1d_layered_layers_t max_texture_1d_layered_layers{}; // Maximum 2D layered texture width - using max_texture_2d_layered_width_t = __attr<::cudaDevAttrMaxTexture2DLayeredWidth>; + using max_texture_2d_layered_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DLayeredWidth>; static constexpr max_texture_2d_layered_width_t max_texture_2d_layered_width{}; // Maximum 2D layered texture height - using max_texture_2d_layered_height_t = __attr<::cudaDevAttrMaxTexture2DLayeredHeight>; + using max_texture_2d_layered_height_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DLayeredHeight>; static constexpr max_texture_2d_layered_height_t max_texture_2d_layered_height{}; // Maximum layers in a 2D layered texture - using max_texture_2d_layered_layers_t = __attr<::cudaDevAttrMaxTexture2DLayeredLayers>; + using max_texture_2d_layered_layers_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DLayeredLayers>; static constexpr max_texture_2d_layered_layers_t 
max_texture_2d_layered_layers{}; // Maximum cubemap layered texture width or height - using max_texture_cubemap_layered_width_t = __attr<::cudaDevAttrMaxTextureCubemapLayeredWidth>; + using max_texture_cubemap_layered_width_t = detail::__dev_attr<::cudaDevAttrMaxTextureCubemapLayeredWidth>; static constexpr max_texture_cubemap_layered_width_t max_texture_cubemap_layered_width{}; // Maximum layers in a cubemap layered texture - using max_texture_cubemap_layered_layers_t = __attr<::cudaDevAttrMaxTextureCubemapLayeredLayers>; + using max_texture_cubemap_layered_layers_t = detail::__dev_attr<::cudaDevAttrMaxTextureCubemapLayeredLayers>; static constexpr max_texture_cubemap_layered_layers_t max_texture_cubemap_layered_layers{}; // Maximum 1D surface width - using max_surface_1d_width_t = __attr<::cudaDevAttrMaxSurface1DWidth>; + using max_surface_1d_width_t = detail::__dev_attr<::cudaDevAttrMaxSurface1DWidth>; static constexpr max_surface_1d_width_t max_surface_1d_width{}; // Maximum 2D surface width - using max_surface_2d_width_t = __attr<::cudaDevAttrMaxSurface2DWidth>; + using max_surface_2d_width_t = detail::__dev_attr<::cudaDevAttrMaxSurface2DWidth>; static constexpr max_surface_2d_width_t max_surface_2d_width{}; // Maximum 2D surface height - using max_surface_2d_height_t = __attr<::cudaDevAttrMaxSurface2DHeight>; + using max_surface_2d_height_t = detail::__dev_attr<::cudaDevAttrMaxSurface2DHeight>; static constexpr max_surface_2d_height_t max_surface_2d_height{}; // Maximum 3D surface width - using max_surface_3d_width_t = __attr<::cudaDevAttrMaxSurface3DWidth>; + using max_surface_3d_width_t = detail::__dev_attr<::cudaDevAttrMaxSurface3DWidth>; static constexpr max_surface_3d_width_t max_surface_3d_width{}; // Maximum 3D surface height - using max_surface_3d_height_t = __attr<::cudaDevAttrMaxSurface3DHeight>; + using max_surface_3d_height_t = detail::__dev_attr<::cudaDevAttrMaxSurface3DHeight>; static constexpr max_surface_3d_height_t max_surface_3d_height{}; // Maximum 3D surface depth - using max_surface_3d_depth_t = __attr<::cudaDevAttrMaxSurface3DDepth>; + using max_surface_3d_depth_t = detail::__dev_attr<::cudaDevAttrMaxSurface3DDepth>; static constexpr max_surface_3d_depth_t max_surface_3d_depth{}; // Maximum 1D layered surface width - using max_surface_1d_layered_width_t = __attr<::cudaDevAttrMaxSurface1DLayeredWidth>; + using max_surface_1d_layered_width_t = detail::__dev_attr<::cudaDevAttrMaxSurface1DLayeredWidth>; static constexpr max_surface_1d_layered_width_t max_surface_1d_layered_width{}; // Maximum layers in a 1D layered surface - using max_surface_1d_layered_layers_t = __attr<::cudaDevAttrMaxSurface1DLayeredLayers>; + using max_surface_1d_layered_layers_t = detail::__dev_attr<::cudaDevAttrMaxSurface1DLayeredLayers>; static constexpr max_surface_1d_layered_layers_t max_surface_1d_layered_layers{}; // Maximum 2D layered surface width - using max_surface_2d_layered_width_t = __attr<::cudaDevAttrMaxSurface2DLayeredWidth>; + using max_surface_2d_layered_width_t = detail::__dev_attr<::cudaDevAttrMaxSurface2DLayeredWidth>; static constexpr max_surface_2d_layered_width_t max_surface_2d_layered_width{}; // Maximum 2D layered surface height - using max_surface_2d_layered_height_t = __attr<::cudaDevAttrMaxSurface2DLayeredHeight>; + using max_surface_2d_layered_height_t = detail::__dev_attr<::cudaDevAttrMaxSurface2DLayeredHeight>; static constexpr max_surface_2d_layered_height_t max_surface_2d_layered_height{}; // Maximum layers in a 2D layered surface - using 
max_surface_2d_layered_layers_t = __attr<::cudaDevAttrMaxSurface2DLayeredLayers>; + using max_surface_2d_layered_layers_t = detail::__dev_attr<::cudaDevAttrMaxSurface2DLayeredLayers>; static constexpr max_surface_2d_layered_layers_t max_surface_2d_layered_layers{}; // Maximum cubemap surface width - using max_surface_cubemap_width_t = __attr<::cudaDevAttrMaxSurfaceCubemapWidth>; + using max_surface_cubemap_width_t = detail::__dev_attr<::cudaDevAttrMaxSurfaceCubemapWidth>; static constexpr max_surface_cubemap_width_t max_surface_cubemap_width{}; // Maximum cubemap layered surface width - using max_surface_cubemap_layered_width_t = __attr<::cudaDevAttrMaxSurfaceCubemapLayeredWidth>; + using max_surface_cubemap_layered_width_t = detail::__dev_attr<::cudaDevAttrMaxSurfaceCubemapLayeredWidth>; static constexpr max_surface_cubemap_layered_width_t max_surface_cubemap_layered_width{}; // Maximum layers in a cubemap layered surface - using max_surface_cubemap_layered_layers_t = __attr<::cudaDevAttrMaxSurfaceCubemapLayeredLayers>; + using max_surface_cubemap_layered_layers_t = detail::__dev_attr<::cudaDevAttrMaxSurfaceCubemapLayeredLayers>; static constexpr max_surface_cubemap_layered_layers_t max_surface_cubemap_layered_layers{}; // Maximum number of 32-bit registers available to a thread block - using max_registers_per_block_t = __attr<::cudaDevAttrMaxRegistersPerBlock>; + using max_registers_per_block_t = detail::__dev_attr<::cudaDevAttrMaxRegistersPerBlock>; static constexpr max_registers_per_block_t max_registers_per_block{}; // Peak clock frequency in kilohertz - using clock_rate_t = __attr<::cudaDevAttrClockRate>; + using clock_rate_t = detail::__dev_attr<::cudaDevAttrClockRate>; static constexpr clock_rate_t clock_rate{}; // Alignment requirement; texture base addresses aligned to textureAlign bytes // do not need an offset applied to texture fetches - using texture_alignment_t = __attr<::cudaDevAttrTextureAlignment>; + using texture_alignment_t = detail::__dev_attr<::cudaDevAttrTextureAlignment>; static constexpr texture_alignment_t texture_alignment{}; // Pitch alignment requirement for 2D texture references bound to pitched memory - using texture_pitch_alignment_t = __attr<::cudaDevAttrTexturePitchAlignment>; + using texture_pitch_alignment_t = detail::__dev_attr<::cudaDevAttrTexturePitchAlignment>; static constexpr texture_pitch_alignment_t texture_pitch_alignment{}; // true if the device can concurrently copy memory between host and device // while executing a kernel, or false if not - using gpu_overlap_t = __attr<::cudaDevAttrGpuOverlap>; + using gpu_overlap_t = detail::__dev_attr<::cudaDevAttrGpuOverlap>; static constexpr gpu_overlap_t gpu_overlap{}; // Number of multiprocessors on the device - using multi_processor_count_t = __attr<::cudaDevAttrMultiProcessorCount>; + using multi_processor_count_t = detail::__dev_attr<::cudaDevAttrMultiProcessorCount>; static constexpr multi_processor_count_t multi_processor_count{}; // true if there is a run time limit for kernels executed on the device, or // false if not - using kernel_exec_timeout_t = __attr<::cudaDevAttrKernelExecTimeout>; + using kernel_exec_timeout_t = detail::__dev_attr<::cudaDevAttrKernelExecTimeout>; static constexpr kernel_exec_timeout_t kernel_exec_timeout{}; // true if the device is integrated with the memory subsystem, or false if not - using integrated_t = __attr<::cudaDevAttrIntegrated>; + using integrated_t = detail::__dev_attr<::cudaDevAttrIntegrated>; static constexpr integrated_t integrated{}; // true if the d - 
using can_map_host_memory_t = __attr<::cudaDevAttrCanMapHostMemory>; + using can_map_host_memory_t = detail::__dev_attr<::cudaDevAttrCanMapHostMemory>; static constexpr can_map_host_memory_t can_map_host_memory{}; // Compute mode is the compute mode that the device is currently in. - using compute_mode_t = __attr<::cudaDevAttrComputeMode>; + using compute_mode_t = detail::__dev_attr<::cudaDevAttrComputeMode>; static constexpr compute_mode_t compute_mode{}; // true if the device supports executing multiple kernels within the same // context simultaneously, or false if not. It is not guaranteed that multiple // kernels will be resident on the device concurrently so this feature should // not be relied upon for correctness. - using concurrent_kernels_t = __attr<::cudaDevAttrConcurrentKernels>; + using concurrent_kernels_t = detail::__dev_attr<::cudaDevAttrConcurrentKernels>; static constexpr concurrent_kernels_t concurrent_kernels{}; // true if error correction is enabled on the device, 0 if error correction is // disabled or not supported by the device - using ecc_enabled_t = __attr<::cudaDevAttrEccEnabled>; + using ecc_enabled_t = detail::__dev_attr<::cudaDevAttrEccEnabled>; static constexpr ecc_enabled_t ecc_enabled{}; // PCI bus identifier of the device - using pci_bus_id_t = __attr<::cudaDevAttrPciBusId>; + using pci_bus_id_t = detail::__dev_attr<::cudaDevAttrPciBusId>; static constexpr pci_bus_id_t pci_bus_id{}; // PCI device (also known as slot) identifier of the device - using pci_device_id_t = __attr<::cudaDevAttrPciDeviceId>; + using pci_device_id_t = detail::__dev_attr<::cudaDevAttrPciDeviceId>; static constexpr pci_device_id_t pci_device_id{}; // true if the device is using a TCC driver. TCC is only available on Tesla // hardware running Windows Vista or later. - using tcc_driver_t = __attr<::cudaDevAttrTccDriver>; + using tcc_driver_t = detail::__dev_attr<::cudaDevAttrTccDriver>; static constexpr tcc_driver_t tcc_driver{}; // Peak memory clock frequency in kilohertz - using memory_clock_rate_t = __attr<::cudaDevAttrMemoryClockRate>; + using memory_clock_rate_t = detail::__dev_attr<::cudaDevAttrMemoryClockRate>; static constexpr memory_clock_rate_t memory_clock_rate{}; // Global memory bus width in bits - using global_memory_bus_width_t = __attr<::cudaDevAttrGlobalMemoryBusWidth>; + using global_memory_bus_width_t = detail::__dev_attr<::cudaDevAttrGlobalMemoryBusWidth>; static constexpr global_memory_bus_width_t global_memory_bus_width{}; // Size of L2 cache in bytes. 0 if the device doesn't have L2 cache. 
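// Each of the *_t aliases in this struct is an empty tag type; its constexpr instance can be
// passed to device_ref::attr(), or the attribute can be queried by enumerator directly.
// An illustrative sketch, not part of this patch (assumptions: a valid device ordinal 0, an
// initialized CUDA runtime, and the cudax device headers changed in this patch included):
inline void example_query_basic_attributes()
{
  cuda::experimental::device_ref dev{0};
  // Tag-object form: the result type is taken from the attribute's __dev_attr specialization.
  int max_threads = dev.attr(cuda::experimental::device::attrs::max_threads_per_block);
  // Enumerator form: equivalent, spelled with the raw ::cudaDeviceAttr value.
  int sm_count = dev.attr<::cudaDevAttrMultiProcessorCount>();
  (void) max_threads;
  (void) sm_count;
}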
- using l2_cache_size_t = __attr<::cudaDevAttrL2CacheSize>; + using l2_cache_size_t = detail::__dev_attr<::cudaDevAttrL2CacheSize>; static constexpr l2_cache_size_t l2_cache_size{}; // Maximum resident threads per multiprocessor - using max_threads_per_multi_processor_t = __attr<::cudaDevAttrMaxThreadsPerMultiProcessor>; + using max_threads_per_multi_processor_t = detail::__dev_attr<::cudaDevAttrMaxThreadsPerMultiProcessor>; static constexpr max_threads_per_multi_processor_t max_threads_per_multi_processor{}; // true if the device shares a unified address space with the host, or false // if not - using unified_addressing_t = __attr<::cudaDevAttrUnifiedAddressing>; + using unified_addressing_t = detail::__dev_attr<::cudaDevAttrUnifiedAddressing>; static constexpr unified_addressing_t unified_addressing{}; // Major compute capability version number - using compute_capability_major_t = __attr<::cudaDevAttrComputeCapabilityMajor>; + using compute_capability_major_t = detail::__dev_attr<::cudaDevAttrComputeCapabilityMajor>; static constexpr compute_capability_major_t compute_capability_major{}; // Minor compute capability version number - using compute_capability_minor_t = __attr<::cudaDevAttrComputeCapabilityMinor>; + using compute_capability_minor_t = detail::__dev_attr<::cudaDevAttrComputeCapabilityMinor>; static constexpr compute_capability_minor_t compute_capability_minor{}; // true if the device supports stream priorities, or false if not - using stream_priorities_supported_t = __attr<::cudaDevAttrStreamPrioritiesSupported>; + using stream_priorities_supported_t = detail::__dev_attr<::cudaDevAttrStreamPrioritiesSupported>; static constexpr stream_priorities_supported_t stream_priorities_supported{}; // true if device supports caching globals in L1 cache, false if not - using global_l1_cache_supported_t = __attr<::cudaDevAttrGlobalL1CacheSupported>; + using global_l1_cache_supported_t = detail::__dev_attr<::cudaDevAttrGlobalL1CacheSupported>; static constexpr global_l1_cache_supported_t global_l1_cache_supported{}; // true if device supports caching locals in L1 cache, false if not - using local_l1_cache_supported_t = __attr<::cudaDevAttrLocalL1CacheSupported>; + using local_l1_cache_supported_t = detail::__dev_attr<::cudaDevAttrLocalL1CacheSupported>; static constexpr local_l1_cache_supported_t local_l1_cache_supported{}; // Maximum amount of shared memory available to a multiprocessor in bytes; // this amount is shared by all thread blocks simultaneously resident on a // multiprocessor - using max_shared_memory_per_multiprocessor_t = __attr<::cudaDevAttrMaxSharedMemoryPerMultiprocessor>; + using max_shared_memory_per_multiprocessor_t = detail::__dev_attr<::cudaDevAttrMaxSharedMemoryPerMultiprocessor>; static constexpr max_shared_memory_per_multiprocessor_t max_shared_memory_per_multiprocessor{}; // Maximum number of 32-bit registers available to a multiprocessor; this // number is shared by all thread blocks simultaneously resident on a // multiprocessor - using max_registers_per_multiprocessor_t = __attr<::cudaDevAttrMaxRegistersPerMultiprocessor>; + using max_registers_per_multiprocessor_t = detail::__dev_attr<::cudaDevAttrMaxRegistersPerMultiprocessor>; static constexpr max_registers_per_multiprocessor_t max_registers_per_multiprocessor{}; // true if device supports allocating managed memory, false if not - using managed_memory_t = __attr<::cudaDevAttrManagedMemory>; + using managed_memory_t = detail::__dev_attr<::cudaDevAttrManagedMemory>; static constexpr managed_memory_t 
managed_memory{}; // true if device is on a multi-GPU board, false if not - using is_multi_gpu_board_t = __attr<::cudaDevAttrIsMultiGpuBoard>; + using is_multi_gpu_board_t = detail::__dev_attr<::cudaDevAttrIsMultiGpuBoard>; static constexpr is_multi_gpu_board_t is_multi_gpu_board{}; // Unique identifier for a group of devices on the same multi-GPU board - using multi_gpu_board_group_id_t = __attr<::cudaDevAttrMultiGpuBoardGroupID>; + using multi_gpu_board_group_id_t = detail::__dev_attr<::cudaDevAttrMultiGpuBoardGroupID>; static constexpr multi_gpu_board_group_id_t multi_gpu_board_group_id{}; // true if the link between the device and the host supports native atomic // operations - using host_native_atomic_supported_t = __attr<::cudaDevAttrHostNativeAtomicSupported>; + using host_native_atomic_supported_t = detail::__dev_attr<::cudaDevAttrHostNativeAtomicSupported>; static constexpr host_native_atomic_supported_t host_native_atomic_supported{}; // Ratio of single precision performance (in floating-point operations per // second) to double precision performance - using single_to_double_precision_perf_ratio_t = __attr<::cudaDevAttrSingleToDoublePrecisionPerfRatio>; + using single_to_double_precision_perf_ratio_t = detail::__dev_attr<::cudaDevAttrSingleToDoublePrecisionPerfRatio>; static constexpr single_to_double_precision_perf_ratio_t single_to_double_precision_perf_ratio{}; // true if the device supports coherently accessing pageable memory without // calling cudaHostRegister on it, and false otherwise - using pageable_memory_access_t = __attr<::cudaDevAttrPageableMemoryAccess>; + using pageable_memory_access_t = detail::__dev_attr<::cudaDevAttrPageableMemoryAccess>; static constexpr pageable_memory_access_t pageable_memory_access{}; // true if the device can coherently access managed memory concurrently with // the CPU, and false otherwise - using concurrent_managed_access_t = __attr<::cudaDevAttrConcurrentManagedAccess>; + using concurrent_managed_access_t = detail::__dev_attr<::cudaDevAttrConcurrentManagedAccess>; static constexpr concurrent_managed_access_t concurrent_managed_access{}; // true if the device supports Compute Preemption, false if not - using compute_preemption_supported_t = __attr<::cudaDevAttrComputePreemptionSupported>; + using compute_preemption_supported_t = detail::__dev_attr<::cudaDevAttrComputePreemptionSupported>; static constexpr compute_preemption_supported_t compute_preemption_supported{}; // true if the device can access host registered memory at the same virtual // address as the CPU, and false otherwise - using can_use_host_pointer_for_registered_mem_t = __attr<::cudaDevAttrCanUseHostPointerForRegisteredMem>; + using can_use_host_pointer_for_registered_mem_t = detail::__dev_attr<::cudaDevAttrCanUseHostPointerForRegisteredMem>; static constexpr can_use_host_pointer_for_registered_mem_t can_use_host_pointer_for_registered_mem{}; // true if the device supports launching cooperative kernels via // cudaLaunchCooperativeKernel, and false otherwise - using cooperative_launch_t = __attr<::cudaDevAttrCooperativeLaunch>; + using cooperative_launch_t = detail::__dev_attr<::cudaDevAttrCooperativeLaunch>; static constexpr cooperative_launch_t cooperative_launch{}; // true if the device supports launching cooperative kernels via // cudaLaunchCooperativeKernelMultiDevice, and false otherwise - using cooperative_multi_device_launch_t = __attr<::cudaDevAttrCooperativeMultiDeviceLaunch>; + using cooperative_multi_device_launch_t = 
detail::__dev_attr<::cudaDevAttrCooperativeMultiDeviceLaunch>; static constexpr cooperative_multi_device_launch_t cooperative_multi_device_launch{}; // true if the device supports flushing of outstanding remote writes, and // false otherwise - using can_flush_remote_writes_t = __attr<::cudaDevAttrCanFlushRemoteWrites>; + using can_flush_remote_writes_t = detail::__dev_attr<::cudaDevAttrCanFlushRemoteWrites>; static constexpr can_flush_remote_writes_t can_flush_remote_writes{}; // true if the device supports host memory registration via cudaHostRegister, // and false otherwise - using host_register_supported_t = __attr<::cudaDevAttrHostRegisterSupported>; + using host_register_supported_t = detail::__dev_attr<::cudaDevAttrHostRegisterSupported>; static constexpr host_register_supported_t host_register_supported{}; // true if the device accesses pageable memory via the host's page tables, and // false otherwise - using pageable_memory_access_uses_host_page_tables_t = __attr<::cudaDevAttrPageableMemoryAccessUsesHostPageTables>; + using pageable_memory_access_uses_host_page_tables_t = + detail::__dev_attr<::cudaDevAttrPageableMemoryAccessUsesHostPageTables>; static constexpr pageable_memory_access_uses_host_page_tables_t pageable_memory_access_uses_host_page_tables{}; // true if the host can directly access managed memory on the device without // migration, and false otherwise - using direct_managed_mem_access_from_host_t = __attr<::cudaDevAttrDirectManagedMemAccessFromHost>; + using direct_managed_mem_access_from_host_t = detail::__dev_attr<::cudaDevAttrDirectManagedMemAccessFromHost>; static constexpr direct_managed_mem_access_from_host_t direct_managed_mem_access_from_host{}; // Maximum per block shared memory size on the device. This value can be opted // into when using cudaFuncSetAttribute - using max_shared_memory_per_block_optin_t = __attr<::cudaDevAttrMaxSharedMemoryPerBlockOptin>; + using max_shared_memory_per_block_optin_t = detail::__dev_attr<::cudaDevAttrMaxSharedMemoryPerBlockOptin>; static constexpr max_shared_memory_per_block_optin_t max_shared_memory_per_block_optin{}; // Maximum number of thread blocks that can reside on a multiprocessor - using max_blocks_per_multiprocessor_t = __attr<::cudaDevAttrMaxBlocksPerMultiprocessor>; + using max_blocks_per_multiprocessor_t = detail::__dev_attr<::cudaDevAttrMaxBlocksPerMultiprocessor>; static constexpr max_blocks_per_multiprocessor_t max_blocks_per_multiprocessor{}; // Maximum L2 persisting lines capacity setting in bytes - using max_persisting_l2_cache_size_t = __attr<::cudaDevAttrMaxPersistingL2CacheSize>; + using max_persisting_l2_cache_size_t = detail::__dev_attr<::cudaDevAttrMaxPersistingL2CacheSize>; static constexpr max_persisting_l2_cache_size_t max_persisting_l2_cache_size{}; // Maximum value of cudaAccessPolicyWindow::num_bytes - using max_access_policy_window_size_t = __attr<::cudaDevAttrMaxAccessPolicyWindowSize>; + using max_access_policy_window_size_t = detail::__dev_attr<::cudaDevAttrMaxAccessPolicyWindowSize>; static constexpr max_access_policy_window_size_t max_access_policy_window_size{}; // Shared memory reserved by CUDA driver per block in bytes - using reserved_shared_memory_per_block_t = __attr<::cudaDevAttrReservedSharedMemoryPerBlock>; + using reserved_shared_memory_per_block_t = detail::__dev_attr<::cudaDevAttrReservedSharedMemoryPerBlock>; static constexpr reserved_shared_memory_per_block_t reserved_shared_memory_per_block{}; // true if the device supports sparse CUDA arrays and sparse CUDA mipmapped arrays. 
- using sparse_cuda_array_supported_t = __attr<::cudaDevAttrSparseCudaArraySupported>; + using sparse_cuda_array_supported_t = detail::__dev_attr<::cudaDevAttrSparseCudaArraySupported>; static constexpr sparse_cuda_array_supported_t sparse_cuda_array_supported{}; // Device supports using the cudaHostRegister flag cudaHostRegisterReadOnly to // register memory that must be mapped as read-only to the GPU - using host_register_read_only_supported_t = __attr<::cudaDevAttrHostRegisterReadOnlySupported>; + using host_register_read_only_supported_t = detail::__dev_attr<::cudaDevAttrHostRegisterReadOnlySupported>; static constexpr host_register_read_only_supported_t host_register_read_only_supported{}; // true if the device supports using the cudaMallocAsync and cudaMemPool // family of APIs, and false otherwise - using memory_pools_supported_t = __attr<::cudaDevAttrMemoryPoolsSupported>; + using memory_pools_supported_t = detail::__dev_attr<::cudaDevAttrMemoryPoolsSupported>; static constexpr memory_pools_supported_t memory_pools_supported{}; // true if the device supports GPUDirect RDMA APIs, and false otherwise - using gpu_direct_rdma_supported_t = __attr<::cudaDevAttrGPUDirectRDMASupported>; + using gpu_direct_rdma_supported_t = detail::__dev_attr<::cudaDevAttrGPUDirectRDMASupported>; static constexpr gpu_direct_rdma_supported_t gpu_direct_rdma_supported{}; // bitmask to be interpreted according to the // cudaFlushGPUDirectRDMAWritesOptions enum - using gpu_direct_rdma_flush_writes_options_t = __attr<::cudaDevAttrGPUDirectRDMAFlushWritesOptions>; + using gpu_direct_rdma_flush_writes_options_t = detail::__dev_attr<::cudaDevAttrGPUDirectRDMAFlushWritesOptions>; static constexpr gpu_direct_rdma_flush_writes_options_t gpu_direct_rdma_flush_writes_options{}; // see the cudaGPUDirectRDMAWritesOrdering enum for numerical values - using gpu_direct_rdma_writes_ordering_t = __attr<::cudaDevAttrGPUDirectRDMAWritesOrdering>; + using gpu_direct_rdma_writes_ordering_t = detail::__dev_attr<::cudaDevAttrGPUDirectRDMAWritesOrdering>; static constexpr gpu_direct_rdma_writes_ordering_t gpu_direct_rdma_writes_ordering{}; // Bitmask of handle types supported with mempool based IPC - using memory_pool_supported_handle_types_t = __attr<::cudaDevAttrMemoryPoolSupportedHandleTypes>; + using memory_pool_supported_handle_types_t = detail::__dev_attr<::cudaDevAttrMemoryPoolSupportedHandleTypes>; static constexpr memory_pool_supported_handle_types_t memory_pool_supported_handle_types{}; // true if the device supports deferred mapping CUDA arrays and CUDA mipmapped // arrays. - using deferred_mapping_cuda_array_supported_t = __attr<::cudaDevAttrDeferredMappingCudaArraySupported>; + using deferred_mapping_cuda_array_supported_t = detail::__dev_attr<::cudaDevAttrDeferredMappingCudaArraySupported>; static constexpr deferred_mapping_cuda_array_supported_t deferred_mapping_cuda_array_supported{}; // true if the device supports IPC Events, false otherwise. 
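// memory_pool_supported_handle_types is a bitmask of ::cudaMemAllocationHandleType values, so
// individual handle kinds are tested with bitwise AND against the constants defined on the tag
// type above. An illustrative sketch mirroring the smoke test added later in this patch
// (assumes a valid device ordinal 0; the function name is hypothetical):
inline bool example_supports_posix_fd_export()
{
  auto supported = cuda::experimental::device_ref(0).attr(
    cuda::experimental::device::attrs::memory_pool_supported_handle_types);
  // Unscoped enum values promote to int, so a plain bitwise test works here.
  return (supported
          & cuda::experimental::device::attrs::memory_pool_supported_handle_types.posix_file_descriptor)
      != 0;
}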
- using ipc_event_support_t = __attr<::cudaDevAttrIpcEventSupport>; + using ipc_event_support_t = detail::__dev_attr<::cudaDevAttrIpcEventSupport>; static constexpr ipc_event_support_t ipc_event_support{}; #if CUDART_VERSION >= 12020 // NUMA configuration of a device: value is of type cudaDeviceNumaConfig enum - using numa_config_t = __attr<::cudaDevAttrNumaConfig>; + using numa_config_t = detail::__dev_attr<::cudaDevAttrNumaConfig>; static constexpr numa_config_t numa_config{}; // NUMA node ID of the GPU memory - using numa_id_t = __attr<::cudaDevAttrNumaId>; + using numa_id_t = detail::__dev_attr<::cudaDevAttrNumaId>; static constexpr numa_id_t numa_id{}; #endif // CUDART_VERSION >= 12020 diff --git a/cudax/include/cuda/experimental/__device/device.cuh b/cudax/include/cuda/experimental/__device/device.cuh index 35e0cfe2d4..5532e8f59b 100644 --- a/cudax/include/cuda/experimental/__device/device.cuh +++ b/cudax/include/cuda/experimental/__device/device.cuh @@ -52,6 +52,20 @@ struct __emplace_device class device : public device_ref { public: + struct attrs; + + //! @brief For a given attribute, returns the type of the attribute value. + //! + //! @par Example + //! @code + //! using threads_per_block_t = device::attr_result_t; + //! static_assert(std::is_same_v); + //! @endcode + //! + //! @sa device::attrs + template <::cudaDeviceAttr _Attr> + using attr_result_t = typename detail::__dev_attr<_Attr>::type; + #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document # if defined(_CCCL_COMPILER_MSVC) // When __EDG__ is defined, std::construct_at will not permit constructing @@ -91,6 +105,9 @@ private: mutable CUdevice __device{}; mutable ::std::once_flag __init_once; + // TODO: put a mutable thread-safe (or thread_local) cache of device + // properties here. + explicit constexpr device(int __id) noexcept : device_ref(__id) {} diff --git a/cudax/include/cuda/experimental/__device/device_ref.cuh b/cudax/include/cuda/experimental/__device/device_ref.cuh index 7f2635611f..91e4e90caa 100644 --- a/cudax/include/cuda/experimental/__device/device_ref.cuh +++ b/cudax/include/cuda/experimental/__device/device_ref.cuh @@ -27,6 +27,12 @@ namespace cuda::experimental { class device; +namespace detail +{ +template <::cudaDeviceAttr _Attr> +struct __dev_attr; +} // namespace detail + //! @brief A non-owning representation of a CUDA device class device_ref { @@ -34,37 +40,7 @@ class device_ref int __id_ = 0; - template <::cudaDeviceAttr _Attr> - struct __attr - { - using type = int; - - _CCCL_NODISCARD constexpr operator ::cudaDeviceAttr() const noexcept - { - return _Attr; - } - - _CCCL_NODISCARD type operator()(device_ref __dev) const - { - return __dev.attr<_Attr>(); - } - }; - public: - struct attrs; - - //! @brief For a given attribute, returns the type of the attribute value. - //! - //! @par Example - //! @code - //! using threads_per_block_t = device_ref::attr_result_t; - //! static_assert(std::is_same_v); - //! @endcode - //! - //! @sa device_ref::attrs - template <::cudaDeviceAttr _Attr> - using attr_result_t = typename __attr<_Attr>::type; - //! @brief Create a `device_ref` object from a native device ordinal. /*implicit*/ constexpr device_ref(int __id) noexcept : __id_(__id) @@ -78,27 +54,27 @@ public: return __id_; } - //! @brief Retrieve the specified attribute for the `device_ref` + //! @brief Retrieve the specified attribute for the device //! - //! @param __attr The attribute to query. See `device_ref::attrs` for the available + //! @param __attr The attribute to query. 
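// attr_result_t maps an attribute enumerator to the C++ type returned by the query, which is
// handy for declaring variables or writing static_asserts up front. A sketch restating the
// documentation example above with its template arguments spelled out (assumes C++17,
// <type_traits>, and the cudax device headers):
using threads_per_block_t = cuda::experimental::device::attr_result_t<::cudaDevAttrMaxThreadsPerBlock>;
static_assert(std::is_same_v<threads_per_block_t, int>, "plain integral attribute");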
See `device::attrs` for the available //! attributes. //! //! @throws cuda_error if the attribute query fails //! - //! @sa device_ref::attrs + //! @sa device::attrs template <::cudaDeviceAttr _Attr> - _CCCL_NODISCARD auto attr([[maybe_unused]] device_ref::__attr<_Attr> __attr) const + _CCCL_NODISCARD auto attr([[maybe_unused]] detail::__dev_attr<_Attr> __attr) const { int __value = 0; _CCCL_TRY_CUDA_API(::cudaDeviceGetAttribute, "failed to get device attribute", &__value, _Attr, get()); - return static_cast::type>(__value); + return static_cast::type>(__value); } //! @overload template <::cudaDeviceAttr _Attr> _CCCL_NODISCARD auto attr() const { - return attr(__attr<_Attr>()); + return attr(detail::__dev_attr<_Attr>()); } }; diff --git a/cudax/test/device/device_smoke.cu b/cudax/test/device/device_smoke.cu index 6f772de08a..b98d05fc3b 100644 --- a/cudax/test/device/device_smoke.cu +++ b/cudax/test/device/device_smoke.cu @@ -177,7 +177,7 @@ TEST_CASE("Smoke", "[device]") ::cudaGPUDirectRDMAWritesOrdering>(); ::test_device_attribute(); + ::cudaMemAllocationHandleType>(); ::test_device_attribute(); @@ -225,6 +225,30 @@ TEST_CASE("Smoke", "[device]") ordering == device::attrs::gpu_direct_rdma_writes_ordering.all_devices)); } + SECTION("memory_pool_supported_handle_types") + { + STATIC_REQUIRE(::cudaMemHandleTypeNone == device::attrs::memory_pool_supported_handle_types.none); + STATIC_REQUIRE(::cudaMemHandleTypePosixFileDescriptor + == device::attrs::memory_pool_supported_handle_types.posix_file_descriptor); + STATIC_REQUIRE(::cudaMemHandleTypeWin32 == device::attrs::memory_pool_supported_handle_types.win32); + STATIC_REQUIRE(::cudaMemHandleTypeWin32Kmt == device::attrs::memory_pool_supported_handle_types.win32_kmt); +#if CUDART_VERSION >= 12040 + STATIC_REQUIRE(::cudaMemHandleTypeFabric == 0x8); + STATIC_REQUIRE(::cudaMemHandleTypeFabric == device::attrs::memory_pool_supported_handle_types.fabric); +#else + STATIC_REQUIRE(0x8 == device::attrs::memory_pool_supported_handle_types.fabric); +#endif + + constexpr int all_handle_types = + device::attrs::memory_pool_supported_handle_types.none + | device::attrs::memory_pool_supported_handle_types.posix_file_descriptor + | device::attrs::memory_pool_supported_handle_types.win32 + | device::attrs::memory_pool_supported_handle_types.win32_kmt + | device::attrs::memory_pool_supported_handle_types.fabric; + auto handle_types = device_ref(0).attr(device::attrs::memory_pool_supported_handle_types); + CUDAX_REQUIRE(handle_types <= all_handle_types); + } + #if CUDART_VERSION >= 12020 SECTION("numa_config") { From b0e09d04b070447bc7c5e13ecd6c962b3c6773ca Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Tue, 6 Aug 2024 20:15:38 -0700 Subject: [PATCH 06/33] Integrate Python docs (#2196) * pass docs build options to repo.sh * Integrate Python docs * update CI * Apply suggestions from code review Co-authored-by: Georgii Evtushenko --------- Co-authored-by: Georgii Evtushenko --- .github/actions/docs-build/action.yml | 2 + docs/cpp.rst | 52 +++++++++++++++++++++ docs/{pycuda => cuda_cooperative}/index.rst | 4 +- docs/gen_docs.bash | 7 +-- docs/index.rst | 27 ++++------- docs/python.rst | 15 ++++++ docs/repo.toml | 10 ++-- 7 files changed, 88 insertions(+), 29 deletions(-) create mode 100644 docs/cpp.rst rename docs/{pycuda => cuda_cooperative}/index.rst (89%) create mode 100644 docs/python.rst diff --git a/.github/actions/docs-build/action.yml b/.github/actions/docs-build/action.yml index 78af3d04a7..8b997f4741 100644 --- 
a/.github/actions/docs-build/action.yml +++ b/.github/actions/docs-build/action.yml @@ -36,6 +36,8 @@ runs: cp -rf ./docs/_build/docs/thrust/latest/* _site/thrust mkdir _site/cudax cp -rf ./docs/_build/docs/cudax/latest/* _site/cudax + mkdir _site/cuda_cooperative + cp -rf ./docs/_build/docs/cuda_cooperative/latest/* _site/cuda_cooperative ./docs/scrape_docs.bash ./_site # Update docs as workflow artifact: diff --git a/docs/cpp.rst b/docs/cpp.rst new file mode 100644 index 0000000000..453ab1e2f7 --- /dev/null +++ b/docs/cpp.rst @@ -0,0 +1,52 @@ +.. _cccl-cpp-libraries: + +CUDA C++ Core Libraries +======================= + +.. toctree:: + :hidden: + :maxdepth: 3 + + libcu++ + CUB + Thrust + Cuda Experimental + +Welcome to the CUDA Core Compute Libraries (CCCL) libraries for C++. + +The concept for the CCCL C++ libraries grew organically out of the Thrust, +CUB, and libcudacxx projects that were developed independently over the years +with a similar goal: to provide high-quality, high-performance, and +easy-to-use C++ abstractions for CUDA developers. Naturally, there was a lot +of overlap among the three projects, and it became clear the community would +be better served by unifying them into a single repository. + +- `libcu++ `__ + is the CUDA C++ Standard Library. It provides an implementation of the C++ + Standard Library that works in both host and device code. Additionally, it + provides abstractions for CUDA-specific hardware features like + synchronization primitives, cache control, atomics, and more. + +- `CUB `__ + is a lower-level, CUDA-specific library designed for speed-of-light parallel + algorithms across all GPU architectures. In addition to device-wide + algorithms, it provides *cooperative algorithms* like block-wide reduction + and warp-wide scan, providing CUDA kernel developers with building blocks to + create speed-of-light, custom kernels. + +- `Thrust `__ + is the C++ parallel algorithms library which inspired the introduction of + parallel algorithms to the C++ Standard Library. Thrust's high-level + interface greatly enhances programmer productivity while enabling performance + portability between GPUs and multicore CPUs via configurable backends that + allow using multiple parallel programming frameworks (such as CUDA, TBB, and + OpenMP). + +- `Cuda Experimental `__ + is a library of experimental features that are still in the design process. + +The main goal of the CCCL C++ libraries is to fill a similar role that the +Standard C++ Library fills for Standard C++: provide general-purpose, +speed-of-light tools to CUDA C++ developers, allowing them to focus on +solving the problems that matter. Unifying these projects is the first step +towards realizing that goal. diff --git a/docs/pycuda/index.rst b/docs/cuda_cooperative/index.rst similarity index 89% rename from docs/pycuda/index.rst rename to docs/cuda_cooperative/index.rst index c9c0e227b8..0eb41f3c3d 100644 --- a/docs/pycuda/index.rst +++ b/docs/cuda_cooperative/index.rst @@ -1,6 +1,6 @@ -.. _pycudax-module: +.. _cuda_cooperative-module: -CUDA +CUDA Cooperative ================================================== ..
warning:: diff --git a/docs/gen_docs.bash b/docs/gen_docs.bash index d879463741..bd2ddae6f8 100755 --- a/docs/gen_docs.bash +++ b/docs/gen_docs.bash @@ -1,7 +1,8 @@ #!/usr/bin/env bash -## This script just wraps launching a docs build within a container -## Tag is passed on as the first argument ${1} +# This script just wraps launching a repo docs build within a container +# +# Additional options, e.g --stage sphinx will be passed on to repo.sh set -e @@ -36,7 +37,7 @@ if [ ! -n "$(find img -name '*.png')" ]; then done fi -if ! ./repo.sh docs; then +if ! ./repo.sh docs "$@"; then echo "!!! There were errors while generating" exit 1 fi diff --git a/docs/index.rst b/docs/index.rst index a639b68b2b..1862ecb563 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,27 +1,16 @@ -CUDA C++ Core Libraries -======================= +CUDA Core Compute Libraries +=========================== .. toctree:: :hidden: :maxdepth: 3 - libcu++ - CUB - Thrust - Cuda Experimental + cpp + python -Welcome to the CUDA C++ Core Libraries (CCCL) where our mission is to make CUDA C++ more delightful. +Welcome to the CUDA Core Compute Libraries (CCCL) where our mission is to +make CUDA C++ and Python more delightful. -The concept for the CUDA C++ Core Libraries (CCCL) grew organically out of the Thrust, CUB, and libcudacxx projects that were developed independently over the years with a similar goal: to provide high-quality, high-performance, and easy-to-use C++ abstractions for CUDA developers. -Naturally, there was a lot of overlap among the three projects, and it became clear the community would be better served by unifying them into a single repository. +- :ref:`cccl-cpp-libraries` -- `libcu++ `__ is the CUDA C++ Standard Library. It provides an implementation of the C++ Standard Library that works in both host and device code. Additionally, it provides abstractions for CUDA-specific hardware features like synchronization primitives, cache control, atomics, and more. - -- `CUB `__ is a lower-level, CUDA-specific library designed for speed-of-light parallel algorithms across all GPU architectures. In addition to device-wide algorithms, it provides *cooperative algorithms* like block-wide reduction and warp-wide scan, providing CUDA kernel developers with building blocks to create speed-of-light, custom kernels. - -- `Thrust `__ is the C++ parallel algorithms library which inspired the introduction of parallel algorithms to the C++ Standard Library. Thrust's high-level interface greatly enhances programmer productivity while enabling performance portability between GPUs and multicore CPUs via configurable backends that allow using multiple parallel programming frameworks (such as CUDA, TBB, and OpenMP). - -- `Cuda Experimental `__ is a library of exerimental features that are still in the design process. - -The main goal of CCCL is to fill a similar role that the Standard C++ Library fills for Standard C++: provide general-purpose, speed-of-light tools to CUDA C++ developers, allowing them to focus on solving the problems that matter. -Unifying these projects is the first step towards realizing that goal. +- :ref:`cccl-python-libraries` diff --git a/docs/python.rst b/docs/python.rst new file mode 100644 index 0000000000..b0b9c5b73f --- /dev/null +++ b/docs/python.rst @@ -0,0 +1,15 @@ +.. _cccl-python-libraries: + +CUDA Python Core Libraries +========================== + +.. toctree:: + :hidden: + :maxdepth: 3 + + cuda.cooperative + +Welcome to the CUDA Core Compute Libraries (CCCL) libraries for Python. 
+ +- `cuda.cooperative `__ + is a still-experimental library exposing cooperative algorithms to Python. diff --git a/docs/repo.toml b/docs/repo.toml index 74ebb0be7d..0741089ceb 100644 --- a/docs/repo.toml +++ b/docs/repo.toml @@ -25,14 +25,14 @@ sphinx_exclude_patterns = [ "VERSION.md", ] -project_build_order = [ "libcudacxx", "cudax", "cub", "thrust", "cccl", "pycuda" ] +project_build_order = [ "libcudacxx", "cudax", "cub", "thrust", "cccl", "cuda_cooperative" ] # deps can be used to link to other projects' documentation deps = [ [ "libcudacxx", "_build/docs/libcudacxx/latest" ], [ "cub", "_build/docs/cub/latest" ], [ "thrust", "_build/docs/thrust/latest" ], - [ "pycuda", "_build/docs/pycuda/latest" ], + [ "cuda_cooperative", "_build/docs/cuda_cooperative/latest" ], ] [repo_docs.projects.libcudacxx] @@ -281,9 +281,9 @@ doxygen_conf_extra = """ STRIP_FROM_PATH = ../../thrust """ -[repo_docs.projects.pycuda] -name = "pycuda" -docs_root = "pycuda" +[repo_docs.projects.cuda_cooperative] +name = "cuda.cooperative" +docs_root = "cuda_cooperative" logo = "../img/logo.png" repo_url = "https://github.com/NVIDIA/cccl/python/cuda" From 62336adbce4ee527a7bcd3b086b649822f8c95d4 Mon Sep 17 00:00:00 2001 From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com> Date: Wed, 7 Aug 2024 01:24:30 -0700 Subject: [PATCH 07/33] [FEA] Atomics codegen refactor (#1993) * Initial draft of new atomics backend * Change atomic fetch ops back to tag dispatch * Save wip * Add load/store and support for MMIO * Begin working on exch * Enable formatting exchange * Several signed-ness fixes * Make atomics ptx tests build. Lit tests are a WIP. * Fix load/store, some volatileness, and min/max * Formatting and enabled codegen in all builds * Make integral.pass.cpp pass * Make the rest of the atomics tests pass * Use 128b ld/st instead of vector load as it is not atomic across the whole atom * Fix copy-paste mistake in load/store * Whitespace fixup * Fix 128b .exch using .cas operands * Make codegen link fmt as PRIVATE Co-authored-by: Allison Piper * Simplify MMIO down to a static array. Co-authored-by: Bernhard Manfred Gruber * Static -> Inline for codegen functions. Replace endl with '\n'. * Supply the output stream directly to `fmt::format` * Update fmtlib. * Revert `fmt::format(out...)` changes. They don't work on MSVC. 
* Fixup libcudacxx codegen CMake stuff * Remove sneaky cstdef include that was auto-added * [pre-commit.ci] auto code formatting --------- Co-authored-by: Allison Piper Co-authored-by: Bernhard Manfred Gruber Co-authored-by: Michael Schellenberger Costa Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- CMakePresets.json | 4 +- libcudacxx/CMakeLists.txt | 2 +- libcudacxx/codegen/CMakeLists.txt | 28 +- libcudacxx/codegen/codegen.cpp | 527 +- .../codegen/generators/compare_and_swap.h | 175 + libcudacxx/codegen/generators/definitions.h | 193 + libcudacxx/codegen/generators/exchange.h | 173 + libcudacxx/codegen/generators/fence.h | 110 + libcudacxx/codegen/generators/fetch_ops.h | 217 + libcudacxx/codegen/generators/header.h | 80 + libcudacxx/codegen/generators/ld_st.h | 353 + .../cuda/std/__atomic/functions/common.h | 54 + .../std/__atomic/functions/cuda_ptx_derived.h | 514 +- .../__atomic/functions/cuda_ptx_generated.h | 10729 ++++++---------- .../functions/cuda_ptx_generated_helper.h | 155 + .../cuda/std/__atomic/functions/host.h | 25 +- libcudacxx/include/cuda/std/__atomic/scopes.h | 2 + libcudacxx/test/atomic_codegen/CMakeLists.txt | 8 +- .../atomic_codegen/atomic_add_non_volatile.cu | 6 +- .../atomic_store_non_volatile.cu | 2 +- .../atomic_codegen/atomic_sub_non_volatile.cu | 6 +- 21 files changed, 5876 insertions(+), 7487 deletions(-) create mode 100644 libcudacxx/codegen/generators/compare_and_swap.h create mode 100644 libcudacxx/codegen/generators/definitions.h create mode 100644 libcudacxx/codegen/generators/exchange.h create mode 100644 libcudacxx/codegen/generators/fence.h create mode 100644 libcudacxx/codegen/generators/fetch_ops.h create mode 100644 libcudacxx/codegen/generators/header.h create mode 100644 libcudacxx/codegen/generators/ld_st.h create mode 100644 libcudacxx/include/cuda/std/__atomic/functions/common.h create mode 100644 libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h diff --git a/CMakePresets.json b/CMakePresets.json index 004d57b2ba..61cb88eca8 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -478,7 +478,7 @@ ], "filter": { "exclude": { - "name": "^libcudacxx\\.test\\.lit$" + "name": "^libcudacxx\\.test\\.(lit|atomics\\.codegen\\.diff)$" } } }, @@ -487,7 +487,7 @@ "configurePreset": "libcudacxx-codegen", "filter": { "include": { - "name": "^libcudacxx\\.atomics\\.codegen.*$" + "name": "^libcudacxx\\.test\\.atomics\\.codegen.*$" } } }, diff --git a/libcudacxx/CMakeLists.txt b/libcudacxx/CMakeLists.txt index 989fd642f0..7883110573 100644 --- a/libcudacxx/CMakeLists.txt +++ b/libcudacxx/CMakeLists.txt @@ -33,7 +33,7 @@ include(CTest) enable_testing() # Add codegen module -option(libcudacxx_ENABLE_CODEGEN "Enable ctest-based testing." OFF) +option(libcudacxx_ENABLE_CODEGEN "Enable libcudacxx's atomics backend codegen and tests." 
OFF) if (libcudacxx_ENABLE_CODEGEN) add_subdirectory(codegen) endif() diff --git a/libcudacxx/codegen/CMakeLists.txt b/libcudacxx/codegen/CMakeLists.txt index 3477f988af..05821a4eca 100644 --- a/libcudacxx/codegen/CMakeLists.txt +++ b/libcudacxx/codegen/CMakeLists.txt @@ -1,11 +1,11 @@ ## Codegen adds the following build targets # libcudacxx.atomics.codegen -# libcudacxx.atomics.codegen.execute # libcudacxx.atomics.codegen.install ## Test targets: -# libcudacxx.atomics.codegen.diff +# libcudacxx.test.atomics.codegen.diff -add_custom_target(libcudacxx.atomics.codegen) +include(${CMAKE_SOURCE_DIR}/cub/cmake/CPM.cmake) +CPMAddPackage("gh:fmtlib/fmt#11.0.1") add_executable( codegen @@ -13,32 +13,32 @@ add_executable( codegen.cpp ) -target_compile_features( - codegen PRIVATE cxx_std_14 -) +target_link_libraries(codegen PRIVATE fmt) -add_dependencies(libcudacxx.atomics.codegen codegen) +set_property(TARGET codegen PROPERTY CXX_STANDARD 17) set(atomic_generated_output "${libcudacxx_BINARY_DIR}/codegen/cuda_ptx_generated.h") set(atomic_install_location "${libcudacxx_SOURCE_DIR}/include/cuda/std/__atomic/functions") add_custom_target( - libcudacxx.atomics.codegen.execute - COMMAND codegen + libcudacxx.atomics.codegen + COMMAND codegen "${atomic_generated_output}" BYPRODUCTS "${atomic_generated_output}" ) -add_dependencies(libcudacxx.atomics.codegen libcudacxx.atomics.codegen.execute) - add_custom_target( libcudacxx.atomics.codegen.install COMMAND ${CMAKE_COMMAND} -E copy "${atomic_generated_output}" "${atomic_install_location}/cuda_ptx_generated.h" + DEPENDS libcudacxx.atomics.codegen BYPRODUCTS "${atomic_install_location}/cuda_ptx_generated.h" ) -add_dependencies(libcudacxx.atomics.codegen.install libcudacxx.atomics.codegen.execute) - add_test( - NAME libcudacxx.atomics.codegen.diff + NAME libcudacxx.test.atomics.codegen.diff COMMAND ${CMAKE_COMMAND} -E compare_files "${atomic_install_location}/cuda_ptx_generated.h" "${atomic_generated_output}" ) + +set_tests_properties( + libcudacxx.test.atomics.codegen.diff + PROPERTIES REQUIRED_FILES "${atomic_generated_output}" +) diff --git a/libcudacxx/codegen/codegen.cpp b/libcudacxx/codegen/codegen.cpp index 0d4a7a7296..5a5e36454f 100644 --- a/libcudacxx/codegen/codegen.cpp +++ b/libcudacxx/codegen/codegen.cpp @@ -4,521 +4,42 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
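// For orientation, the removed implementation below emitted per-size/order/scope helpers of
// roughly the following shape into cuda_ptx_generated.h. This is a reconstruction from its
// format strings for one fence/load combination (block scope, 32-bit, acquire), not the
// verbatim output of either the old or the refactored generator:
static inline _CCCL_DEVICE void __cuda_membar_block()
{
  asm volatile("membar.cta;" ::: "memory");
}
static inline _CCCL_DEVICE void __cuda_fence_sc_block()
{
  asm volatile("fence.sc.cta;" ::: "memory");
}
template <class _CUDA_A, class _CUDA_B>
static inline _CCCL_DEVICE void __cuda_load_acquire_32_block(_CUDA_A __ptr, _CUDA_B& __dst)
{
  asm volatile("ld.acquire.cta.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory");
}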
// //===----------------------------------------------------------------------===// #include -#include -#include -#include +#include +#include + +#include "generators/compare_and_swap.h" +#include "generators/exchange.h" +#include "generators/fence.h" +#include "generators/fetch_ops.h" +#include "generators/header.h" +#include "generators/ld_st.h" using namespace std::string_literals; -int main() +int main(int argc, char** argv) { - std::map scopes{{"system", ".sys"}, {"device", ".gpu"}, {"block", ".cta"}}; - - std::map membar_scopes{{"system", ".sys"}, {"device", ".gl"}, {"block", ".cta"}}; - - std::map fence_semantics{{"sc", ".sc"}, {"acq_rel", ".acq_rel"}}; - - bool const ld_as_atom = false; - - std::vector ld_sizes{ - // 8, - // 16, - 32, - 64}; - std::map ld_semantics{ - {"relaxed", ".relaxed"}, {"acquire", ".acquire"}, {"volatile", ".volatile"}}; - - std::vector st_sizes{ - // 8, - // 16, - 32, - 64}; - std::map st_semantics{ - {"relaxed", ".relaxed"}, {"release", ".release"}, {"volatile", ".volatile"}}; - - std::vector rmw_sizes{32, 64}; - std::map rmw_semantics{ - {"relaxed", ".relaxed"}, - {"acquire", ".acquire"}, - {"release", ".release"}, - {"acq_rel", ".acq_rel"}, - {"volatile", ""}}; - std::vector rmw_classes{"bitwise", "arithmetic"}; - std::map> rmw_operations{ - {"bitwise", std::map{{"fetch_and", ".and"}, {"fetch_or", ".or"}, {"fetch_xor", ".xor"}}}, - {"arithmetic", - std::map{ - {"exchange", ".exch"}, - {"compare_exchange", ".cas"}, - {"fetch_add", ".add"}, - {"fetch_sub", ".add"}, - {"fetch_max", ".max"}, - {"fetch_min", ".min"}}}}; - std::map> rmw_types{ - {"bitwise", std::map{{"", ".b"}}}, - {"arithmetic", std::map{{"u", ".u"}, {"s", ".s"}, {"f", ".f"}}}}; - - std::vector cv_qualifier{"volatile ", ""}; + std::fstream filestream; - std::ofstream out("cuda_ptx_generated.h"); - - out << R"XXX(//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -// This is an autogenerated file, we want to ensure that it contains exactly the contents we want to generate -// clang-format off - -#ifndef _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H -#define _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include -#include - -#include -#include -#include - -#include -#include - -_LIBCUDACXX_BEGIN_NAMESPACE_STD - -#if defined(_CCCL_CUDA_COMPILER) - -)XXX"; - - auto scopenametag = [&](auto scope) { - return "__thread_scope_" + scope + "_tag"; - }; - auto fencename = [&](auto sem, auto scope) { - return "__cuda_fence_" + sem + "_" + scope; - }; - auto registers = [&](auto type_literal, auto type_size) { - if (type_literal == "f") - { - return (type_size == 32) ? "f" : "d"; - } - else - { - return (type_size == 32) ? 
"r" : "l"; - } - }; - - for (auto& s : scopes) + if (argc == 2) { - out << "static inline _CCCL_DEVICE void __cuda_membar_" << s.first << "() { asm volatile(\"membar" - << membar_scopes[s.first] << ";\":::\"memory\"); }\n"; - for (auto& sem : fence_semantics) - { - out << "static inline _CCCL_DEVICE void " << fencename(sem.first, s.first) << "() { asm volatile(\"fence" - << sem.second << s.second << ";\":::\"memory\"); }\n"; - } - out << "static inline _CCCL_DEVICE void __atomic_thread_fence_cuda(int __memorder, " << scopenametag(s.first) - << ") {\n"; - out << " NV_DISPATCH_TARGET(\n"; - out << " NV_PROVIDES_SM_70, (\n"; - out << " switch (__memorder) {\n"; - out << " case __ATOMIC_SEQ_CST: " << fencename("sc"s, s.first) << "(); break;\n"; - out << " case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQUIRE: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQ_REL: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_RELEASE: " << fencename("acq_rel"s, s.first) << "(); break;\n"; - out << " case __ATOMIC_RELAXED: break;\n"; - out << " default: assert(0);\n"; - out << " }\n"; - out << " ),\n"; - out << " NV_IS_DEVICE, (\n"; - out << " switch (__memorder) {\n"; - out << " case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQUIRE: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQ_REL: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_RELEASE: __cuda_membar_" << s.first << "(); break;\n"; - out << " case __ATOMIC_RELAXED: break;\n"; - out << " default: assert(0);\n"; - out << " }\n"; - out << " )\n"; - out << " )\n"; - out << "}\n"; - for (auto& sz : ld_sizes) - { - for (auto& sem : ld_semantics) - { - out << "template "; - out << "static inline _CCCL_DEVICE void __cuda_load_" << sem.first << "_" << sz << "_" << s.first - << "(_CUDA_A __ptr, _CUDA_B& __dst) {"; - if (ld_as_atom) - { - out << "asm volatile(\"atom.add" << (sem.first == "volatile" ? "" : sem.second.c_str()) << s.second << ".u" - << sz << " %0, [%1], 0;\" : "; - } - else - { - out << "asm volatile(\"ld" << sem.second << (sem.first == "volatile" ? 
"" : s.second.c_str()) << ".b" << sz - << " %0,[%1];\" : "; - } - out << "\"=" << registers("b", sz) << "\"(__dst) : \"l\"(__ptr)"; - out << " : \"memory\"); }\n"; - } - for (auto& cv : cv_qualifier) - { - out << "template = 0>\n"; - out << "_CCCL_DEVICE void __atomic_load_cuda(const " << cv << "_Type *__ptr, _Type *__ret, int __memorder, " - << scopenametag(s.first) << ") {\n"; - out << " uint" << sz << "_t __tmp = 0;\n"; - out << " NV_DISPATCH_TARGET(\n"; - out << " NV_PROVIDES_SM_70, (\n"; - out << " switch (__memorder) {\n"; - out << " case __ATOMIC_SEQ_CST: " << fencename("sc"s, s.first) << "(); _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQUIRE: __cuda_load_acquire_" << sz << "_" << s.first - << "(__ptr, __tmp); break;\n"; - out << " case __ATOMIC_RELAXED: __cuda_load_relaxed_" << sz << "_" << s.first - << "(__ptr, __tmp); break;\n"; - out << " default: assert(0);\n"; - out << " }\n"; - out << " ),\n"; - out << " NV_IS_DEVICE, (\n"; - out << " switch (__memorder) {\n"; - out << " case __ATOMIC_SEQ_CST: __cuda_membar_" << s.first << "(); _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQUIRE: __cuda_load_volatile_" << sz << "_" << s.first - << "(__ptr, __tmp); __cuda_membar_" << s.first << "(); break;\n"; - out << " case __ATOMIC_RELAXED: __cuda_load_volatile_" << sz << "_" << s.first - << "(__ptr, __tmp); break;\n"; - out << " default: assert(0);\n"; - out << " }\n"; - out << " )\n"; - out << " )\n"; - out << " memcpy(__ret, &__tmp, " << sz / 8 << ");\n"; - out << "}\n"; - } - } - for (auto& sz : st_sizes) - { - for (auto& sem : st_semantics) - { - out << "template "; - out << "static inline _CCCL_DEVICE void __cuda_store_" << sem.first << "_" << sz << "_" << s.first - << "(_CUDA_A __ptr, _CUDA_B __src) { "; - out << "asm volatile(\"st" << sem.second << (sem.first == "volatile" ? 
"" : s.second.c_str()) << ".b" << sz - << " [%0], %1;\" :: "; - out << "\"l\"(__ptr),\"" << registers("b", sz) << "\"(__src)"; - out << " : \"memory\"); }\n"; - } - for (auto& cv : cv_qualifier) - { - out << "template = 0>\n"; - out << "_CCCL_DEVICE void __atomic_store_cuda(" << cv << "_Type *__ptr, _Type *__val, int __memorder, " - << scopenametag(s.first) << ") {\n"; - out << " uint" << sz << "_t __tmp = 0;\n"; - out << " memcpy(&__tmp, __val, " << sz / 8 << ");\n"; - out << " NV_DISPATCH_TARGET(\n"; - out << " NV_PROVIDES_SM_70, (\n"; - out << " switch (__memorder) {\n"; - out << " case __ATOMIC_RELEASE: __cuda_store_release_" << sz << "_" << s.first - << "(__ptr, __tmp); break;\n"; - out << " case __ATOMIC_SEQ_CST: " << fencename("sc"s, s.first) << "(); _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_RELAXED: __cuda_store_relaxed_" << sz << "_" << s.first - << "(__ptr, __tmp); break;\n"; - out << " default: assert(0);\n"; - out << " }\n"; - out << " ),\n"; - out << " NV_IS_DEVICE, (\n"; - out << " switch (__memorder) {\n"; - out << " case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_SEQ_CST: __cuda_membar_" << s.first << "(); _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_RELAXED: __cuda_store_volatile_" << sz << "_" << s.first - << "(__ptr, __tmp); break;\n"; - out << " default: assert(0);\n"; - out << " }\n"; - out << " )\n"; - out << " )\n"; - out << "}\n"; - } - } - for (auto& sz : rmw_sizes) - { - for (auto& cl : rmw_classes) - { - for (auto& rmw : rmw_operations[cl]) - { - for (auto& type : rmw_types[cl]) - { - // fetch_min/fetch_max for fp types are derived functions - if (type.first == "f" && (rmw.first == "fetch_max" || rmw.first == "fetch_min")) - { - continue; - } - if (type.first == "s" - && (rmw.first == "fetch_add" || rmw.first == "fetch_sub" || rmw.first == "compare_exchange" - || rmw.first == "exchange")) - { - continue; - } - for (auto& sem : rmw_semantics) - { - if (rmw.first == "compare_exchange") - { - out << "template "; - } - else - { - out << "template "; - } - out << "static inline _CCCL_DEVICE void __cuda_" << rmw.first << "_" << sem.first << "_" << type.first - << sz << "_" << s.first << "("; - if (rmw.first == "compare_exchange") - { - out << "_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op"; - } - else - { - out << "_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op"; - } - out << ") { "; - if (rmw.first == "fetch_sub") - { - out << "__op = -__op;" << std::endl; - } - if (rmw.first == "compare_exchange") - { - out << "asm volatile(\"atom" << rmw.second << sem.second << s.second << ".b" << sz << " "; - out << "%0,[%1],%2,%3"; - } - else if (rmw.first == "exchange") - { - out << "asm volatile(\"atom" << rmw.second << sem.second << s.second << ".b" << sz << " "; - out << "%0,[%1],%2"; - } - else - { - out << "asm volatile(\"atom" << rmw.second << sem.second << s.second << type.second << sz << " "; - out << "%0,[%1],%2"; - } - out << ";\" : "; - if (rmw.first == "compare_exchange") - { - out << "\"=" << registers(type.first, sz) << "\"(__dst) : \"l\"(__ptr),\"" << registers(type.first, sz) - << "\"(__cmp),\"" << registers(type.first, sz) << "\"(__op)"; - } - else - { - out << "\"=" << registers(type.first, sz) << "\"(__dst) : \"l\"(__ptr),\"" << registers(type.first, sz) - << "\"(__op)"; - } - out << " : \"memory\"); }\n"; - } - for (auto& cv : cv_qualifier) - { - out << "template = 0>\n"; - } - else if (rmw.first == "fetch_max" || rmw.first == "fetch_min") - { - if (type.first == "u") - { - out << " && _CCCL_TRAIT(is_integral, 
_Type) && _CCCL_TRAIT(is_unsigned, _Type), int> " - "= 0>\n"; - } - else if (type.first == "s") - { - out << " && _CCCL_TRAIT(is_integral, _Type) && _CCCL_TRAIT(is_signed, _Type), int> = " - "0>\n"; - } - } - else if (type.first == "u") - { - out << " && (_CCCL_TRAIT(is_integral, _Type) || _CCCL_TRAIT(is_pointer, _Type)), int> = 0>\n"; - } - else - { - out << ", int> = 0>\n"; - } - if (rmw.first == "compare_exchange") - { - out << "_CCCL_DEVICE bool __atomic_compare_exchange_cuda(" << cv - << "void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int " - "__failure_memorder, " - << scopenametag(s.first) << ") {\n"; - out << " auto __old = *__expected;\n"; - out << " NV_DISPATCH_TARGET(\n"; - out << " NV_PROVIDES_SM_70, (\n"; - out << " switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) {\n"; - out << " case __ATOMIC_SEQ_CST: " << fencename("sc"s, s.first) << "(); _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_" << type.first << sz << "_" - << s.first << "(__ptr, *__expected, __old, __desired); break;\n"; - out << " case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_" << type.first << sz << "_" - << s.first << "(__ptr, *__expected, __old, __desired); break;\n"; - out << " case __ATOMIC_RELEASE: __cuda_compare_exchange_release_" << type.first << sz << "_" - << s.first << "(__ptr, *__expected, __old, __desired); break;\n"; - out << " case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_" << type.first << sz << "_" - << s.first << "(__ptr, *__expected, __old, __desired); break;\n"; - out << " default: assert(0);\n"; - out << " }\n"; - out << " ),\n"; - out << " NV_IS_DEVICE, (\n"; - out << " switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) {\n"; - out << " case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQ_REL: __cuda_membar_" << s.first << "(); _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_" << type.first << sz << "_" - << s.first << "(__ptr, *__expected, __old, __desired); __cuda_membar_" << s.first << "(); break;\n"; - out << " case __ATOMIC_RELEASE: __cuda_membar_" << s.first - << "(); __cuda_compare_exchange_volatile_" << type.first << sz << "_" << s.first - << "(__ptr, *__expected, __old, __desired); break;\n"; - out << " case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_" << type.first << sz << "_" - << s.first << "(__ptr, *__expected, __old, __desired); break;\n"; - out << " default: assert(0);\n"; - out << " }\n"; - out << " )\n"; - out << " )\n"; - out << " return (__old == *__expected);\n"; - out << "}\n"; - } - else - { - if (rmw.first == "exchange") - { - out - << "_CCCL_DEVICE void __atomic_exchange_cuda(" << cv - << "void *__ptr, _Type *__val, _Type *__ret, int __memorder, " << scopenametag(s.first) << ") {\n"; - out << " _Type __tmp = *__val;\n"; - } - else - { - out << "_CCCL_DEVICE _Type __atomic_" << rmw.first << "_cuda(" << cv - << "_Type *__ptr, _Type __val, int __memorder, " << scopenametag(s.first) << ") {\n"; - out << " _Type __tmp = __val;\n"; - } - out << " NV_DISPATCH_TARGET(\n"; - out << " NV_PROVIDES_SM_70, (\n"; - out << " switch (__memorder) {\n"; - out << " case __ATOMIC_SEQ_CST: " << fencename("sc"s, s.first) << "(); _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQUIRE: __cuda_" << 
rmw.first << "_acquire_" << type.first << sz << "_" - << s.first << "(__ptr, __tmp, __tmp); break;\n"; - out << " case __ATOMIC_ACQ_REL: __cuda_" << rmw.first << "_acq_rel_" << type.first << sz << "_" - << s.first << "(__ptr, __tmp, __tmp); break;\n"; - out << " case __ATOMIC_RELEASE: __cuda_" << rmw.first << "_release_" << type.first << sz << "_" - << s.first << "(__ptr, __tmp, __tmp); break;\n"; - out << " case __ATOMIC_RELAXED: __cuda_" << rmw.first << "_relaxed_" << type.first << sz << "_" - << s.first << "(__ptr, __tmp, __tmp); break;\n"; - out << " default: assert(0);\n"; - out << " }\n"; - out << " ),\n"; - out << " NV_IS_DEVICE, (\n"; - out << " switch (__memorder) {\n"; - out << " case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQ_REL: __cuda_membar_" << s.first << "(); _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQUIRE: __cuda_" << rmw.first << "_volatile_" << type.first << sz - << "_" << s.first << "(__ptr, __tmp, __tmp); __cuda_membar_" << s.first << "(); break;\n"; - out << " case __ATOMIC_RELEASE: __cuda_membar_" << s.first << "(); __cuda_" << rmw.first - << "_volatile_" << type.first << sz << "_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; - out << " case __ATOMIC_RELAXED: __cuda_" << rmw.first << "_volatile_" << type.first << sz - << "_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; - out << " default: assert(0);\n"; - out << " }\n"; - out << " )\n"; - out << " )\n"; - if (rmw.first == "exchange") - { - out << " memcpy(__ret, &__tmp, " << sz / 8 << ");\n"; - } - else - { - out << " return __tmp;\n"; - } - out << "}\n"; - } - } - } - } - } - } - for (auto& cv : cv_qualifier) - { - std::vector addsub{"add", "sub"}; - for (auto& op : addsub) - { - out << "template\n"; - out << "_CCCL_DEVICE _Type* __atomic_fetch_" << op << "_cuda(_Type *" << cv - << "*__ptr, ptrdiff_t __val, int __memorder, " << scopenametag(s.first) << ") {\n"; - out << " _Type* __ret;\n"; - out << " uint64_t __tmp = 0;\n"; - out << " memcpy(&__tmp, &__val, 8);\n"; - if (op == "sub") - { - out << " __tmp = -__tmp;\n"; - } - out << " __tmp *= sizeof(_Type);\n"; - out << " NV_DISPATCH_TARGET(\n"; - out << " NV_PROVIDES_SM_70, (\n"; - out << " switch (__memorder) {\n"; - out << " case __ATOMIC_SEQ_CST: " << fencename("sc"s, s.first) << "(); _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_" << s.first - << "(__ptr, __tmp, __tmp); break;\n"; - out << " case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_" << s.first - << "(__ptr, __tmp, __tmp); break;\n"; - out << " case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_" << s.first - << "(__ptr, __tmp, __tmp); break;\n"; - out << " case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_" << s.first - << "(__ptr, __tmp, __tmp); break;\n"; - out << " }\n"; - out << " ),\n"; - out << " NV_IS_DEVICE, (\n"; - out << " switch (__memorder) {\n"; - out << " case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQ_REL: __cuda_membar_" << s.first << "(); _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();\n"; - out << " case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_" << s.first - << "(__ptr, __tmp, __tmp); __cuda_membar_" << s.first << "(); break;\n"; - out << " case __ATOMIC_RELEASE: __cuda_membar_" << s.first << "(); __cuda_fetch_add_volatile_u64_" - << s.first << "(__ptr, __tmp, __tmp); break;\n"; - out << " case 
__ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_" << s.first - << "(__ptr, __tmp, __tmp); break;\n"; - out << " default: assert(0);\n"; - out << " }\n"; - out << " )\n"; - out << " )\n"; - out << " memcpy(&__ret, &__tmp, 8);\n"; - out << " return __ret;\n"; - out << "}\n"; - } - } + filestream.open(argv[1], filestream.out); } - out << "\n#endif // defined(_CCCL_CUDA_COMPILER)\n"; - out << "\n_LIBCUDACXX_END_NAMESPACE_STD\n"; - out << "\n#endif // _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H\n"; - out << "\n// clang-format on\n"; + std::ostream& stream = filestream.is_open() ? filestream : std::cout; + + FormatHeader(stream); + FormatFence(stream); + FormatLoad(stream); + FormatStore(stream); + FormatCompareAndSwap(stream); + FormatExchange(stream); + FormatFetchOps(stream); + FormatTail(stream); return 0; } diff --git a/libcudacxx/codegen/generators/compare_and_swap.h b/libcudacxx/codegen/generators/compare_and_swap.h new file mode 100644 index 0000000000..5a970735c0 --- /dev/null +++ b/libcudacxx/codegen/generators/compare_and_swap.h @@ -0,0 +1,175 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef COMPARED_AND_SWAP_H +#define COMPARED_AND_SWAP_H + +#include + +#include "definitions.h" +#include + +inline void FormatCompareAndSwap(std::ostream& out) +{ + out << R"XXX( +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_swap_memory_order_dispatch(_Fn& __cuda_cas, int __success_memorder, int __failure_memorder, _Sco) { + bool __res = false; + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { + case __ATOMIC_SEQ_CST: __cuda_atomic_fence(_Sco{}, __atomic_cuda_seq_cst{}); _CCCL_FALLTHROUGH(); + case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQUIRE: __res = __cuda_cas(__atomic_cuda_acquire{}); break; + case __ATOMIC_ACQ_REL: __res = __cuda_cas(__atomic_cuda_acq_rel{}); break; + case __ATOMIC_RELEASE: __res = __cuda_cas(__atomic_cuda_release{}); break; + case __ATOMIC_RELAXED: __res = __cuda_cas(__atomic_cuda_relaxed{}); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { + case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQ_REL: __cuda_atomic_membar(_Sco{}); _CCCL_FALLTHROUGH(); + case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQUIRE: __res = __cuda_cas(__atomic_cuda_volatile{}); __cuda_atomic_membar(_Sco{}); break; + case __ATOMIC_RELEASE: __cuda_atomic_membar(_Sco{}); __res = __cuda_cas(__atomic_cuda_volatile{}); break; + case __ATOMIC_RELAXED: __res = __cuda_cas(__atomic_cuda_volatile{}); break; + default: assert(0); + } + ) + ) + return __res; +} +)XXX"; + + // Argument ID Reference + // 0 - Operand Type + // 1 - Operand Size + // 2 - Type Constraint + // 3 - Memory Order + // 4 - Memory Order function tag + // 5 - Scope Constraint + // 6 - Scope function tag + const std::string asm_intrinsic_format_128 = R"XXX( +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type 
__op, {4}, __atomic_cuda_operand_{0}{1}, {6})
+{{
+  asm volatile(R"YYY(
+.reg .b128 _d;
+.reg .b128 _v;
+mov.b128 _d, {{%3, %4}};
+mov.b128 _v, {{%5, %6}};
+atom.cas{3}{5}.b128 _d,[%2],_d,_v;
+mov.b128 {{%0, %1}}, _d;
+)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }})XXX";
+
+  const std::string asm_intrinsic_format = R"XXX(
+template <class _Type>
+static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
+  _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, {4}, __atomic_cuda_operand_{0}{1}, {6})
+{{ asm volatile("atom.cas{3}{5}.{0}{1} %0,[%1],%2,%3;" : "={2}"(__dst) : "l"(__ptr), "{2}"(__cmp), "{2}"(__op) : "memory"); return __dst == __cmp; }})XXX";
+
+  constexpr Operand supported_types[] = {
+    Operand::Bit,
+  };
+
+  constexpr size_t supported_sizes[] = {
+    16,
+    32,
+    64,
+    128,
+  };
+
+  constexpr Semantic supported_semantics[] = {
+    Semantic::Acquire,
+    Semantic::Relaxed,
+    Semantic::Release,
+    Semantic::Acq_Rel,
+    Semantic::Volatile,
+  };
+
+  constexpr Scope supported_scopes[] = {
+    Scope::CTA,
+    Scope::Cluster,
+    Scope::GPU,
+    Scope::System,
+  };
+
+  for (auto size : supported_sizes)
+  {
+    for (auto type : supported_types)
+    {
+      for (auto sem : supported_semantics)
+      {
+        for (auto sco : supported_scopes)
+        {
+          if (size == 2 && type != Operand::Bit)
+          {
+            continue;
+          }
+          if (size == 128 && type != Operand::Bit)
+          {
+            continue;
+          }
+          out << fmt::format(
+            (size == 128) ? asm_intrinsic_format_128 : asm_intrinsic_format,
+            operand(type),
+            size,
+            constraints(type, size),
+            semantic(sem),
+            semantic_tag(sem),
+            scope(sco),
+            scope_tag(sco));
+        }
+      }
+    }
+  }
+
+  out << "\n"
+      << R"XXX(
+template <class _Type, class _Tag, class _Sco>
+struct __cuda_atomic_bind_compare_exchange {
+  _Type* __ptr;
+  _Type* __exp;
+  _Type* __des;
+
+  template <class _Atomic_Memorder>
+  inline _CCCL_DEVICE bool operator()(_Atomic_Memorder) {
+    return __cuda_atomic_compare_exchange(__ptr, *__exp, *__exp, *__des, _Atomic_Memorder{}, _Tag{}, _Sco{});
+  }
+};
+template <class _Type, class _Sco>
+static inline _CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type* __ptr, _Type* __exp, _Type __des, bool, int __success_memorder, int __failure_memorder, _Sco)
+{
+  using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type;
+  using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag;
+  __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr);
+  __proxy_t* __exp_proxy = reinterpret_cast<__proxy_t*>(__exp);
+  __proxy_t* __des_proxy = reinterpret_cast<__proxy_t*>(&__des);
+  __cuda_atomic_bind_compare_exchange<__proxy_t, __proxy_tag, _Sco> __bound_compare_swap{__ptr_proxy, __exp_proxy, __des_proxy};
+  return __cuda_atomic_compare_swap_memory_order_dispatch(__bound_compare_swap, __success_memorder, __failure_memorder, _Sco{});
+}
+template <class _Type, class _Sco>
+static inline _CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type volatile* __ptr, _Type* __exp, _Type __des, bool, int __success_memorder, int __failure_memorder, _Sco)
+{
+  using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type;
+  using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag;
+  __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr));
+  __proxy_t* __exp_proxy = reinterpret_cast<__proxy_t*>(__exp);
+  __proxy_t* __des_proxy = reinterpret_cast<__proxy_t*>(&__des);
+  __cuda_atomic_bind_compare_exchange<__proxy_t, __proxy_tag, _Sco> __bound_compare_swap{__ptr_proxy, __exp_proxy, __des_proxy};
+  return
__cuda_atomic_compare_swap_memory_order_dispatch(__bound_compare_swap, __success_memorder, __failure_memorder, _Sco{}); +} +)XXX"; +} + +#endif // COMPARED_AND_SWAP_H diff --git a/libcudacxx/codegen/generators/definitions.h b/libcudacxx/codegen/generators/definitions.h new file mode 100644 index 0000000000..0944a7e78a --- /dev/null +++ b/libcudacxx/codegen/generators/definitions.h @@ -0,0 +1,193 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef DEFINITIONS_H +#define DEFINITIONS_H + +#include +#include +#include +#include + +#include + +enum class Mmio +{ + Disabled, + Enabled, +}; + +inline std::string mmio(Mmio m) +{ + static const char* mmio_map[]{ + "", + ".mmio", + }; + return mmio_map[std::underlying_type_t(m)]; +} + +inline std::string mmio_tag(Mmio m) +{ + static const char* mmio_map[]{ + "__atomic_cuda_mmio_disable", + "__atomic_cuda_mmio_enable", + }; + return mmio_map[std::underlying_type_t(m)]; +} + +enum class Operand +{ + Floating, + Unsigned, + Signed, + Bit, +}; + +inline std::string operand(Operand op) +{ + static std::map op_map = { + std::pair{Operand::Floating, "f"}, + std::pair{Operand::Unsigned, "u"}, + std::pair{Operand::Signed, "s"}, + std::pair{Operand::Bit, "b"}, + }; + return op_map[op]; +} + +inline std::string operand_proxy_type(Operand op, size_t sz) +{ + if (op == Operand::Floating) + { + if (sz == 32) + { + return {"float"}; + } + else + { + return {"double"}; + } + } + else if (op == Operand::Signed) + { + return fmt::format("int{}_t", sz); + } + // Binary and unsigned can be the same proxy_type + return fmt::format("uint{}_t", sz); +} + +inline std::string constraints(Operand op, size_t sz) +{ + static std::map constraint_map = { + std::pair{32, + std::map{ + std::pair{Operand::Bit, "r"}, + std::pair{Operand::Unsigned, "r"}, + std::pair{Operand::Signed, "r"}, + std::pair{Operand::Floating, "f"}, + }}, + std::pair{64, + std::map{ + std::pair{Operand::Bit, "l"}, + std::pair{Operand::Unsigned, "l"}, + std::pair{Operand::Signed, "l"}, + std::pair{Operand::Floating, "d"}, + }}, + std::pair{128, + std::map{ + std::pair{Operand::Bit, "l"}, + std::pair{Operand::Unsigned, "l"}, + std::pair{Operand::Signed, "l"}, + std::pair{Operand::Floating, "d"}, + }}, + }; + + if (sz == 16) + { + return {"h"}; + } + else + { + return constraint_map[sz][op]; + } +} + +enum class Semantic +{ + Relaxed, + Release, + Acquire, + Acq_Rel, + Seq_Cst, + Volatile, +}; + +inline std::string semantic(Semantic sem) +{ + static std::map sem_map = { + std::pair{Semantic::Relaxed, ".relaxed"}, + std::pair{Semantic::Release, ".release"}, + std::pair{Semantic::Acquire, ".acquire"}, + std::pair{Semantic::Acq_Rel, ".acq_rel"}, + std::pair{Semantic::Seq_Cst, ".sc"}, + std::pair{Semantic::Volatile, ""}, + }; + return sem_map[sem]; +} + +inline std::string semantic_tag(Semantic sem) +{ + static std::map sem_map = { + std::pair{Semantic::Relaxed, "__atomic_cuda_relaxed"}, + std::pair{Semantic::Release, "__atomic_cuda_release"}, + std::pair{Semantic::Acquire, "__atomic_cuda_acquire"}, + std::pair{Semantic::Acq_Rel, "__atomic_cuda_acq_rel"}, + 
std::pair{Semantic::Seq_Cst, "__atomic_cuda_seq_cst"}, + std::pair{Semantic::Volatile, "__atomic_cuda_volatile"}, + }; + return sem_map[sem]; +} + +enum class Scope +{ + Thread, + Warp, + CTA, + Cluster, + GPU, + System, +}; + +inline std::string scope(Scope sco) +{ + static std::map sco_map = { + std::pair{Scope::Thread, ""}, + std::pair{Scope::Warp, ""}, + std::pair{Scope::CTA, ".cta"}, + std::pair{Scope::Cluster, ".cluster"}, + std::pair{Scope::GPU, ".gpu"}, + std::pair{Scope::System, ".sys"}, + }; + return sco_map[sco]; +} + +inline std::string scope_tag(Scope sco) +{ + static std::map sco_map = { + std::pair{Scope::Thread, "__thread_scope_thread_tag"}, + std::pair{Scope::Warp, ""}, + std::pair{Scope::CTA, "__thread_scope_block_tag"}, + std::pair{Scope::Cluster, "__thread_scope_cluster_tag"}, + std::pair{Scope::GPU, "__thread_scope_device_tag"}, + std::pair{Scope::System, "__thread_scope_system_tag"}, + }; + return sco_map[sco]; +} + +#endif // DEFINITIONS_H diff --git a/libcudacxx/codegen/generators/exchange.h b/libcudacxx/codegen/generators/exchange.h new file mode 100644 index 0000000000..dcfe66f147 --- /dev/null +++ b/libcudacxx/codegen/generators/exchange.h @@ -0,0 +1,173 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef EXCHANGE_H +#define EXCHANGE_H + +#include + +#include "definitions.h" +#include + +inline void FormatExchange(std::ostream& out) +{ + out << R"XXX( +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange_memory_order_dispatch(_Fn& __cuda_exch, int __memorder, _Sco) { + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_atomic_fence(_Sco{}, __atomic_cuda_seq_cst{}); _CCCL_FALLTHROUGH(); + case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQUIRE: __cuda_exch(__atomic_cuda_acquire{}); break; + case __ATOMIC_ACQ_REL: __cuda_exch(__atomic_cuda_acq_rel{}); break; + case __ATOMIC_RELEASE: __cuda_exch(__atomic_cuda_release{}); break; + case __ATOMIC_RELAXED: __cuda_exch(__atomic_cuda_relaxed{}); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQ_REL: __cuda_atomic_membar(_Sco{}); _CCCL_FALLTHROUGH(); + case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQUIRE: __cuda_exch(__atomic_cuda_volatile{}); __cuda_atomic_membar(_Sco{}); break; + case __ATOMIC_RELEASE: __cuda_atomic_membar(_Sco{}); __cuda_exch(__atomic_cuda_volatile{}); break; + case __ATOMIC_RELAXED: __cuda_exch(__atomic_cuda_volatile{}); break; + default: assert(0); + } + ) + ) +} +)XXX"; + + // Argument ID Reference + // 0 - Operand Type + // 1 - Operand Size + // 2 - Type Constraint + // 3 - Memory Order + // 4 - Memory Order function tag + // 5 - Scope Constraint + // 6 - Scope function tag + const std::string asm_intrinsic_format_128 = R"XXX( +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, {4}, __atomic_cuda_operand_{0}{1}, {6}) +{{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {{%3, %4}}, _v; + 
atom.exch{3}{5}.b128 _d,[%2],_v; + mov.b128 _d, {{%0, %1}}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +}})XXX"; + + const std::string asm_intrinsic_format = R"XXX( +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, {4}, __atomic_cuda_operand_{0}{1}, {6}) +{{ asm volatile("atom.exch{3}{5}.{0}{1} %0,[%1],%2;" : "={2}"(__old) : "l"(__ptr), "{2}"(__new) : "memory"); }})XXX"; + + constexpr Operand supported_types[] = { + Operand::Bit, + }; + + constexpr size_t supported_sizes[] = { + 16, + 32, + 64, + 128, + }; + + constexpr Semantic supported_semantics[] = { + Semantic::Acquire, + Semantic::Relaxed, + Semantic::Release, + Semantic::Acq_Rel, + Semantic::Volatile, + }; + + constexpr Scope supported_scopes[] = { + Scope::CTA, + Scope::Cluster, + Scope::GPU, + Scope::System, + }; + + for (auto size : supported_sizes) + { + for (auto type : supported_types) + { + for (auto sem : supported_semantics) + { + for (auto sco : supported_scopes) + { + if (size == 2 && type != Operand::Bit) + { + continue; + } + if (size == 128 && type != Operand::Bit) + { + continue; + } + out << fmt::format( + (size == 128) ? asm_intrinsic_format_128 : asm_intrinsic_format, + operand(type), + size, + constraints(type, size), + semantic(sem), + semantic_tag(sem), + scope(sco), + scope_tag(sco)); + } + } + } + } + + out << "\n" + << R"XXX( +template +struct __cuda_atomic_bind_exchange { + _Type* __ptr; + _Type* __old; + _Type* __new; + + template + inline _CCCL_DEVICE void operator()(_Atomic_Memorder) { + __cuda_atomic_exchange(__ptr, *__old, *__new, _Atomic_Memorder{}, _Tag{}, _Sco{}); + } +}; +template +static inline _CCCL_DEVICE void __atomic_exchange_cuda(_Type* __ptr, _Type& __old, _Type __new, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __old_proxy = reinterpret_cast<__proxy_t*>(&__old); + __proxy_t* __new_proxy = reinterpret_cast<__proxy_t*>(&__new); + __cuda_atomic_bind_exchange<__proxy_t, __proxy_tag, _Sco> __bound_swap{__ptr_proxy, __old_proxy, __new_proxy}; + __cuda_atomic_exchange_memory_order_dispatch(__bound_swap, __memorder, _Sco{}); +} +template +static inline _CCCL_DEVICE void __atomic_exchange_cuda(_Type volatile* __ptr, _Type& __old, _Type __new, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __old_proxy = reinterpret_cast<__proxy_t*>(&__old); + __proxy_t* __new_proxy = reinterpret_cast<__proxy_t*>(&__new); + __cuda_atomic_bind_exchange<__proxy_t, __proxy_tag, _Sco> __bound_swap{__ptr_proxy, __old_proxy, __new_proxy}; + __cuda_atomic_exchange_memory_order_dispatch(__bound_swap, __memorder, _Sco{}); +} +)XXX"; +} + +#endif // EXCHANGE_H diff --git a/libcudacxx/codegen/generators/fence.h b/libcudacxx/codegen/generators/fence.h new file mode 100644 index 0000000000..073264b7e3 --- /dev/null +++ b/libcudacxx/codegen/generators/fence.h @@ -0,0 +1,110 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef FENCE_H +#define FENCE_H + +#include + +#include "definitions.h" +#include + +inline std::string membar_scope(Scope sco) +{ + static std::map scope_map{ + std::pair{Scope::GPU, ".gl"}, + std::pair{Scope::System, ".sys"}, + std::pair{Scope::CTA, ".cta"}, + }; + + return scope_map[sco]; +} + +inline void FormatFence(std::ostream& out) +{ + // Argument ID Reference + // 0 - Membar scope tag + // 1 - Membar scope + const std::string intrinsic_membar = R"XXX( +static inline _CCCL_DEVICE void __cuda_atomic_membar({0}) +{{ asm volatile("membar{1};" ::: "memory"); }})XXX"; + + const std::map membar_scopes{ + std::pair{Scope::GPU, ".gl"}, + std::pair{Scope::System, ".sys"}, + std::pair{Scope::CTA, ".cta"}, + }; + + for (const auto& sco : membar_scopes) + { + out << fmt::format(intrinsic_membar, scope_tag(sco.first), sco.second); + } + + // Argument ID Reference + // 0 - Fence scope tag + // 1 - Fence scope + // 2 - Fence order tag + // 3 - Fence order + const std::string intrinsic_fence = R"XXX( +static inline _CCCL_DEVICE void __cuda_atomic_fence({0}, {2}) +{{ asm volatile("fence{1}{3};" ::: "memory"); }})XXX"; + + const Scope fence_scopes[] = { + Scope::CTA, + Scope::Cluster, + Scope::GPU, + Scope::System, + }; + + const Semantic fence_semantics[] = { + Semantic::Acq_Rel, + Semantic::Seq_Cst, + }; + + for (const auto& sco : fence_scopes) + { + for (const auto& sem : fence_semantics) + { + out << fmt::format(intrinsic_fence, scope_tag(sco), scope(sco), semantic_tag(sem), semantic(sem)); + } + } + out << "\n" + << R"XXX( +template +static inline _CCCL_DEVICE void __atomic_thread_fence_cuda(int __memorder, _Sco) { + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_atomic_fence(_Sco{}, __atomic_cuda_seq_cst{}); break; + case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQUIRE: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQ_REL: _CCCL_FALLTHROUGH(); + case __ATOMIC_RELEASE: __cuda_atomic_fence(_Sco{}, __atomic_cuda_acq_rel{}); break; + case __ATOMIC_RELAXED: break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); + case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQUIRE: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQ_REL: _CCCL_FALLTHROUGH(); + case __ATOMIC_RELEASE: __cuda_atomic_membar(_Sco{}); break; + case __ATOMIC_RELAXED: break; + default: assert(0); + } + ) + ) +} +)XXX"; +} + +#endif // FENCE_H diff --git a/libcudacxx/codegen/generators/fetch_ops.h b/libcudacxx/codegen/generators/fetch_ops.h new file mode 100644 index 0000000000..8ce48b5e78 --- /dev/null +++ b/libcudacxx/codegen/generators/fetch_ops.h @@ -0,0 +1,217 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef FETCH_OPS_H +#define FETCH_OPS_H + +#include +#include + +#include "definitions.h" +#include + +inline std::string fetch_op_skip_v(std::string fetch_op) +{ + if (fetch_op == "add") + { + return "constexpr auto __skip_v = __atomic_ptr_skip_t<_Type>::__skip;"; + } + return "constexpr auto __skip_v = 1;"; +} + +inline void FormatFetchOps(std::ostream& out) +{ + const std::vector arithmetic_types = { + Operand::Floating, + Operand::Unsigned, + Operand::Signed, + }; + + const std::vector minmax_types = { + Operand::Unsigned, + Operand::Signed, + }; + + const std::vector bitwise_types = {Operand::Bit}; + + const std::map op_support_map{ + std::pair{std::string{"add"}, std::pair{arithmetic_types, std::string{"arithmetic"}}}, + std::pair{std::string{"min"}, std::pair{minmax_types, std::string{"minmax"}}}, + std::pair{std::string{"max"}, std::pair{minmax_types, std::string{"minmax"}}}, + std::pair{std::string{"or"}, std::pair{bitwise_types, std::string{"bitwise"}}}, + std::pair{std::string{"xor"}, std::pair{bitwise_types, std::string{"bitwise"}}}, + std::pair{std::string{"and"}, std::pair{bitwise_types, std::string{"bitwise"}}}, + }; + + // Memory order dispatcher + out << R"XXX( +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_memory_order_dispatch(_Fn& __cuda_fetch, int __memorder, _Sco) { + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_atomic_fence(_Sco{}, __atomic_cuda_seq_cst{}); _CCCL_FALLTHROUGH(); + case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQUIRE: __cuda_fetch(__atomic_cuda_acquire{}); break; + case __ATOMIC_ACQ_REL: __cuda_fetch(__atomic_cuda_acq_rel{}); break; + case __ATOMIC_RELEASE: __cuda_fetch(__atomic_cuda_release{}); break; + case __ATOMIC_RELAXED: __cuda_fetch(__atomic_cuda_relaxed{}); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQ_REL: __cuda_atomic_membar(_Sco{}); _CCCL_FALLTHROUGH(); + case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQUIRE: __cuda_fetch(__atomic_cuda_volatile{}); __cuda_atomic_membar(_Sco{}); break; + case __ATOMIC_RELEASE: __cuda_atomic_membar(_Sco{}); __cuda_fetch(__atomic_cuda_volatile{}); break; + case __ATOMIC_RELAXED: __cuda_fetch(__atomic_cuda_volatile{}); break; + default: assert(0); + } + ) + ) +} +)XXX"; + + // Argument ID Reference + // 0 - Atomic Operation + // 1 - Operand Type + // 2 - Operand Size + // 3 - Type Constraint + // 4 - Memory Order + // 5 - Memory Order function tag + // 6 - Scope Constraint + // 7 - Scope function tag + const std::string asm_intrinsic_format = R"XXX( +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_{0}( + _Type* __ptr, _Type& __dst, _Type __op, {5}, __atomic_cuda_operand_{1}{2}, {7}) +{{ asm volatile("atom.{0}{4}{6}.{1}{2} %0,[%1],%2;" : "={3}"(__dst) : "l"(__ptr), "{3}"(__op) : "memory"); }})XXX"; + + // 0 - Atomic Operation + // 1 - Operand type constraint + // 2 - Pointer op skip_v + const std::string fetch_bind_invoke = R"XXX( +template +struct __cuda_atomic_bind_fetch_{0} {{ + _Type* __ptr; + _Type* __dst; + _Type* __op; + + template + inline _CCCL_DEVICE void operator()(_Atomic_Memorder) {{ + __cuda_atomic_fetch_{0}(__ptr, *__dst, *__op, _Atomic_Memorder{{}}, _Tag{{}}, _Sco{{}}); + }} +}}; +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_{0}_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) 
+{{ + {2} + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_{1}<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_{1}<_Type>::__tag; + _Type __dst{{}}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_{0}<__proxy_t, __proxy_tag, _Sco> __bound_{0}{{__ptr_proxy, __dst_proxy, __op_proxy}}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_{0}, __memorder, _Sco{{}}); + return __dst; +}} +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_{0}_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{{ + {2} + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_{1}<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_{1}<_Type>::__tag; + _Type __dst{{}}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_{0}<__proxy_t, __proxy_tag, _Sco> __bound_{0}{{__ptr_proxy, __dst_proxy, __op_proxy}}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_{0}, __memorder, _Sco{{}}); + return __dst; +}} +)XXX"; + + constexpr size_t supported_sizes[] = { + 32, + 64, + }; + + constexpr Semantic supported_semantics[] = { + Semantic::Acquire, + Semantic::Relaxed, + Semantic::Release, + Semantic::Acq_Rel, + Semantic::Volatile, + }; + + constexpr Scope supported_scopes[] = { + Scope::CTA, + Scope::Cluster, + Scope::GPU, + Scope::System, + }; + + for (auto& op_kp : op_support_map) + { + const auto& op_name = op_kp.first; + const auto& op_type_kp = op_kp.second; + const auto& type_list = op_type_kp.first; + const auto& deduction = op_type_kp.second; + for (auto type : type_list) + { + for (auto size : supported_sizes) + { + const std::string proxy_type = operand_proxy_type(type, size); + for (auto sco : supported_scopes) + { + for (auto sem : supported_semantics) + { + // There is no atom.add.s64 + if (op_name == "add" && type == Operand::Signed && size == 64) + { + continue; + } + out << fmt::format( + asm_intrinsic_format, + /* 0 */ op_name, + /* 1 */ operand(type), + /* 2 */ size, + /* 3 */ constraints(type, size), + /* 4 */ semantic(sem), + /* 5 */ semantic_tag(sem), + /* 6 */ scope(sco), + /* 7 */ scope_tag(sco)); + } + } + } + } + out << "\n" << fmt::format(fetch_bind_invoke, op_name, deduction, fetch_op_skip_v(op_name)); + } + + out << R"XXX( +template +static inline _CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + return __atomic_fetch_add_cuda(__ptr, -__op, __memorder, _Sco{}); +} +template +static inline _CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{ + return __atomic_fetch_add_cuda(__ptr, -__op, __memorder, _Sco{}); +} +)XXX"; +} + +#endif // FETCH_OPS_H diff --git a/libcudacxx/codegen/generators/header.h b/libcudacxx/codegen/generators/header.h new file mode 100644 index 0000000000..39a848314b --- /dev/null +++ b/libcudacxx/codegen/generators/header.h @@ -0,0 +1,80 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef HEADER_H +#define HEADER_H + +#include + +inline void FormatHeader(std::ostream& out) +{ + std::string header = R"XXX( +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// This is an autogenerated file, we want to ensure that it contains exactly the contents we want to generate +// clang-format off + +#ifndef _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H +#define _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +#if defined(_CCCL_CUDA_COMPILER) +)XXX"; + + out << header; +} + +inline void FormatTail(std::ostream& out) +{ + std::string tail = R"XXX( +#endif // defined(_CCCL_CUDA_COMPILER) + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H + +// clang-format on +)XXX"; + + out << tail; +} + +#endif // HEADER_H diff --git a/libcudacxx/codegen/generators/ld_st.h b/libcudacxx/codegen/generators/ld_st.h new file mode 100644 index 0000000000..d4aec3da54 --- /dev/null +++ b/libcudacxx/codegen/generators/ld_st.h @@ -0,0 +1,353 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LD_ST_H
+#define LD_ST_H
+
+#include
+
+#include "definitions.h"
+#include
+
+inline std::string semantic_ld_st(Semantic sem)
+{
+  static std::map<Semantic, std::string> sem_map = {
+    std::pair{Semantic::Relaxed, ".relaxed"},
+    std::pair{Semantic::Release, ".release"},
+    std::pair{Semantic::Acquire, ".acquire"},
+    std::pair{Semantic::Volatile, ".volatile"},
+  };
+  return sem_map[sem];
+}
+
+inline std::string scope_ld_st(Semantic sem, Scope sco)
+{
+  if (sem == Semantic::Volatile)
+  {
+    return "";
+  }
+  return scope(sco);
+}
+
+inline void FormatLoad(std::ostream& out)
+{
+  out << R"XXX(
+template <class _Fn, class _Sco>
+static inline _CCCL_DEVICE void __cuda_atomic_load_memory_order_dispatch(_Fn &__cuda_load, int __memorder, _Sco) {
+  NV_DISPATCH_TARGET(
+    NV_PROVIDES_SM_70, (
+      switch (__memorder) {
+        case __ATOMIC_SEQ_CST: __cuda_atomic_fence(_Sco{}, __atomic_cuda_seq_cst{}); _CCCL_FALLTHROUGH();
+        case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
+        case __ATOMIC_ACQUIRE: __cuda_load(__atomic_cuda_acquire{}); break;
+        case __ATOMIC_RELAXED: __cuda_load(__atomic_cuda_relaxed{}); break;
+        default: assert(0);
+      }
+    ),
+    NV_IS_DEVICE, (
+      switch (__memorder) {
+        case __ATOMIC_SEQ_CST: __cuda_atomic_membar(_Sco{}); _CCCL_FALLTHROUGH();
+        case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
+        case __ATOMIC_ACQUIRE: __cuda_load(__atomic_cuda_volatile{}); __cuda_atomic_membar(_Sco{}); break;
+        case __ATOMIC_RELAXED: __cuda_load(__atomic_cuda_volatile{}); break;
+        default: assert(0);
+      }
+    )
+  )
+}
+)XXX";
+
+  // Argument ID Reference
+  // 0 - Operand Type
+  // 1 - Operand Size
+  // 2 - Constraint
+  // 3 - Memory order
+  // 4 - Memory order semantic
+  // 5 - Scope tag
+  // 6 - Scope semantic
+  // 7 - Mmio tag
+  // 8 - Mmio semantic
+  const std::string asm_intrinsic_format_128 = R"XXX(
+template <class _Type>
+static inline _CCCL_DEVICE void __cuda_atomic_load(
+  const _Type* __ptr, _Type& __dst, {3}, __atomic_cuda_operand_{0}{1}, {5}, {7})
+{{
+  asm volatile(R"YYY(
+  .reg .b128 _d;
+  ld{8}{4}{6}.b128 _d,[%2];
+  mov.b128 {{%0, %1}}, _d;
+)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
+}})XXX";
+  const std::string asm_intrinsic_format = R"XXX(
+template <class _Type>
+static inline _CCCL_DEVICE void __cuda_atomic_load(
+  const _Type* __ptr, _Type& __dst, {3}, __atomic_cuda_operand_{0}{1}, {5}, {7})
+{{ asm volatile("ld{8}{4}{6}.{0}{1} %0,[%1];" : "={2}"(__dst) : "l"(__ptr) : "memory"); }})XXX";
+
+  constexpr size_t supported_sizes[] = {
+    16,
+    32,
+    64,
+    128,
+  };
+
+  constexpr Operand supported_types[] = {
+    Operand::Bit,
+    Operand::Floating,
+    Operand::Unsigned,
+    Operand::Signed,
+  };
+
+  constexpr Semantic supported_semantics[] = {
+    Semantic::Acquire,
+    Semantic::Relaxed,
+    Semantic::Volatile,
+  };
+
+  constexpr Scope supported_scopes[] = {
+    Scope::CTA,
+    Scope::Cluster,
+    Scope::GPU,
+    Scope::System,
+  };
+
+  constexpr Mmio mmio_states[] = {
+    Mmio::Disabled,
+    Mmio::Enabled,
+  };
+
+  for (auto size : supported_sizes)
+  {
+    for (auto type : supported_types)
+    {
+      for (auto sem : supported_semantics)
+      {
+        for (auto sco : supported_scopes)
+        {
+          for (auto mm : mmio_states)
+          {
+            if (size == 16 && type == Operand::Floating)
+            {
+              continue;
+            }
+            if (size == 128 && type != Operand::Bit)
+            {
+              continue;
+            }
+            if ((mm == Mmio::Enabled) && ((sco != Scope::System) || (sem != Semantic::Relaxed)))
+            {
+              continue;
+            }
+            out << fmt::format(
+              (size == 128) ?
asm_intrinsic_format_128 : asm_intrinsic_format, + /* 0 */ operand(type), + /* 1 */ size, + /* 2 */ constraints(type, size), + /* 3 */ semantic_tag(sem), + /* 4 */ semantic_ld_st(sem), + /* 5 */ scope_tag(sco), + /* 6 */ scope_ld_st(sem, sco), + /* 7 */ mmio_tag(mm), + /* 8 */ mmio(mm)); + } + } + } + } + } + out << "\n" + << R"XXX( +template +struct __cuda_atomic_bind_load { + const _Type* __ptr; + _Type* __dst; + + template + inline _CCCL_DEVICE void operator()(_Atomic_Memorder) { + __cuda_atomic_load(__ptr, *__dst, _Atomic_Memorder{}, _Tag{}, _Sco{}, _Mmio{}); + } +}; +template +static inline _CCCL_DEVICE void __atomic_load_cuda(const _Type* __ptr, _Type& __dst, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + const __proxy_t* __ptr_proxy = reinterpret_cast(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __cuda_atomic_bind_load<__proxy_t, __proxy_tag, _Sco, __atomic_cuda_mmio_disable> __bound_load{__ptr_proxy, __dst_proxy}; + __cuda_atomic_load_memory_order_dispatch(__bound_load, __memorder, _Sco{}); +} +template +static inline _CCCL_DEVICE void __atomic_load_cuda(const _Type volatile* __ptr, _Type& __dst, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + const __proxy_t* __ptr_proxy = reinterpret_cast(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __cuda_atomic_bind_load<__proxy_t, __proxy_tag, _Sco, __atomic_cuda_mmio_disable> __bound_load{__ptr_proxy, __dst_proxy}; + __cuda_atomic_load_memory_order_dispatch(__bound_load, __memorder, _Sco{}); +} +)XXX"; +} + +inline void FormatStore(std::ostream& out) +{ + out << R"XXX( +template +static inline _CCCL_DEVICE void __cuda_atomic_store_memory_order_dispatch(_Fn &__cuda_store, int __memorder, _Sco) { + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_RELEASE: __cuda_store(__atomic_cuda_release{}); break; + case __ATOMIC_SEQ_CST: __cuda_atomic_fence(_Sco{}, __atomic_cuda_seq_cst{}); _CCCL_FALLTHROUGH(); + case __ATOMIC_RELAXED: __cuda_store(__atomic_cuda_relaxed{}); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); + case __ATOMIC_SEQ_CST: __cuda_atomic_membar(_Sco{}); _CCCL_FALLTHROUGH(); + case __ATOMIC_RELAXED: __cuda_store(__atomic_cuda_volatile{}); break; + default: assert(0); + } + ) + ) +} +)XXX"; + // Argument ID Reference + // 0 - Operand Type + // 1 - Operand Size + // 2 - Constraint + // 3 - Memory order + // 4 - Memory order semantic + // 5 - Scope tag + // 6 - Scope semantic + // 7 - Mmio tag + // 8 - Mmio semantic + const std::string asm_intrinsic_format_128 = R"XXX( +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, {3}, __atomic_cuda_operand_{0}{1}, {5}, {7}) +{{ + asm volatile(R"YYY( + .reg .b128 _v; + mov.b128 {{%1, %2}}, _v; + st{8}{4}{6}.b128 [%0],_v; +)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory"); +}})XXX"; + const std::string asm_intrinsic_format = R"XXX( +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, {3}, __atomic_cuda_operand_{0}{1}, {5}, {7}) +{{ asm volatile("st{8}{4}{6}.{0}{1} [%0],%1;" :: "l"(__ptr), "{2}"(__val) : "memory"); }})XXX"; + + constexpr size_t supported_sizes[] = { + 
16, + 32, + 64, + 128, + }; + + constexpr Operand supported_types[] = { + Operand::Bit, + }; + + constexpr Semantic supported_semantics[] = { + Semantic::Release, + Semantic::Relaxed, + Semantic::Volatile, + }; + + constexpr Scope supported_scopes[] = { + Scope::CTA, + Scope::Cluster, + Scope::GPU, + Scope::System, + }; + + constexpr Mmio mmio_states[] = { + Mmio::Disabled, + Mmio::Enabled, + }; + + for (auto size : supported_sizes) + { + for (auto type : supported_types) + { + for (auto sem : supported_semantics) + { + for (auto sco : supported_scopes) + { + for (auto mm : mmio_states) + { + if (size == 16 && type == Operand::Floating) + { + continue; + } + if (size == 128 && type != Operand::Bit) + { + continue; + } + if ((mm == Mmio::Enabled) && ((sco != Scope::System) || (sem != Semantic::Relaxed))) + { + continue; + } + out << fmt::format( + (size == 128) ? asm_intrinsic_format_128 : asm_intrinsic_format, + /* 0 */ operand(type), + /* 1 */ size, + /* 2 */ constraints(type, size), + /* 3 */ semantic_tag(sem), + /* 4 */ semantic_ld_st(sem), + /* 5 */ scope_tag(sco), + /* 6 */ scope_ld_st(sem, sco), + /* 7 */ mmio_tag(mm), + /* 8 */ mmio(mm)); + } + } + } + } + } + out << "\n" + << R"XXX( +template +struct __cuda_atomic_bind_store { + _Type* __ptr; + _Type* __val; + + template + inline _CCCL_DEVICE void operator()(_Atomic_Memorder) { + __cuda_atomic_store(__ptr, *__val, _Atomic_Memorder{}, _Tag{}, _Sco{}, _Mmio{}); + } +}; +template +static inline _CCCL_DEVICE void __atomic_store_cuda(_Type* __ptr, _Type& __val, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __val_proxy = reinterpret_cast<__proxy_t*>(&__val); + __cuda_atomic_bind_store<__proxy_t, __proxy_tag, _Sco, __atomic_cuda_mmio_disable> __bound_store{__ptr_proxy, __val_proxy}; + __cuda_atomic_store_memory_order_dispatch(__bound_store, __memorder, _Sco{}); +} +template +static inline _CCCL_DEVICE void __atomic_store_cuda(volatile _Type* __ptr, _Type& __val, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __val_proxy = reinterpret_cast<__proxy_t*>(&__val); + __cuda_atomic_bind_store<__proxy_t, __proxy_tag, _Sco, __atomic_cuda_mmio_disable> __bound_store{__ptr_proxy, __val_proxy}; + __cuda_atomic_store_memory_order_dispatch(__bound_store, __memorder, _Sco{}); +} +)XXX"; +} + +#endif // LD_ST_H diff --git a/libcudacxx/include/cuda/std/__atomic/functions/common.h b/libcudacxx/include/cuda/std/__atomic/functions/common.h new file mode 100644 index 0000000000..415c59a9be --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/functions/common.h @@ -0,0 +1,54 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMIC_FUNCTIONS_COMMON_H +#define _LIBCUDACXX___ATOMIC_FUNCTIONS_COMMON_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +template +struct __atomic_ptr_skip +{ + static constexpr auto __skip = 1; +}; + +template +struct __atomic_ptr_skip<_Tp*> +{ + static constexpr auto __skip = sizeof(_Tp); +}; + +// FIXME: Haven't figured out what the spec says about using arrays with +// atomic_fetch_add. Force a failure rather than creating bad behavior. +template +struct __atomic_ptr_skip<_Tp[]> +{}; +template +struct __atomic_ptr_skip<_Tp[n]> +{}; + +template +using __atomic_ptr_skip_t = __atomic_ptr_skip<__remove_cvref_t<_Tp>>; + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMIC_FUNCTIONS_COMMON_H diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index 1d791ca42e..0e525bf296 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -13,6 +13,8 @@ #include +#include "cuda_ptx_generated.h" + #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -31,232 +33,372 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD #if defined(_CCCL_CUDA_COMPILER) -template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda( - void volatile* __ptr, - _Tp* __expected, - const _Tp __desired, - bool __weak, - int __success_memorder, - int __failure_memorder, - _Sco) -{ - using __proxy_t = _If; - __proxy_t __old = 0; - __proxy_t __new = 0; - memcpy(&__old, __expected, sizeof(__proxy_t)); - memcpy(&__new, &__desired, sizeof(__proxy_t)); - bool __result = - __atomic_compare_exchange_cuda(__ptr, &__old, __new, __weak, __success_memorder, __failure_memorder, _Sco{}); - memcpy(__expected, &__old, sizeof(__proxy_t)); - return __result; -} -template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda( - void* __ptr, _Tp* __expected, const _Tp __desired, bool __weak, int __success_memorder, int __failure_memorder, _Sco) -{ - using __proxy_t = _If; - __proxy_t __old = 0; - __proxy_t __new = 0; - memcpy(&__old, __expected, sizeof(__proxy_t)); - memcpy(&__new, &__desired, sizeof(__proxy_t)); - bool __result = - __atomic_compare_exchange_cuda(__ptr, &__old, __new, __weak, __success_memorder, __failure_memorder, _Sco{}); - memcpy(__expected, &__old, sizeof(__proxy_t)); - return __result; -} -template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void volatile* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco) -{ - using __proxy_t = _If; - __proxy_t __old = 0; - __proxy_t __new = 0; - memcpy(&__new, __val, sizeof(__proxy_t)); - __atomic_exchange_cuda(__ptr, &__new, &__old, __memorder, _Sco{}); - memcpy(__ret, &__old, sizeof(__proxy_t)); -} -template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco) -{ - using 
__proxy_t = _If; - __proxy_t __old = 0; - __proxy_t __new = 0; - memcpy(&__new, __val, sizeof(__proxy_t)); - __atomic_exchange_cuda(__ptr, &__new, &__old, __memorder, _Sco{}); - memcpy(__ret, &__old, sizeof(__proxy_t)); -} - -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda( - _Tp volatile* __ptr, _Tp* __expected, const _Tp __desired, bool, int __success_memorder, int __failure_memorder, _Sco) -{ - auto const __aligned = (uint32_t*) ((intptr_t) __ptr & ~(sizeof(uint32_t) - 1)); - auto const __offset = uint32_t((intptr_t) __ptr & (sizeof(uint32_t) - 1)) * 8; - auto const __mask = ((1 << sizeof(_Tp) * 8) - 1) << __offset; - - uint32_t __old = *__expected << __offset; - uint32_t __old_value; - while (1) - { - __old_value = (__old & __mask) >> __offset; - if (__old_value != *__expected) - { - break; - } - uint32_t const __attempt = (__old & ~__mask) | (*__desired << __offset); - if (__atomic_compare_exchange_cuda( - __aligned, &__old, &__attempt, true, __success_memorder, __failure_memorder, _Sco{})) - { - return true; - } - } - *__expected = __old_value; - return false; -} - -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(_Tp volatile* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco) -{ - _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); - while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __val, true, __memorder, __memorder, _Sco{})) - ; - *__ret = __expected; -} - -template = 0> -_CCCL_DEVICE _Tp __atomic_fetch_add_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +template +_CCCL_DEVICE _Tp __atomic_fetch_update_cuda(_Tp* __ptr, const _Fn& __op, int __memorder, _Sco) { _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); - _Tp __desired = __expected + __val; + _Tp __desired = __op(__expected); while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) { - __desired = __expected + __val; + __desired = __op(__expected); } return __expected; } - -template ::value, int> = 0> -_CCCL_DEVICE _Tp __atomic_fetch_max_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +template +_CCCL_DEVICE _Tp __atomic_fetch_update_cuda(_Tp volatile* __ptr, const _Fn& __op, int __memorder, _Sco) { _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); - _Tp __desired = __expected > __val ? __expected : __val; - - while (__desired == __val - && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) + _Tp __desired = __op(__expected); + while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) { - __desired = __expected > __val ? __expected : __val; + __desired = __op(__expected); } - return __expected; } -template ::value, int> = 0> -_CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +template +_CCCL_DEVICE _Tp __atomic_load_n_cuda(const _Tp* __ptr, int __memorder, _Sco) { - _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); - _Tp __desired = __expected < __val ? __expected : __val; - - while (__desired == __val - && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) - { - __desired = __expected < __val ? 
__expected : __val; - } - - return __expected; + _Tp __ret; + __atomic_load_cuda(__ptr, __ret, __memorder, _Sco{}); + return __ret; } - -template = 0> -_CCCL_DEVICE _Tp __atomic_fetch_sub_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +template +_CCCL_DEVICE _Tp __atomic_load_n_cuda(const _Tp volatile* __ptr, int __memorder, _Sco) { - _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); - _Tp __desired = __expected - __val; - while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) - { - __desired = __expected - __val; - } - return __expected; + _Tp __ret; + __atomic_load_cuda(__ptr, __ret, __memorder, _Sco{}); + return __ret; } -template = 0> -_CCCL_DEVICE _Tp __atomic_fetch_and_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +template +_CCCL_DEVICE void __atomic_store_n_cuda(_Tp* __ptr, _Tp __val, int __memorder, _Sco) { - _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); - _Tp __desired = __expected & __val; - while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) - { - __desired = __expected & __val; - } - return __expected; + __atomic_store_cuda(__ptr, __val, __memorder, _Sco{}); } - -template = 0> -_CCCL_DEVICE _Tp __atomic_fetch_xor_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +template +_CCCL_DEVICE void __atomic_store_n_cuda(_Tp volatile* __ptr, _Tp __val, int __memorder, _Sco) { - _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); - _Tp __desired = __expected ^ __val; - while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) - { - __desired = __expected ^ __val; - } - return __expected; + __atomic_store_cuda(__ptr, __val, __memorder, _Sco{}); } -template = 0> -_CCCL_DEVICE _Tp __atomic_fetch_or_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +template +_CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp* __ptr, _Tp __val, int __memorder, _Sco) { - _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); - _Tp __desired = __expected | __val; - while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) - { - __desired = __expected | __val; - } - return __expected; + _Tp __ret; + __atomic_exchange_cuda(__ptr, __ret, __val, __memorder, _Sco{}); + return __ret; } - template -_CCCL_DEVICE _Tp __atomic_load_n_cuda(const _Tp volatile* __ptr, int __memorder, _Sco) +_CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp volatile* __ptr, _Tp __val, int __memorder, _Sco) { _Tp __ret; - __atomic_load_cuda(__ptr, &__ret, __memorder, _Sco{}); + __atomic_exchange_cuda(__ptr, __ret, __val, __memorder, _Sco{}); return __ret; } -template -_CCCL_DEVICE void __atomic_store_n_cuda(_Tp volatile* __ptr, _Tp __val, int __memorder, _Sco) +template = 0> +_CCCL_DEVICE float __atomic_fetch_min_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco) { - __atomic_store_cuda(__ptr, &__val, __memorder, _Sco{}); + return __atomic_fetch_update_cuda( + __ptr, + [__val](_Tp __old) { + return __old < __val ? 
__old : __val; + }, + __memorder, + _Sco{}); } - -template -_CCCL_DEVICE bool __atomic_compare_exchange_n_cuda( - _Tp volatile* __ptr, _Tp* __expected, _Tp __desired, bool __weak, int __success_memorder, int __failure_memorder, _Sco) +template = 0> +_CCCL_DEVICE float __atomic_fetch_min_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco) { - return __atomic_compare_exchange_cuda( - __ptr, __expected, __desired, __weak, __success_memorder, __failure_memorder, _Sco{}); + return __atomic_fetch_update_cuda( + __ptr, + [__val](_Tp __old) { + return __old < __val ? __old : __val; + }, + __memorder, + _Sco{}); } -template -_CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp volatile* __ptr, _Tp __val, int __memorder, _Sco) +template = 0> +_CCCL_DEVICE double __atomic_fetch_max_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco) { - _Tp __ret; - __atomic_exchange_cuda(__ptr, &__val, &__ret, __memorder, _Sco{}); - return __ret; + return __atomic_fetch_update_cuda( + __ptr, + [__val](_Tp __old) { + return __old > __val ? __old : __val; + }, + __memorder, + _Sco{}); +} +template = 0> +_CCCL_DEVICE double __atomic_fetch_max_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco) +{ + return __atomic_fetch_update_cuda( + __ptr, + [__val](_Tp __old) { + return __old > __val ? __old : __val; + }, + __memorder, + _Sco{}); } +// template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> +// _CCCL_DEVICE bool __atomic_compare_exchange_cuda( +// void volatile* __ptr, +// _Tp* __expected, +// const _Tp __desired, +// bool __weak, +// int __success_memorder, +// int __failure_memorder, +// _Sco) +// { +// using __proxy_t = _If; +// __proxy_t __old = 0; +// __proxy_t __new = 0; +// memcpy(&__old, __expected, sizeof(__proxy_t)); +// memcpy(&__new, &__desired, sizeof(__proxy_t)); +// bool __result = +// __atomic_compare_exchange_cuda(__ptr, &__old, __new, __weak, __success_memorder, __failure_memorder, _Sco{}); +// memcpy(__expected, &__old, sizeof(__proxy_t)); +// return __result; +// } +// template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> +// _CCCL_DEVICE bool __atomic_compare_exchange_cuda( +// void* __ptr, _Tp* __expected, const _Tp __desired, bool __weak, int __success_memorder, int __failure_memorder, +// _Sco) +// { +// using __proxy_t = _If; +// __proxy_t __old = 0; +// __proxy_t __new = 0; +// memcpy(&__old, __expected, sizeof(__proxy_t)); +// memcpy(&__new, &__desired, sizeof(__proxy_t)); +// bool __result = +// __atomic_compare_exchange_cuda(__ptr, &__old, __new, __weak, __success_memorder, __failure_memorder, _Sco{}); +// memcpy(__expected, &__old, sizeof(__proxy_t)); +// return __result; +// } +// template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> +// _CCCL_DEVICE void __atomic_exchange_cuda(void volatile* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco) +// { +// using __proxy_t = _If; +// __proxy_t __old = 0; +// __proxy_t __new = 0; +// memcpy(&__new, __val, sizeof(__proxy_t)); +// __atomic_exchange_cuda(__ptr, &__new, &__old, __memorder, _Sco{}); +// memcpy(__ret, &__old, sizeof(__proxy_t)); +// } +// template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> +// _CCCL_DEVICE void __atomic_exchange_cuda(void* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco) +// { +// using __proxy_t = _If; +// __proxy_t __old = 0; +// __proxy_t __new = 0; +// memcpy(&__new, __val, sizeof(__proxy_t)); +// __atomic_exchange_cuda(__ptr, &__new, &__old, __memorder, _Sco{}); +// memcpy(__ret, &__old, sizeof(__proxy_t)); +// } + +// template = 0> 
+// _CCCL_DEVICE bool __atomic_compare_exchange_cuda( +// _Tp volatile* __ptr, _Tp* __expected, const _Tp __desired, bool, int __success_memorder, int __failure_memorder, +// _Sco) +// { +// auto const __aligned = (uint32_t*) ((intptr_t) __ptr & ~(sizeof(uint32_t) - 1)); +// auto const __offset = uint32_t((intptr_t) __ptr & (sizeof(uint32_t) - 1)) * 8; +// auto const __mask = ((1 << sizeof(_Tp) * 8) - 1) << __offset; + +// uint32_t __old = *__expected << __offset; +// uint32_t __old_value; +// while (1) +// { +// __old_value = (__old & __mask) >> __offset; +// if (__old_value != *__expected) +// { +// break; +// } +// uint32_t const __attempt = (__old & ~__mask) | (*__desired << __offset); +// if (__atomic_compare_exchange_cuda( +// __aligned, &__old, &__attempt, true, __success_memorder, __failure_memorder, _Sco{})) +// { +// return true; +// } +// } +// *__expected = __old_value; +// return false; +// } + +// template = 0> +// _CCCL_DEVICE void __atomic_exchange_cuda(_Tp volatile* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco) +// { +// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); +// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __val, true, __memorder, __memorder, _Sco{})) +// ; +// *__ret = __expected; +// } + +// template = 0> +// _CCCL_DEVICE _Tp __atomic_fetch_add_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +// { +// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); +// _Tp __desired = __expected + __val; +// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) +// { +// __desired = __expected + __val; +// } +// return __expected; +// } + +// template ::value, int> = 0> +// _CCCL_DEVICE _Tp __atomic_fetch_max_cuda(_Tp * __ptr, _Up __val, int __memorder, _Sco) +// { +// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); +// _Tp __desired = __expected > __val ? __expected : __val; + +// while (__desired == __val +// && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) +// { +// __desired = __expected > __val ? __expected : __val; +// } + +// return __expected; +// } +// template ::value, int> = 0> +// _CCCL_DEVICE _Tp __atomic_fetch_max_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +// { +// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); +// _Tp __desired = __expected > __val ? __expected : __val; + +// while (__desired == __val +// && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) +// { +// __desired = __expected > __val ? __expected : __val; +// } + +// return __expected; +// } + +// template ::value, int> = 0> +// _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp * __ptr, _Up __val, int __memorder, _Sco) +// { +// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); +// _Tp __desired = __expected < __val ? __expected : __val; + +// while (__desired == __val +// && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) +// { +// __desired = __expected < __val ? __expected : __val; +// } + +// return __expected; +// } +// template ::value, int> = 0> +// _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +// { +// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); +// _Tp __desired = __expected < __val ? 
__expected : __val; + +// while (__desired == __val +// && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) +// { +// __desired = __expected < __val ? __expected : __val; +// } + +// return __expected; +// } + +// template = 0> +// _CCCL_DEVICE _Tp __atomic_fetch_sub_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +// { +// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); +// _Tp __desired = __expected - __val; +// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) +// { +// __desired = __expected - __val; +// } +// return __expected; +// } + +// template = 0> +// _CCCL_DEVICE _Tp __atomic_fetch_and_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +// { +// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); +// _Tp __desired = __expected & __val; +// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) +// { +// __desired = __expected & __val; +// } +// return __expected; +// } + +// template = 0> +// _CCCL_DEVICE _Tp __atomic_fetch_xor_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +// { +// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); +// _Tp __desired = __expected ^ __val; +// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) +// { +// __desired = __expected ^ __val; +// } +// return __expected; +// } + +// template = 0> +// _CCCL_DEVICE _Tp __atomic_fetch_or_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +// { +// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); +// _Tp __desired = __expected | __val; +// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) +// { +// __desired = __expected | __val; +// } +// return __expected; +// } + +// template +// _CCCL_DEVICE bool __atomic_compare_exchange_n_cuda( +// _Tp volatile* __ptr, _Tp* __expected, _Tp __desired, bool __weak, int __success_memorder, int __failure_memorder, +// _Sco) +// { +// return __atomic_compare_exchange_cuda( +// __ptr, __expected, __desired, __weak, __success_memorder, __failure_memorder, _Sco{}); +// } + +// template +// _CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp volatile* __ptr, _Tp __val, int __memorder, _Sco) +// { +// _Tp __ret; +// __atomic_exchange_cuda(__ptr, __ret, __val, __memorder, _Sco{}); +// return __ret; +// } +// template +// _CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp* __ptr, _Tp __val, int __memorder, _Sco) +// { +// _Tp __ret; +// __atomic_exchange_cuda(__ptr, __ret, __val, __memorder, _Sco{}); +// return __ret; +// } + _CCCL_DEVICE static inline void __atomic_signal_fence_cuda(int) { asm volatile("" ::: "memory"); diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h index 2ebfa4ea3a..e72144b68c 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h @@ -1,10 +1,11 @@ + //===----------------------------------------------------------------------===// // // Part of libcu++, the C++ Standard Library for your entire system, // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// @@ -33,23 +34,46 @@ #include #include +#include +#include _LIBCUDACXX_BEGIN_NAMESPACE_STD #if defined(_CCCL_CUDA_COMPILER) -static inline _CCCL_DEVICE void __cuda_membar_block() { asm volatile("membar.cta;":::"memory"); } -static inline _CCCL_DEVICE void __cuda_fence_acq_rel_block() { asm volatile("fence.acq_rel.cta;":::"memory"); } -static inline _CCCL_DEVICE void __cuda_fence_sc_block() { asm volatile("fence.sc.cta;":::"memory"); } -static inline _CCCL_DEVICE void __atomic_thread_fence_cuda(int __memorder, __thread_scope_block_tag) { +static inline _CCCL_DEVICE void __cuda_atomic_membar(__thread_scope_block_tag) +{ asm volatile("membar.cta;" ::: "memory"); } +static inline _CCCL_DEVICE void __cuda_atomic_membar(__thread_scope_device_tag) +{ asm volatile("membar.gl;" ::: "memory"); } +static inline _CCCL_DEVICE void __cuda_atomic_membar(__thread_scope_system_tag) +{ asm volatile("membar.sys;" ::: "memory"); } +static inline _CCCL_DEVICE void __cuda_atomic_fence(__thread_scope_block_tag, __atomic_cuda_acq_rel) +{ asm volatile("fence.acq_rel.cta;" ::: "memory"); } +static inline _CCCL_DEVICE void __cuda_atomic_fence(__thread_scope_block_tag, __atomic_cuda_seq_cst) +{ asm volatile("fence.sc.cta;" ::: "memory"); } +static inline _CCCL_DEVICE void __cuda_atomic_fence(__thread_scope_cluster_tag, __atomic_cuda_acq_rel) +{ asm volatile("fence.acq_rel.cluster;" ::: "memory"); } +static inline _CCCL_DEVICE void __cuda_atomic_fence(__thread_scope_cluster_tag, __atomic_cuda_seq_cst) +{ asm volatile("fence.sc.cluster;" ::: "memory"); } +static inline _CCCL_DEVICE void __cuda_atomic_fence(__thread_scope_device_tag, __atomic_cuda_acq_rel) +{ asm volatile("fence.acq_rel.gpu;" ::: "memory"); } +static inline _CCCL_DEVICE void __cuda_atomic_fence(__thread_scope_device_tag, __atomic_cuda_seq_cst) +{ asm volatile("fence.sc.gpu;" ::: "memory"); } +static inline _CCCL_DEVICE void __cuda_atomic_fence(__thread_scope_system_tag, __atomic_cuda_acq_rel) +{ asm volatile("fence.acq_rel.sys;" ::: "memory"); } +static inline _CCCL_DEVICE void __cuda_atomic_fence(__thread_scope_system_tag, __atomic_cuda_seq_cst) +{ asm volatile("fence.sc.sys;" ::: "memory"); } + +template +static inline _CCCL_DEVICE void __atomic_thread_fence_cuda(int __memorder, _Sco) { NV_DISPATCH_TARGET( NV_PROVIDES_SM_70, ( switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); break; + case __ATOMIC_SEQ_CST: __cuda_atomic_fence(_Sco{}, __atomic_cuda_seq_cst{}); break; case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); case __ATOMIC_ACQUIRE: _CCCL_FALLTHROUGH(); case __ATOMIC_ACQ_REL: _CCCL_FALLTHROUGH(); - case __ATOMIC_RELEASE: __cuda_fence_acq_rel_block(); break; + case __ATOMIC_RELEASE: __cuda_atomic_fence(_Sco{}, __atomic_cuda_acq_rel{}); break; case __ATOMIC_RELAXED: break; default: assert(0); } @@ -60,6802 +84,4015 @@ static inline _CCCL_DEVICE void __atomic_thread_fence_cuda(int __memorder, __thr case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); case __ATOMIC_ACQUIRE: _CCCL_FALLTHROUGH(); case __ATOMIC_ACQ_REL: _CCCL_FALLTHROUGH(); - case __ATOMIC_RELEASE: __cuda_membar_block(); break; + case __ATOMIC_RELEASE: __cuda_atomic_membar(_Sco{}); break; case __ATOMIC_RELAXED: break; default: assert(0); } ) ) } -template static inline _CCCL_DEVICE 
void __cuda_load_acquire_32_block(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.acquire.cta.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_load_relaxed_32_block(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.relaxed.cta.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_load_volatile_32_block(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.volatile.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_load_cuda(const volatile _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_block_tag) { - uint32_t __tmp = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_acquire_32_block(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_32_block(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_volatile_32_block(__ptr, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_32_block(__ptr, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template = 0> -_CCCL_DEVICE void __atomic_load_cuda(const _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_block_tag) { - uint32_t __tmp = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_acquire_32_block(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_32_block(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_volatile_32_block(__ptr, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_32_block(__ptr, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template static inline _CCCL_DEVICE void __cuda_load_acquire_64_block(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.acquire.cta.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_load_relaxed_64_block(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.relaxed.cta.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_load_volatile_64_block(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.volatile.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_load_cuda(const volatile _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_block_tag) { - uint64_t __tmp = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_acquire_64_block(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_64_block(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: 
__cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_volatile_64_block(__ptr, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_64_block(__ptr, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template = 0> -_CCCL_DEVICE void __atomic_load_cuda(const _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_block_tag) { - uint64_t __tmp = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_acquire_64_block(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_64_block(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_volatile_64_block(__ptr, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_64_block(__ptr, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template static inline _CCCL_DEVICE void __cuda_store_relaxed_32_block(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.relaxed.cta.b32 [%0], %1;" :: "l"(__ptr),"r"(__src) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_store_release_32_block(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.release.cta.b32 [%0], %1;" :: "l"(__ptr),"r"(__src) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_store_volatile_32_block(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.volatile.b32 [%0], %1;" :: "l"(__ptr),"r"(__src) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_store_cuda(volatile _Type *__ptr, _Type *__val, int __memorder, __thread_scope_block_tag) { - uint32_t __tmp = 0; - memcpy(&__tmp, __val, 4); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_RELEASE: __cuda_store_release_32_block(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_32_block(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); - case __ATOMIC_SEQ_CST: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_volatile_32_block(__ptr, __tmp); break; - default: assert(0); - } - ) - ) -} -template = 0> -_CCCL_DEVICE void __atomic_store_cuda(_Type *__ptr, _Type *__val, int __memorder, __thread_scope_block_tag) { - uint32_t __tmp = 0; - memcpy(&__tmp, __val, 4); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_RELEASE: __cuda_store_release_32_block(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_32_block(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); - case __ATOMIC_SEQ_CST: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_volatile_32_block(__ptr, __tmp); break; - default: assert(0); - } - ) - ) -} -template static inline _CCCL_DEVICE void __cuda_store_relaxed_64_block(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.relaxed.cta.b64 [%0], %1;" :: "l"(__ptr),"l"(__src) : 
"memory"); } -template static inline _CCCL_DEVICE void __cuda_store_release_64_block(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.release.cta.b64 [%0], %1;" :: "l"(__ptr),"l"(__src) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_store_volatile_64_block(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.volatile.b64 [%0], %1;" :: "l"(__ptr),"l"(__src) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_store_cuda(volatile _Type *__ptr, _Type *__val, int __memorder, __thread_scope_block_tag) { - uint64_t __tmp = 0; - memcpy(&__tmp, __val, 8); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_RELEASE: __cuda_store_release_64_block(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_64_block(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); - case __ATOMIC_SEQ_CST: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_volatile_64_block(__ptr, __tmp); break; - default: assert(0); - } - ) - ) -} -template = 0> -_CCCL_DEVICE void __atomic_store_cuda(_Type *__ptr, _Type *__val, int __memorder, __thread_scope_block_tag) { - uint64_t __tmp = 0; - memcpy(&__tmp, __val, 8); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_RELEASE: __cuda_store_release_64_block(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_64_block(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); - case __ATOMIC_SEQ_CST: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_volatile_64_block(__ptr, __tmp); break; - default: assert(0); - } - ) - ) -} -template static inline _CCCL_DEVICE void __cuda_fetch_and_acq_rel_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.acq_rel.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_acquire_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.acquire.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_relaxed_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.relaxed.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_release_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.release.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_volatile_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_and_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: 
__cuda_fetch_and_acq_rel_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_and_volatile_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_and_volatile_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_or_acq_rel_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.acq_rel.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_acquire_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.acquire.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_relaxed_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.relaxed.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_release_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.release.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_volatile_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_or_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - 
case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_or_volatile_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_or_volatile_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_xor_acq_rel_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.acq_rel.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_acquire_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.acquire.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_relaxed_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.relaxed.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_release_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.release.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_volatile_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_xor_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = 
__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_xor_volatile_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_xor_volatile_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acq_rel_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.cta.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acquire_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.cta.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_relaxed_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.cta.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.cta.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void 
__cuda_compare_exchange_volatile_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.cta.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_f32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_f32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_f32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_f32_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_f32_block(__ptr, *__expected, __old, __desired); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_compare_exchange_volatile_f32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_f32_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_f32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_f32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_f32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_f32_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_f32_block(__ptr, *__expected, __old, __desired); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_compare_exchange_volatile_f32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_f32_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} 
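The derived-atomics hunks earlier in this patch collapse the per-operation retry loops into a single __atomic_fetch_update_cuda helper that applies a caller-supplied functor inside a compare-exchange loop; the block-scope compare-exchange wrappers being deleted here are the kind of primitive such a loop sits on top of. Below is a rough, standalone sketch of that pattern in plain CUDA, not part of libcu++ or of this patch: the names fetch_update and fetch_min_float are invented for illustration, and thread scope and memory order are left at atomicCAS's default device-scope relaxed semantics, unlike the real helper, which forwards __memorder and _Sco.

#include <cstdio>
#include <cuda_runtime.h>

// Generic CAS retry loop: apply `op` to the currently stored value until the
// compare-and-swap succeeds, then return the value that was replaced.
template <class Fn>
__device__ unsigned int fetch_update(unsigned int* ptr, Fn op)
{
  unsigned int expected = *ptr; // initial (non-atomic) snapshot is fine for a sketch
  unsigned int old      = atomicCAS(ptr, expected, op(expected));
  while (old != expected)
  {
    expected = old;
    old      = atomicCAS(ptr, expected, op(expected));
  }
  return old; // value observed immediately before the successful CAS
}

// fetch_min for float, emulated on top of the 32-bit CAS loop above.
// (Simplified: ignores NaN/-0.0 corner cases and memory ordering.)
__device__ float fetch_min_float(float* ptr, float val)
{
  unsigned int* bits = reinterpret_cast<unsigned int*>(ptr);
  unsigned int old   = fetch_update(bits, [val](unsigned int cur) {
    float f = __uint_as_float(cur);
    return __float_as_uint(f < val ? f : val);
  });
  return __uint_as_float(old);
}

__global__ void kernel(float* p)
{
  fetch_min_float(p, 1.0f + static_cast<float>(threadIdx.x));
}

int main()
{
  float* p = nullptr;
  cudaMallocManaged(&p, sizeof(float));
  *p = 100.0f;
  kernel<<<1, 32>>>(p);
  cudaDeviceSynchronize();
  std::printf("min = %f\n", *p); // prints 1.000000
  cudaFree(p);
  return 0;
}

Expressing every emulated read-modify-write through one retry loop avoids duplicating that loop per operation, which matches how the new fetch_min/fetch_max overloads earlier in this patch are written in terms of __atomic_fetch_update_cuda.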
-template static inline _CCCL_DEVICE void __cuda_compare_exchange_acq_rel_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acquire_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_relaxed_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_u32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_u32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_u32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_u32_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_u32_block(__ptr, *__expected, __old, __desired); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_compare_exchange_volatile_u32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_u32_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_u32_block(__ptr, *__expected, __old, 
__desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_u32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_u32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_u32_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_u32_block(__ptr, *__expected, __old, __desired); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_compare_exchange_volatile_u32_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_u32_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template static inline _CCCL_DEVICE void __cuda_exchange_acq_rel_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.cta.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_acquire_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.cta.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_relaxed_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.cta.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_release_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.release.cta.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_volatile_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.cta.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(volatile void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_block_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_f32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_f32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_exchange_volatile_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_f32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 
4); -} -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_block_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_f32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_f32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_exchange_volatile_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_f32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template static inline _CCCL_DEVICE void __cuda_exchange_acq_rel_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_acquire_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_relaxed_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_release_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.release.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_volatile_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(volatile void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_block_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_u32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_u32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: 
__cuda_membar_block(); __cuda_exchange_volatile_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_u32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_block_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_u32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_u32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_exchange_volatile_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_u32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template static inline _CCCL_DEVICE void __cuda_fetch_add_acq_rel_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_acquire_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acquire.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_relaxed_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.relaxed.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_release_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.release.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_volatile_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_f32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: 
__cuda_membar_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_f32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break;
- case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_f32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_f32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- )
- )
- return __tmp;
-}
-template = 0>
-_CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) {
- _Type __tmp = __val;
- NV_DISPATCH_TARGET(
- NV_PROVIDES_SM_70, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_f32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_f32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELEASE: __cuda_fetch_add_release_f32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_f32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- ),
- NV_IS_DEVICE, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_f32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break;
- case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_f32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_f32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- )
- )
- return __tmp;
-}
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_add_acq_rel_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_add_acquire_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acquire.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_add_relaxed_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.relaxed.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_add_release_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.release.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_add_volatile_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template = 0>
-_CCCL_DEVICE _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) {
- _Type __tmp = __val;
- NV_DISPATCH_TARGET(
- NV_PROVIDES_SM_70, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELEASE: __cuda_fetch_add_release_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- ),
- NV_IS_DEVICE, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break;
- case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- )
- )
- return __tmp;
-}
-template = 0>
-_CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) {
- _Type __tmp = __val;
- NV_DISPATCH_TARGET(
- NV_PROVIDES_SM_70, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELEASE: __cuda_fetch_add_release_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- ),
- NV_IS_DEVICE, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break;
- case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- )
- )
- return __tmp;
-}
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_max_acq_rel_s32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acq_rel.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_max_acquire_s32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acquire.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_max_relaxed_s32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.relaxed.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_max_release_s32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.release.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_max_volatile_s32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template = 0>
-_CCCL_DEVICE _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) {
- _Type __tmp = __val;
- NV_DISPATCH_TARGET(
- NV_PROVIDES_SM_70, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_s32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_s32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELEASE: __cuda_fetch_max_release_s32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_s32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- ),
- NV_IS_DEVICE, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_s32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break;
- case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_max_volatile_s32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_s32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- )
- )
- return __tmp;
-}
-template = 0>
-_CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) {
- _Type __tmp = __val;
- NV_DISPATCH_TARGET(
- NV_PROVIDES_SM_70, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_s32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_s32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELEASE: __cuda_fetch_max_release_s32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_s32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- ),
- NV_IS_DEVICE, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_s32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break;
- case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_max_volatile_s32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_s32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- )
- )
- return __tmp;
-}
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_max_acq_rel_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acq_rel.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_max_acquire_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acquire.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_max_relaxed_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.relaxed.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_max_release_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.release.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_max_volatile_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template = 0>
-_CCCL_DEVICE _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) {
- _Type __tmp = __val;
- NV_DISPATCH_TARGET(
- NV_PROVIDES_SM_70, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST:
__cuda_fence_sc_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELEASE: __cuda_fetch_max_release_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_u32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- ),
- NV_IS_DEVICE, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_u32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break;
- case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_max_volatile_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_u32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- )
- )
- return __tmp;
-}
-template = 0>
-_CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) {
- _Type __tmp = __val;
- NV_DISPATCH_TARGET(
- NV_PROVIDES_SM_70, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELEASE: __cuda_fetch_max_release_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_u32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- ),
- NV_IS_DEVICE, (
- switch (__memorder) {
- case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH();
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
- case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_u32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break;
- case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_max_volatile_u32_block(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_u32_block(__ptr, __tmp, __tmp); break;
- default: assert(0);
- }
- )
- )
- return __tmp;
-}
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_min_acq_rel_s32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acq_rel.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_min_acquire_s32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acquire.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_min_relaxed_s32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.relaxed.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_min_release_s32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.release.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _CCCL_DEVICE void __cuda_fetch_min_volatile_s32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
-template = 0>
-_CCCL_DEVICE _Type __atomic_fetch_min_cuda(volatile
_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_s32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_s32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_s32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_s32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_s32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_min_volatile_s32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_s32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_s32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_s32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_s32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_s32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_s32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_min_volatile_s32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_s32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_min_acq_rel_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acq_rel.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_acquire_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acquire.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_relaxed_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.relaxed.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_release_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.release.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_volatile_u32_block(_CUDA_A __ptr, _CUDA_B& 
__dst, _CUDA_C __op) { asm volatile("atom.min.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_u32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_u32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_min_volatile_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_u32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_u32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_u32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_min_volatile_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_u32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acq_rel_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acq_rel.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acquire_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acquire.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_relaxed_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.relaxed.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_release_f32_block(_CUDA_A __ptr, _CUDA_B& 
__dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.release.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_volatile_f32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_f32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_f32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_sub_volatile_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_f32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_f32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_f32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_sub_volatile_f32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_f32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acq_rel_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acq_rel.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acquire_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acquire.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_relaxed_u32_block(_CUDA_A 
__ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.relaxed.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_release_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.release.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_volatile_u32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_u32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_u32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_sub_volatile_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_u32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_u32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_u32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_sub_volatile_u32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_u32_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_and_acq_rel_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.acq_rel.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_acquire_64_block(_CUDA_A 
__ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.acquire.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_relaxed_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.relaxed.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_release_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.release.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_volatile_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_and_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_and_volatile_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_and_volatile_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_or_acq_rel_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm 
volatile("atom.or.acq_rel.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_acquire_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.acquire.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_relaxed_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.relaxed.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_release_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.release.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_volatile_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_or_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_or_volatile_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_or_volatile_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - 
return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_xor_acq_rel_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.acq_rel.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_acquire_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.acquire.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_relaxed_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.relaxed.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_release_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.release.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_volatile_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_xor_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_xor_volatile_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); 
__cuda_fetch_xor_volatile_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acq_rel_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.cta.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acquire_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.cta.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_relaxed_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.cta.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.cta.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.cta.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_f64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_f64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_f64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_f64_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_f64_block(__ptr, *__expected, __old, __desired); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_compare_exchange_volatile_f64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_f64_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case 
__ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_f64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_f64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_f64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_f64_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_f64_block(__ptr, *__expected, __old, __desired); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_compare_exchange_volatile_f64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_f64_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acq_rel_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acquire_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_relaxed_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_u64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_u64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_u64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_u64_block(__ptr, *__expected, __old, __desired); break; - 
default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_u64_block(__ptr, *__expected, __old, __desired); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_compare_exchange_volatile_u64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_u64_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_u64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_u64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_u64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_u64_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_u64_block(__ptr, *__expected, __old, __desired); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_compare_exchange_volatile_u64_block(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_u64_block(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template static inline _CCCL_DEVICE void __cuda_exchange_acq_rel_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.cta.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_acquire_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.cta.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_relaxed_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.cta.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_release_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.release.cta.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_volatile_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.cta.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template = 0> 
-_CCCL_DEVICE void __atomic_exchange_cuda(volatile void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_block_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_f64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_f64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_exchange_volatile_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_f64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_block_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_f64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_f64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_exchange_volatile_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_f64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template static inline _CCCL_DEVICE void __cuda_exchange_acq_rel_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_acquire_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_relaxed_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_release_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.release.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static 
inline _CCCL_DEVICE void __cuda_exchange_volatile_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(volatile void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_block_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_exchange_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_block_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_exchange_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template static inline _CCCL_DEVICE void __cuda_fetch_add_acq_rel_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_acquire_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acquire.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_relaxed_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.relaxed.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline 
_CCCL_DEVICE void __cuda_fetch_add_release_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.release.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_volatile_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_f64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_f64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_f64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_f64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_f64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_f64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_add_acq_rel_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_acquire_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acquire.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void 
__cuda_fetch_add_relaxed_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.relaxed.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_release_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.release.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_volatile_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_max_acq_rel_s64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acq_rel.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_acquire_s64_block(_CUDA_A 
__ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acquire.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_relaxed_s64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.relaxed.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_release_s64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.release.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_volatile_s64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_s64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_s64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_max_volatile_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_s64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_s64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_s64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_max_volatile_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_s64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_max_acq_rel_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm 
volatile("atom.max.acq_rel.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_acquire_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acquire.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_relaxed_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.relaxed.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_release_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.release.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_volatile_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_max_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_max_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_u64_block(__ptr, __tmp, __tmp); 
break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_min_acq_rel_s64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acq_rel.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_acquire_s64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acquire.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_relaxed_s64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.relaxed.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_release_s64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.release.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_volatile_s64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_s64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_s64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_min_volatile_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_s64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_s64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_s64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case 
__ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_min_volatile_s64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_s64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_min_acq_rel_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acq_rel.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_acquire_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acquire.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_relaxed_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.relaxed.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_release_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.release.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_volatile_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_min_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); 
_CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_min_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acq_rel_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acq_rel.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acquire_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acquire.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_relaxed_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.relaxed.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_release_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.release.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_volatile_f64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_f64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_f64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_sub_volatile_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_f64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) + +template +static inline _CCCL_DEVICE void __cuda_atomic_load_memory_order_dispatch(_Fn &__cuda_load, int __memorder, _Sco) { + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_atomic_fence(_Sco{}, __atomic_cuda_seq_cst{}); _CCCL_FALLTHROUGH(); + case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQUIRE: __cuda_load(__atomic_cuda_acquire{}); break; + case __ATOMIC_RELAXED: __cuda_load(__atomic_cuda_relaxed{}); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_atomic_membar(_Sco{}); 
_CCCL_FALLTHROUGH(); + case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQUIRE: __cuda_load(__atomic_cuda_volatile{}); __cuda_atomic_membar(_Sco{}); break; + case __ATOMIC_RELAXED: __cuda_load(__atomic_cuda_volatile{}); break; + default: assert(0); + } ) - return __tmp; + ) } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_f64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_f64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_sub_volatile_f64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_f64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; + +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ 
asm volatile("ld.relaxed.gpu.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.u16 %0,[%1];" : 
"=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.gpu.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template 
+static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.gpu.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void 
__cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.gpu.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_f32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.f32 %0,[%1];" : "=f"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_f32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.f32 %0,[%1];" : "=f"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_f32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.f32 %0,[%1];" : "=f"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, 
_Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_f32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.f32 %0,[%1];" : "=f"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_f32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.f32 %0,[%1];" : "=f"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_f32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.f32 %0,[%1];" : "=f"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_f32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.gpu.f32 %0,[%1];" : "=f"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_f32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.f32 %0,[%1];" : "=f"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_f32, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.f32 %0,[%1];" : "=f"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_f32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.f32 %0,[%1];" : "=f"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_f32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.f32 %0,[%1];" : "=f"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_f32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.f32 %0,[%1];" : "=f"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_f32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.f32 %0,[%1];" : "=f"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.u32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.u32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, 
__atomic_cuda_operand_u32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.u32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.u32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.u32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.u32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.gpu.u32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.u32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.u32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.s32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s32, 
__thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.s32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.s32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.s32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.s32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.s32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.gpu.s32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.s32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.s32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_block_tag, 
__atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.gpu.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm 
volatile("ld.volatile.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_f64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.f64 %0,[%1];" : "=d"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_f64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.f64 %0,[%1];" : "=d"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_f64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.f64 %0,[%1];" : "=d"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_f64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.f64 %0,[%1];" : "=d"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_f64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.f64 %0,[%1];" : "=d"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_f64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.f64 %0,[%1];" : "=d"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_f64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.gpu.f64 %0,[%1];" : "=d"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_f64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.f64 %0,[%1];" : "=d"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_f64, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.f64 %0,[%1];" : "=d"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_f64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.f64 %0,[%1];" : "=d"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_f64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.f64 %0,[%1];" : "=d"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_f64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.f64 %0,[%1];" : 
"=d"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_f64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.f64 %0,[%1];" : "=d"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.u64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.u64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.u64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.u64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.u64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.u64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.gpu.u64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.u64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.u64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } 
+template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.s64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.s64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.s64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.s64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.s64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.s64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.gpu.s64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.s64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s64, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.s64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void 
__cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } + template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _d; + ld.acquire.cta.b128 [%2],_d; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory"); +} + template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _d; + ld.acquire.cluster.b128 [%2],_d; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory"); +} + template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _d; + ld.acquire.gpu.b128 [%2],_d; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory"); +} + template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _d; + ld.acquire.sys.b128 [%2],_d; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory"); +} + template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _d; + ld.relaxed.cta.b128 [%2],_d; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory"); +} + template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _d; + ld.relaxed.cluster.b128 [%2],_d; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory"); +} + template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _d; + ld.relaxed.gpu.b128 [%2],_d; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory"); +} + template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, 
__atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _d; + ld.relaxed.sys.b128 [%2],_d; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory"); +} + template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ + asm volatile(R"YYY( + .reg .b128 _d; + ld.mmio.relaxed.sys.b128 [%2],_d; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory"); +} + template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _d; + ld.volatile.b128 [%2],_d; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory"); +} + template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _d; + ld.volatile.b128 [%2],_d; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory"); +} + template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _d; + ld.volatile.b128 [%2],_d; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory"); +} + template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _d; + ld.volatile.b128 [%2],_d; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acq_rel_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acq_rel.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acquire_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acquire.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_relaxed_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.relaxed.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_release_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.release.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_volatile_u64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( 
- switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_sub_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; + +template +struct __cuda_atomic_bind_load { + const _Type* __ptr; + _Type* __dst; + + template + inline _CCCL_DEVICE void operator()(_Atomic_Memorder) { + __cuda_atomic_load(__ptr, *__dst, _Atomic_Memorder{}, _Tag{}, _Sco{}, _Mmio{}); + } +}; +template +static inline _CCCL_DEVICE void __atomic_load_cuda(const _Type* __ptr, _Type& __dst, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + const __proxy_t* __ptr_proxy = reinterpret_cast(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __cuda_atomic_bind_load<__proxy_t, __proxy_tag, _Sco, __atomic_cuda_mmio_disable> __bound_load{__ptr_proxy, __dst_proxy}; + __cuda_atomic_load_memory_order_dispatch(__bound_load, __memorder, _Sco{}); +} +template +static inline _CCCL_DEVICE void __atomic_load_cuda(const _Type volatile* __ptr, _Type& __dst, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + const __proxy_t* __ptr_proxy = reinterpret_cast(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __cuda_atomic_bind_load<__proxy_t, __proxy_tag, _Sco, __atomic_cuda_mmio_disable> __bound_load{__ptr_proxy, __dst_proxy}; + __cuda_atomic_load_memory_order_dispatch(__bound_load, __memorder, _Sco{}); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_block_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: 
__cuda_fetch_sub_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_sub_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) + +template +static inline _CCCL_DEVICE void __cuda_atomic_store_memory_order_dispatch(_Fn &__cuda_store, int __memorder, _Sco) { + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_RELEASE: __cuda_store(__atomic_cuda_release{}); break; + case __ATOMIC_SEQ_CST: __cuda_atomic_fence(_Sco{}, __atomic_cuda_seq_cst{}); _CCCL_FALLTHROUGH(); + case __ATOMIC_RELAXED: __cuda_store(__atomic_cuda_relaxed{}); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); + case __ATOMIC_SEQ_CST: __cuda_atomic_membar(_Sco{}); _CCCL_FALLTHROUGH(); + case __ATOMIC_RELAXED: __cuda_store(__atomic_cuda_volatile{}); break; + default: assert(0); + } ) - return __tmp; + ) } -template -_CCCL_DEVICE _Type* __atomic_fetch_add_cuda(_Type *volatile *__ptr, ptrdiff_t __val, int __memorder, __thread_scope_block_tag) { - _Type* __ret; - uint64_t __tmp = 0; - memcpy(&__tmp, &__val, 8); - __tmp *= sizeof(_Type); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_block(__ptr, __tmp, __tmp); break; - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(&__ret, &__tmp, 8); - return __ret; + +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.cta.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.cluster.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.gpu.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm 
volatile("st.release.sys.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.cta.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.cluster.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.gpu.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.sys.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("st.mmio.relaxed.sys.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.cta.b32 [%0],%1;" :: "l"(__ptr), "r"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.cluster.b32 [%0],%1;" :: "l"(__ptr), "r"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.gpu.b32 [%0],%1;" :: "l"(__ptr), "r"(__val) : "memory"); } +template +static inline _CCCL_DEVICE 
void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.sys.b32 [%0],%1;" :: "l"(__ptr), "r"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.cta.b32 [%0],%1;" :: "l"(__ptr), "r"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.cluster.b32 [%0],%1;" :: "l"(__ptr), "r"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.gpu.b32 [%0],%1;" :: "l"(__ptr), "r"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.sys.b32 [%0],%1;" :: "l"(__ptr), "r"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("st.mmio.relaxed.sys.b32 [%0],%1;" :: "l"(__ptr), "r"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b32 [%0],%1;" :: "l"(__ptr), "r"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b32 [%0],%1;" :: "l"(__ptr), "r"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b32 [%0],%1;" :: "l"(__ptr), "r"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b32 [%0],%1;" :: "l"(__ptr), "r"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.cta.b64 [%0],%1;" :: "l"(__ptr), "l"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.cluster.b64 [%0],%1;" :: "l"(__ptr), "l"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b64, 
__thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.gpu.b64 [%0],%1;" :: "l"(__ptr), "l"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.sys.b64 [%0],%1;" :: "l"(__ptr), "l"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.cta.b64 [%0],%1;" :: "l"(__ptr), "l"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.cluster.b64 [%0],%1;" :: "l"(__ptr), "l"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.gpu.b64 [%0],%1;" :: "l"(__ptr), "l"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.sys.b64 [%0],%1;" :: "l"(__ptr), "l"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("st.mmio.relaxed.sys.b64 [%0],%1;" :: "l"(__ptr), "l"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b64 [%0],%1;" :: "l"(__ptr), "l"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b64 [%0],%1;" :: "l"(__ptr), "l"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b64 [%0],%1;" :: "l"(__ptr), "l"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b64 [%0],%1;" :: "l"(__ptr), "l"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _v; + mov.b128 {%1, %2}, _v; + st.release.cta.b128 [%0],_v; +)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_cluster_tag, 
__atomic_cuda_mmio_disable)
+{
+  asm volatile(R"YYY(
+    .reg .b128 _v;
+    mov.b128 _v, {%1, %2};
+    st.release.cluster.b128 [%0],_v;
+)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
+}
+template <class _Type>
+static inline _CCCL_DEVICE void __cuda_atomic_store(
+  _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_device_tag, __atomic_cuda_mmio_disable)
+{
+  asm volatile(R"YYY(
+    .reg .b128 _v;
+    mov.b128 _v, {%1, %2};
+    st.release.gpu.b128 [%0],_v;
+)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
+}
+template <class _Type>
+static inline _CCCL_DEVICE void __cuda_atomic_store(
+  _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_disable)
+{
+  asm volatile(R"YYY(
+    .reg .b128 _v;
+    mov.b128 _v, {%1, %2};
+    st.release.sys.b128 [%0],_v;
+)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
+}
+template <class _Type>
+static inline _CCCL_DEVICE void __cuda_atomic_store(
+  _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_block_tag, __atomic_cuda_mmio_disable)
+{
+  asm volatile(R"YYY(
+    .reg .b128 _v;
+    mov.b128 _v, {%1, %2};
+    st.relaxed.cta.b128 [%0],_v;
+)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
+}
+template <class _Type>
+static inline _CCCL_DEVICE void __cuda_atomic_store(
+  _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable)
+{
+  asm volatile(R"YYY(
+    .reg .b128 _v;
+    mov.b128 _v, {%1, %2};
+    st.relaxed.cluster.b128 [%0],_v;
+)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
+}
+template <class _Type>
+static inline _CCCL_DEVICE void __cuda_atomic_store(
+  _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_device_tag, __atomic_cuda_mmio_disable)
+{
+  asm volatile(R"YYY(
+    .reg .b128 _v;
+    mov.b128 _v, {%1, %2};
+    st.relaxed.gpu.b128 [%0],_v;
+)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
+}
+template <class _Type>
+static inline _CCCL_DEVICE void __cuda_atomic_store(
+  _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_disable)
+{
+  asm volatile(R"YYY(
+    .reg .b128 _v;
+    mov.b128 _v, {%1, %2};
+    st.relaxed.sys.b128 [%0],_v;
+)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
+}
+template <class _Type>
+static inline _CCCL_DEVICE void __cuda_atomic_store(
+  _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_enable)
+{
+  asm volatile(R"YYY(
+    .reg .b128 _v;
+    mov.b128 _v, {%1, %2};
+    st.mmio.relaxed.sys.b128 [%0],_v;
+)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
+}
+template <class _Type>
+static inline _CCCL_DEVICE void __cuda_atomic_store(
+  _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_block_tag, __atomic_cuda_mmio_disable)
+{
+  asm volatile(R"YYY(
+    .reg .b128 _v;
+    mov.b128 _v, {%1, %2};
+    st.volatile.b128 [%0],_v;
+)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
+}
+template <class _Type>
+static inline _CCCL_DEVICE void __cuda_atomic_store(
+  _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable)
+{
+  asm volatile(R"YYY(
+    .reg .b128 _v;
+    mov.b128 _v, {%1, %2};
+    st.volatile.b128 [%0],_v;
+)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
+}
+template <class _Type>
+static inline _CCCL_DEVICE void
__cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _v; + mov.b128 {%1, %2}, _v; + st.volatile.b128 [%0],_v; +)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ + asm volatile(R"YYY( + .reg .b128 _v; + mov.b128 {%1, %2}, _v; + st.volatile.b128 [%0],_v; +)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory"); } -template -_CCCL_DEVICE _Type* __atomic_fetch_sub_cuda(_Type *volatile *__ptr, ptrdiff_t __val, int __memorder, __thread_scope_block_tag) { - _Type* __ret; - uint64_t __tmp = 0; - memcpy(&__tmp, &__val, 8); - __tmp = -__tmp; - __tmp *= sizeof(_Type); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_block(__ptr, __tmp, __tmp); break; - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(&__ret, &__tmp, 8); - return __ret; + +template +struct __cuda_atomic_bind_store { + _Type* __ptr; + _Type* __val; + + template + inline _CCCL_DEVICE void operator()(_Atomic_Memorder) { + __cuda_atomic_store(__ptr, *__val, _Atomic_Memorder{}, _Tag{}, _Sco{}, _Mmio{}); + } +}; +template +static inline _CCCL_DEVICE void __atomic_store_cuda(_Type* __ptr, _Type& __val, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __val_proxy = reinterpret_cast<__proxy_t*>(&__val); + __cuda_atomic_bind_store<__proxy_t, __proxy_tag, _Sco, __atomic_cuda_mmio_disable> __bound_store{__ptr_proxy, __val_proxy}; + __cuda_atomic_store_memory_order_dispatch(__bound_store, __memorder, _Sco{}); +} +template +static inline _CCCL_DEVICE void __atomic_store_cuda(volatile _Type* __ptr, _Type& __val, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __val_proxy = reinterpret_cast<__proxy_t*>(&__val); + __cuda_atomic_bind_store<__proxy_t, __proxy_tag, _Sco, __atomic_cuda_mmio_disable> __bound_store{__ptr_proxy, __val_proxy}; + __cuda_atomic_store_memory_order_dispatch(__bound_store, __memorder, 
_Sco{}); } -template -_CCCL_DEVICE _Type* __atomic_fetch_add_cuda(_Type **__ptr, ptrdiff_t __val, int __memorder, __thread_scope_block_tag) { - _Type* __ret; - uint64_t __tmp = 0; - memcpy(&__tmp, &__val, 8); - __tmp *= sizeof(_Type); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_block(__ptr, __tmp, __tmp); break; - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) + +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_swap_memory_order_dispatch(_Fn& __cuda_cas, int __success_memorder, int __failure_memorder, _Sco) { + bool __res = false; + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { + case __ATOMIC_SEQ_CST: __cuda_atomic_fence(_Sco{}, __atomic_cuda_seq_cst{}); _CCCL_FALLTHROUGH(); + case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQUIRE: __res = __cuda_cas(__atomic_cuda_acquire{}); break; + case __ATOMIC_ACQ_REL: __res = __cuda_cas(__atomic_cuda_acq_rel{}); break; + case __ATOMIC_RELEASE: __res = __cuda_cas(__atomic_cuda_release{}); break; + case __ATOMIC_RELAXED: __res = __cuda_cas(__atomic_cuda_relaxed{}); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { + case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQ_REL: __cuda_atomic_membar(_Sco{}); _CCCL_FALLTHROUGH(); + case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQUIRE: __res = __cuda_cas(__atomic_cuda_volatile{}); __cuda_atomic_membar(_Sco{}); break; + case __ATOMIC_RELEASE: __cuda_atomic_membar(_Sco{}); __res = __cuda_cas(__atomic_cuda_volatile{}); break; + case __ATOMIC_RELAXED: __res = __cuda_cas(__atomic_cuda_volatile{}); break; + default: assert(0); + } ) - memcpy(&__ret, &__tmp, 8); - return __ret; + ) + return __res; } -template -_CCCL_DEVICE _Type* __atomic_fetch_sub_cuda(_Type **__ptr, ptrdiff_t __val, int __memorder, __thread_scope_block_tag) { - _Type* __ret; - uint64_t __tmp = 0; - memcpy(&__tmp, &__val, 8); - __tmp = -__tmp; - __tmp *= sizeof(_Type); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_block(__ptr, __tmp, 
__tmp); break; - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_block(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_block(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(&__ret, &__tmp, 8); - return __ret; + +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_block_tag) +{ asm volatile("atom.cas.acquire.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.acquire.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_device_tag) +{ asm volatile("atom.cas.acquire.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_system_tag) +{ asm volatile("atom.cas.acquire.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_block_tag) +{ asm volatile("atom.cas.relaxed.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.relaxed.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_device_tag) +{ asm volatile("atom.cas.relaxed.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag) +{ asm volatile("atom.cas.relaxed.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& 
__dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_block_tag) +{ asm volatile("atom.cas.release.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.release.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_device_tag) +{ asm volatile("atom.cas.release.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_system_tag) +{ asm volatile("atom.cas.release.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_block_tag) +{ asm volatile("atom.cas.acq_rel.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.acq_rel.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_device_tag) +{ asm volatile("atom.cas.acq_rel.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_system_tag) +{ asm volatile("atom.cas.acq_rel.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_block_tag) +{ asm volatile("atom.cas.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type 
__op, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_device_tag) +{ asm volatile("atom.cas.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_system_tag) +{ asm volatile("atom.cas.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.cas.acquire.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.acquire.cluster.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.cas.acquire.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.cas.acquire.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.cas.relaxed.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.relaxed.cluster.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.cas.relaxed.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.cas.relaxed.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, 
__atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.cas.release.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.release.cluster.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.cas.release.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.cas.release.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.cas.acq_rel.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.acq_rel.cluster.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.cas.acq_rel.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.cas.acq_rel.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.cas.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.cluster.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, 
__atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.cas.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.cas.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr), "r"(__cmp), "r"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.cas.acquire.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.acquire.cluster.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.cas.acquire.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.cas.acquire.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.cas.relaxed.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.relaxed.cluster.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.cas.relaxed.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.cas.relaxed.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, 
__thread_scope_block_tag) +{ asm volatile("atom.cas.release.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.release.cluster.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.cas.release.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.cas.release.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.cas.acq_rel.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.acq_rel.cluster.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.cas.acq_rel.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.cas.acq_rel.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.cas.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.cas.cluster.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, 
__thread_scope_device_tag) +{ asm volatile("atom.cas.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.cas.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr), "l"(__cmp), "l"(__op) : "memory"); return __dst == __cmp; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_block_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.acquire.cta.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_cluster_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.acquire.cluster.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_device_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.acquire.gpu.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_system_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.acquire.sys.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_block_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.relaxed.cta.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_cluster_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg 
.b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.relaxed.cluster.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_device_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.relaxed.gpu.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_system_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.relaxed.sys.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_block_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.release.cta.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_cluster_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.release.cluster.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_device_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.release.gpu.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_system_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.release.sys.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), 
"l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_block_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.acq_rel.cta.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_cluster_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.acq_rel.cluster.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_device_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.acq_rel.gpu.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_system_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.acq_rel.sys.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_block_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.cta.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_cluster_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.cluster.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, 
_Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_device_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.gpu.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } +template +static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_system_tag) +{ + asm volatile(R"YYY( +.reg .b128 _d; +.reg .b128 _v; +mov.b128 {%0, %1}, _d; +mov.b128 {%4, %5}, _v; +atom.cas.sys.b128 _d,[%2],_d,_v; +mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; } + +template +struct __cuda_atomic_bind_compare_exchange { + _Type* __ptr; + _Type* __exp; + _Type* __des; + + template + inline _CCCL_DEVICE bool operator()(_Atomic_Memorder) { + return __cuda_atomic_compare_exchange(__ptr, *__exp, *__exp, *__des, _Atomic_Memorder{}, _Tag{}, _Sco{}); + } +}; +template +static inline _CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type* __ptr, _Type* __exp, _Type __des, bool, int __success_memorder, int __failure_memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __exp_proxy = reinterpret_cast<__proxy_t*>(__exp); + __proxy_t* __des_proxy = reinterpret_cast<__proxy_t*>(&__des); + __cuda_atomic_bind_compare_exchange<__proxy_t, __proxy_tag, _Sco> __bound_compare_swap{__ptr_proxy, __exp_proxy, __des_proxy}; + return __cuda_atomic_compare_swap_memory_order_dispatch(__bound_compare_swap, __success_memorder, __failure_memorder, _Sco{}); +} +template +static inline _CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type volatile* __ptr, _Type* __exp, _Type __des, bool, int __success_memorder, int __failure_memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __exp_proxy = reinterpret_cast<__proxy_t*>(__exp); + __proxy_t* __des_proxy = reinterpret_cast<__proxy_t*>(&__des); + __cuda_atomic_bind_compare_exchange<__proxy_t, __proxy_tag, _Sco> __bound_compare_swap{__ptr_proxy, __exp_proxy, __des_proxy}; + return __cuda_atomic_compare_swap_memory_order_dispatch(__bound_compare_swap, __success_memorder, __failure_memorder, _Sco{}); } -static inline _CCCL_DEVICE void __cuda_membar_device() { asm volatile("membar.gl;":::"memory"); } -static inline _CCCL_DEVICE void __cuda_fence_acq_rel_device() { asm volatile("fence.acq_rel.gpu;":::"memory"); } -static inline _CCCL_DEVICE void __cuda_fence_sc_device() { asm volatile("fence.sc.gpu;":::"memory"); } -static inline _CCCL_DEVICE void __atomic_thread_fence_cuda(int __memorder, __thread_scope_device_tag) { + +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange_memory_order_dispatch(_Fn& __cuda_exch, int __memorder, _Sco) { NV_DISPATCH_TARGET( NV_PROVIDES_SM_70, ( switch (__memorder) { - case __ATOMIC_SEQ_CST: 
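For reference, the `.b128` compare-exchange overloads added above all reduce to the same shape: pack the expected and desired values into 128-bit registers, issue `atom.cas....b128`, unpack the value observed at the address, and report success by comparing the observed halves against the expected ones; the `__cuda_atomic_bind_compare_exchange` functor that follows merely captures `__ptr`/`__exp`/`__des` so that a single runtime switch on the memory order can invoke the right statically-tagged overload. The snippet below is an illustrative, self-contained sketch of that pattern, not the patch's literal code: `u128_t` and `cas128_relaxed_device` are made-up names, the two-halves layout is an assumption, and 128-bit `atom.cas` requires 16-byte-aligned storage plus a GPU and PTX toolchain recent enough to support `.b128` atomics.

// Illustrative sketch only; not part of the patch. Hypothetical names throughout.
struct alignas(16) u128_t { unsigned long long __x, __y; };

__device__ bool cas128_relaxed_device(u128_t* __ptr, u128_t& __expected, u128_t __desired)
{
  u128_t __old;
  asm volatile(
    "{\n\t"
    ".reg .b128 _d;\n\t"
    ".reg .b128 _v;\n\t"
    "mov.b128 _d, {%3, %4};\n\t"                    // pack the expected value
    "mov.b128 _v, {%5, %6};\n\t"                    // pack the desired value
    "atom.cas.relaxed.gpu.b128 _d, [%2], _d, _v;\n\t"
    "mov.b128 {%0, %1}, _d;\n\t"                    // unpack the value observed at *__ptr
    "}"
    : "=l"(__old.__x), "=l"(__old.__y)
    : "l"(__ptr), "l"(__expected.__x), "l"(__expected.__y),
      "l"(__desired.__x), "l"(__desired.__y)
    : "memory");
  bool __ok = __old.__x == __expected.__x && __old.__y == __expected.__y;
  __expected = __old;                               // report the observed value, as compare_exchange does
  return __ok;
}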
__cuda_fence_sc_device(); break; + case __ATOMIC_SEQ_CST: __cuda_atomic_fence(_Sco{}, __atomic_cuda_seq_cst{}); _CCCL_FALLTHROUGH(); case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: _CCCL_FALLTHROUGH(); - case __ATOMIC_RELEASE: __cuda_fence_acq_rel_device(); break; - case __ATOMIC_RELAXED: break; + case __ATOMIC_ACQUIRE: __cuda_exch(__atomic_cuda_acquire{}); break; + case __ATOMIC_ACQ_REL: __cuda_exch(__atomic_cuda_acq_rel{}); break; + case __ATOMIC_RELEASE: __cuda_exch(__atomic_cuda_release{}); break; + case __ATOMIC_RELAXED: __cuda_exch(__atomic_cuda_relaxed{}); break; default: assert(0); } ), NV_IS_DEVICE, ( switch (__memorder) { case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); + case __ATOMIC_ACQ_REL: __cuda_atomic_membar(_Sco{}); _CCCL_FALLTHROUGH(); case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: _CCCL_FALLTHROUGH(); - case __ATOMIC_RELEASE: __cuda_membar_device(); break; - case __ATOMIC_RELAXED: break; + case __ATOMIC_ACQUIRE: __cuda_exch(__atomic_cuda_volatile{}); __cuda_atomic_membar(_Sco{}); break; + case __ATOMIC_RELEASE: __cuda_atomic_membar(_Sco{}); __cuda_exch(__atomic_cuda_volatile{}); break; + case __ATOMIC_RELAXED: __cuda_exch(__atomic_cuda_volatile{}); break; default: assert(0); } ) ) } -template static inline _CCCL_DEVICE void __cuda_load_acquire_32_device(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.acquire.gpu.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_load_relaxed_32_device(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.relaxed.gpu.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_load_volatile_32_device(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.volatile.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_load_cuda(const volatile _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_device_tag) { - uint32_t __tmp = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_acquire_32_device(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_32_device(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_volatile_32_device(__ptr, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_32_device(__ptr, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template = 0> -_CCCL_DEVICE void __atomic_load_cuda(const _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_device_tag) { - uint32_t __tmp = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_acquire_32_device(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_32_device(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case 
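The hunk above also replaces the old per-scope fence helpers (`__cuda_membar_device`, `__cuda_fence_acq_rel_device`, `__cuda_fence_sc_device`) with a single `__cuda_atomic_exchange_memory_order_dispatch`: the runtime `__ATOMIC_*` constant is mapped once onto a tag type, the tagged overload emits the matching `.relaxed`/`.acquire`/`.release`/`.acq_rel` instruction on sm_70+, and the pre-sm_70 branch keeps the old behaviour of a volatile access bracketed by `membar`. A reduced model of that tag-dispatch pattern, with hypothetical names and a hypothetical 0-3 order encoding (sm_70+ only), looks like this:

// Reduced model of the tag-dispatch pattern; not the library's actual types.
struct relaxed_t {}; struct acquire_t {}; struct release_t {}; struct acq_rel_t {};

struct bound_exchange {               // plays the role of the __cuda_atomic_bind_* functors
  unsigned long long* ptr;
  unsigned long long  val;
  unsigned long long  old;

  __device__ void operator()(relaxed_t) {
    asm volatile("atom.exch.relaxed.gpu.b64 %0,[%1],%2;" : "=l"(old) : "l"(ptr), "l"(val) : "memory");
  }
  __device__ void operator()(acquire_t) {
    asm volatile("atom.exch.acquire.gpu.b64 %0,[%1],%2;" : "=l"(old) : "l"(ptr), "l"(val) : "memory");
  }
  __device__ void operator()(release_t) {
    asm volatile("atom.exch.release.gpu.b64 %0,[%1],%2;" : "=l"(old) : "l"(ptr), "l"(val) : "memory");
  }
  __device__ void operator()(acq_rel_t) {
    asm volatile("atom.exch.acq_rel.gpu.b64 %0,[%1],%2;" : "=l"(old) : "l"(ptr), "l"(val) : "memory");
  }
};

// The runtime order (0=relaxed, 1=acquire, 2=release, 3=acq_rel) picks the tag once,
// instead of every generated operation carrying its own fence-laden switch.
template <class Fn>
__device__ void dispatch_order(Fn& fn, int order) {
  switch (order) {
    case 1: fn(acquire_t{}); break;
    case 2: fn(release_t{}); break;
    case 3: fn(acq_rel_t{}); break;
    default: fn(relaxed_t{}); break;
  }
}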
__ATOMIC_ACQUIRE: __cuda_load_volatile_32_device(__ptr, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_32_device(__ptr, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template static inline _CCCL_DEVICE void __cuda_load_acquire_64_device(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.acquire.gpu.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_load_relaxed_64_device(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.relaxed.gpu.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_load_volatile_64_device(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.volatile.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_load_cuda(const volatile _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_device_tag) { - uint64_t __tmp = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_acquire_64_device(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_64_device(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_volatile_64_device(__ptr, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_64_device(__ptr, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template = 0> -_CCCL_DEVICE void __atomic_load_cuda(const _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_device_tag) { - uint64_t __tmp = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_acquire_64_device(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_64_device(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_volatile_64_device(__ptr, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_64_device(__ptr, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template static inline _CCCL_DEVICE void __cuda_store_relaxed_32_device(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.relaxed.gpu.b32 [%0], %1;" :: "l"(__ptr),"r"(__src) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_store_release_32_device(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.release.gpu.b32 [%0], %1;" :: "l"(__ptr),"r"(__src) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_store_volatile_32_device(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.volatile.b32 [%0], %1;" :: "l"(__ptr),"r"(__src) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_store_cuda(volatile _Type *__ptr, _Type *__val, int __memorder, __thread_scope_device_tag) { - uint32_t __tmp = 0; - memcpy(&__tmp, __val, 4); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_RELEASE: __cuda_store_release_32_device(__ptr, 
__tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_32_device(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); - case __ATOMIC_SEQ_CST: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_volatile_32_device(__ptr, __tmp); break; - default: assert(0); - } - ) - ) -} -template = 0> -_CCCL_DEVICE void __atomic_store_cuda(_Type *__ptr, _Type *__val, int __memorder, __thread_scope_device_tag) { - uint32_t __tmp = 0; - memcpy(&__tmp, __val, 4); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_RELEASE: __cuda_store_release_32_device(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_32_device(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); - case __ATOMIC_SEQ_CST: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_volatile_32_device(__ptr, __tmp); break; - default: assert(0); - } - ) - ) -} -template static inline _CCCL_DEVICE void __cuda_store_relaxed_64_device(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.relaxed.gpu.b64 [%0], %1;" :: "l"(__ptr),"l"(__src) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_store_release_64_device(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.release.gpu.b64 [%0], %1;" :: "l"(__ptr),"l"(__src) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_store_volatile_64_device(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.volatile.b64 [%0], %1;" :: "l"(__ptr),"l"(__src) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_store_cuda(volatile _Type *__ptr, _Type *__val, int __memorder, __thread_scope_device_tag) { - uint64_t __tmp = 0; - memcpy(&__tmp, __val, 8); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_RELEASE: __cuda_store_release_64_device(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_64_device(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); - case __ATOMIC_SEQ_CST: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_volatile_64_device(__ptr, __tmp); break; - default: assert(0); - } - ) - ) -} -template = 0> -_CCCL_DEVICE void __atomic_store_cuda(_Type *__ptr, _Type *__val, int __memorder, __thread_scope_device_tag) { - uint64_t __tmp = 0; - memcpy(&__tmp, __val, 8); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_RELEASE: __cuda_store_release_64_device(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_64_device(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); - case __ATOMIC_SEQ_CST: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_volatile_64_device(__ptr, __tmp); break; - default: assert(0); - } - ) - ) -} -template static inline _CCCL_DEVICE void __cuda_fetch_and_acq_rel_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.acq_rel.gpu.b32 
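The deleted load/store helpers above show the pre-sm_70 fallback that the new dispatcher still has to provide on its `NV_IS_DEVICE` branch: without `.acquire`/`.release` qualifiers, the ordering is approximated by a volatile access bracketed with a device-scope `membar.gl`, and the value is shuttled through a `uint32_t`/`uint64_t` temporary via `memcpy` so the PTX only ever sees a plain integer register. A standalone sketch of that fallback (hypothetical function names):

// Sketch of the legacy pre-sm_70 fallback; not the patch's literal code.
__device__ unsigned legacy_load_acquire_device(const volatile unsigned* p)
{
  unsigned v;
  asm volatile("ld.volatile.b32 %0,[%1];" : "=r"(v) : "l"(p) : "memory");
  asm volatile("membar.gl;" ::: "memory");   // fence *after* the load approximates acquire
  return v;
}

__device__ void legacy_store_release_device(volatile unsigned* p, unsigned v)
{
  asm volatile("membar.gl;" ::: "memory");   // fence *before* the store approximates release
  asm volatile("st.volatile.b32 [%0],%1;" :: "l"(p), "r"(v) : "memory");
}

As in the deleted switches above, a seq_cst access additionally issues the fence before the operation.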
%0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_acquire_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.acquire.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_relaxed_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.relaxed.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_release_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.release.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_volatile_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_and_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_and_volatile_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_and_volatile_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_32_device(__ptr, __tmp, __tmp); break; - default: assert(0); 
- } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_or_acq_rel_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.acq_rel.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_acquire_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.acquire.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_relaxed_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.relaxed.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_release_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.release.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_volatile_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_or_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_or_volatile_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); 
__cuda_fetch_or_volatile_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_xor_acq_rel_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.acq_rel.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_acquire_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.acquire.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_relaxed_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.relaxed.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_release_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.release.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_volatile_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_xor_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_xor_volatile_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case 
__ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_xor_volatile_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acq_rel_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.gpu.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acquire_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.gpu.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_relaxed_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.gpu.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.gpu.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.gpu.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_f32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_f32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_f32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_f32_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_f32_device(__ptr, *__expected, __old, __desired); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_compare_exchange_volatile_f32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_f32_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(void *__ptr, _Type *__expected, const _Type __desired, bool, int 
__success_memorder, int __failure_memorder, __thread_scope_device_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_f32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_f32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_f32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_f32_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_f32_device(__ptr, *__expected, __old, __desired); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_compare_exchange_volatile_f32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_f32_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acq_rel_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acquire_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_relaxed_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_u32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_u32_device(__ptr, *__expected, __old, 
__desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_u32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_u32_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_u32_device(__ptr, *__expected, __old, __desired); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_compare_exchange_volatile_u32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_u32_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_u32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_u32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_u32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_u32_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_u32_device(__ptr, *__expected, __old, __desired); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_compare_exchange_volatile_u32_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_u32_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template static inline _CCCL_DEVICE void __cuda_exchange_acq_rel_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.gpu.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_acquire_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.gpu.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_relaxed_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.gpu.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_release_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.release.gpu.b32 %0,[%1],%2;" : "=f"(__dst) : 
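The f32/u32 compare-exchange helpers being removed here are the primitive the rest of the backend leans on: any read-modify-write that has no dedicated PTX instruction is synthesized as a CAS loop. A usage-level sketch of that idiom (not from the patch), written against libcu++'s public `cuda::atomic_ref`; `atomic_fetch_multiply` is a hypothetical helper name:

// Hypothetical example of building an RMW operation from compare-exchange.
#include <cuda/atomic>

__device__ float atomic_fetch_multiply(float* addr, float factor)
{
  cuda::atomic_ref<float, cuda::thread_scope_device> ref(*addr);
  float old = ref.load(cuda::std::memory_order_relaxed);
  // On failure, compare_exchange writes the observed value back into `old`,
  // so the loop retries with a fresh snapshot until the CAS succeeds.
  while (!ref.compare_exchange_weak(old, old * factor, cuda::std::memory_order_relaxed))
  {
  }
  return old;
}

With the `.b128` paths added earlier in this hunk, the same loop shape extends to 16-byte trivially copyable types on hardware that supports 128-bit atomics.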
"l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_volatile_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.gpu.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(volatile void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_device_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_f32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_f32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_exchange_volatile_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_f32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_device_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_f32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_f32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_exchange_volatile_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_f32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template static inline _CCCL_DEVICE void __cuda_exchange_acq_rel_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_acquire_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_relaxed_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.gpu.b32 
%0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_release_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.release.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_volatile_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(volatile void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_device_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_u32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_exchange_volatile_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_device_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_u32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_exchange_volatile_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template static inline _CCCL_DEVICE void __cuda_fetch_add_acq_rel_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_acquire_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm 
volatile("atom.add.acquire.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_relaxed_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.relaxed.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_release_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.release.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_volatile_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_f32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_f32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_f32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_f32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_f32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_f32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_add_acq_rel_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm 
volatile("atom.add.acq_rel.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_acquire_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acquire.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_relaxed_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.relaxed.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_release_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.release.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_volatile_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: 
__cuda_fetch_add_volatile_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_max_acq_rel_s32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acq_rel.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_acquire_s32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acquire.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_relaxed_s32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.relaxed.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_release_s32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.release.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_volatile_s32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_s32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_s32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_max_volatile_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_s32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_s32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: 
__cuda_fetch_max_volatile_s32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_max_volatile_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_s32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_max_acq_rel_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acq_rel.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_acquire_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acquire.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_relaxed_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.relaxed.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_release_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.release.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_volatile_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_u32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_max_volatile_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, 
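The max/min helpers being deleted here come in `.s32` and `.u32` flavours because PTX `atom.max`/`atom.min` are typed, so the generated code has to pick the signed or unsigned variant from the operand type. A minimal sketch of that split (hypothetical names, relaxed ordering, device scope, sm_70+):

// Sketch only: signed vs. unsigned atom.max selected by overload.
__device__ int atomic_max_relaxed_device(int* p, int v)
{
  int old;
  asm volatile("atom.max.relaxed.gpu.s32 %0,[%1],%2;" : "=r"(old) : "l"(p), "r"(v) : "memory");
  return old;
}

__device__ unsigned atomic_max_relaxed_device(unsigned* p, unsigned v)
{
  unsigned old;
  asm volatile("atom.max.relaxed.gpu.u32 %0,[%1],%2;" : "=r"(old) : "l"(p), "r"(v) : "memory");
  return old;
}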
( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_u32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_max_volatile_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_min_acq_rel_s32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acq_rel.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_acquire_s32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acquire.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_relaxed_s32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.relaxed.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_release_s32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.release.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_volatile_s32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_s32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_s32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_min_volatile_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_s32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_s32_device(__ptr, __tmp, __tmp); break; - case 
__ATOMIC_RELEASE: __cuda_fetch_min_release_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_s32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_s32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_min_volatile_s32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_s32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_min_acq_rel_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acq_rel.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_acquire_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acquire.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_relaxed_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.relaxed.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_release_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.release.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_volatile_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_u32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_min_volatile_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case 
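The s32/u32 min and max wrappers above are, as far as I can tell, the device-scope backend behind the fetch_min/fetch_max extensions that libcu++ exposes on cuda::atomic and cuda::atomic_ref. A usage-level sketch, not part of the patch; the kernel and variable names are made up:

    #include <cuda/atomic>

    __global__ void track_minimum(int* slot, const int* values, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        cuda::atomic_ref<int, cuda::thread_scope_device> lowest(*slot);
        // Relaxed is enough for a pure min-reduction; stronger orders are only
        // needed when the result also publishes other memory.
        lowest.fetch_min(values[i], cuda::std::memory_order_relaxed);
      }
    }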
__ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_u32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_min_volatile_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acq_rel_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acq_rel.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acquire_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acquire.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_relaxed_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.relaxed.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_release_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.release.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_volatile_f32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_f32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_f32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_sub_volatile_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_f32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> 
-_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_f32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_f32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_sub_volatile_f32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_f32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acq_rel_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acq_rel.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acquire_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acquire.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_relaxed_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.relaxed.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_release_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.release.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_volatile_u32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_u32_device(__ptr, __tmp, __tmp); 
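The __cuda_fetch_sub_* wrappers in this hunk have no dedicated PTX instruction to map to: they negate the operand and issue atom.add instead. For unsigned integers that is exact because negation is modular; for f32/f64 it is plain IEEE negation. A minimal standalone sketch of the same trick, with a hypothetical name and assuming an SM70+ target for the .relaxed qualifier:

    __device__ unsigned my_fetch_sub_relaxed_device(unsigned* p, unsigned v) {
      unsigned old;
      unsigned neg = 0u - v;  // modular negation: adding -v (mod 2^32) subtracts v
      asm volatile("atom.add.relaxed.gpu.u32 %0,[%1],%2;"
                   : "=r"(old) : "l"(p), "r"(neg) : "memory");
      return old;  // value the location held before the subtraction
    }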
__cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_sub_volatile_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_u32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_sub_volatile_u32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_u32_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_and_acq_rel_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.acq_rel.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_acquire_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.acquire.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_relaxed_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.relaxed.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_release_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.release.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_volatile_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_and_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: 
_CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_and_volatile_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_and_volatile_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_or_acq_rel_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.acq_rel.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_acquire_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.acquire.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_relaxed_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.relaxed.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_release_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.release.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_volatile_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_or_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_64_device(__ptr, __tmp, __tmp); break; - case 
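A caller-level sketch for the 64-bit and/or wrappers, not part of the patch: atomically clearing a bit in a device-visible resource mask through libcu++'s public cuda::atomic_ref; names are illustrative.

    #include <cuda/atomic>

    __device__ void release_resource(unsigned long long* mask, int bit) {
      cuda::atomic_ref<unsigned long long, cuda::thread_scope_device> m(*mask);
      // Release ordering so work done while holding the resource is visible to
      // whichever thread later observes the cleared bit.
      m.fetch_and(~(1ull << bit), cuda::std::memory_order_release);
    }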
__ATOMIC_RELAXED: __cuda_fetch_or_relaxed_64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_or_volatile_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_or_volatile_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_xor_acq_rel_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.acq_rel.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_acquire_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.acquire.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_relaxed_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.relaxed.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_release_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.release.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_volatile_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_xor_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_64_device(__ptr, __tmp, __tmp); break; - 
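The matching publish side, again only as an illustration of the sort of caller the b64 or wrappers serve: set a completion bit with fetch_or(release) and read it back with an acquire load.

    #include <cuda/atomic>

    __device__ void mark_done(unsigned long long* flags, int slot) {
      cuda::atomic_ref<unsigned long long, cuda::thread_scope_device> f(*flags);
      f.fetch_or(1ull << slot, cuda::std::memory_order_release);
    }

    __device__ bool is_done(unsigned long long* flags, int slot) {
      cuda::atomic_ref<unsigned long long, cuda::thread_scope_device> f(*flags);
      return (f.load(cuda::std::memory_order_acquire) >> slot) & 1ull;
    }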
case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_xor_volatile_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_xor_volatile_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acq_rel_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.gpu.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acquire_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.gpu.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_relaxed_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.gpu.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.gpu.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.gpu.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE bool 
__atomic_compare_exchange_cuda(volatile void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_f64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_f64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_f64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_f64_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_f64_device(__ptr, *__expected, __old, __desired); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_compare_exchange_volatile_f64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_f64_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_f64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_f64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_f64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_f64_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_f64_device(__ptr, *__expected, __old, __desired); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_compare_exchange_volatile_f64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_f64_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acq_rel_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : 
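The f64 compare-exchange wrappers above reuse atom.cas.b64 with double register constraints, since CAS only cares about the bit pattern. At the caller level that enables read-modify-write loops for operations with no native atomic; a hedged sketch against libcu++'s public <cuda/atomic> interface (the function name is made up):

    #include <cuda/atomic>

    __device__ double atomic_scale(double* p, double factor) {
      cuda::atomic_ref<double, cuda::thread_scope_device> a(*p);
      double expected = a.load(cuda::std::memory_order_relaxed);
      // On failure compare_exchange_weak refreshes `expected` with the current
      // value, so the loop simply retries against up-to-date data.
      while (!a.compare_exchange_weak(expected, expected * factor,
                                      cuda::std::memory_order_relaxed)) {
      }
      return expected;  // the value that was actually replaced
    }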
"l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acquire_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_relaxed_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_u64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_u64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_u64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_u64_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_u64_device(__ptr, *__expected, __old, __desired); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_compare_exchange_volatile_u64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_u64_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_u64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_u64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_u64_device(__ptr, 
*__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_u64_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_u64_device(__ptr, *__expected, __old, __desired); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_compare_exchange_volatile_u64_device(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_u64_device(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template static inline _CCCL_DEVICE void __cuda_exchange_acq_rel_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.gpu.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_acquire_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.gpu.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_relaxed_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.gpu.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_release_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.release.gpu.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_volatile_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.gpu.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(volatile void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_device_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_f64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_f64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_exchange_volatile_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_f64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_device_tag) { - _Type __tmp = *__val; - 
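exchange() is the simplest of these read-modify-writes: swap in a new 64-bit value and get the old one back in a single atomic step. A usage sketch, not part of the patch and with invented names, for claiming a shared slot:

    #include <cuda/atomic>

    __device__ unsigned long long claim_slot(unsigned long long* slot,
                                             unsigned long long mine) {
      cuda::atomic_ref<unsigned long long, cuda::thread_scope_device> s(*slot);
      // acq_rel: publish `mine` to later readers and observe what the previous
      // owner left behind.
      return s.exchange(mine, cuda::std::memory_order_acq_rel);
    }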
NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_f64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_f64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_exchange_volatile_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_f64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template static inline _CCCL_DEVICE void __cuda_exchange_acq_rel_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_acquire_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_relaxed_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_release_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.release.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_volatile_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(volatile void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_device_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_exchange_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_u64_device(__ptr, __tmp, 
__tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_device_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_exchange_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template static inline _CCCL_DEVICE void __cuda_fetch_add_acq_rel_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_acquire_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acquire.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_relaxed_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.relaxed.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_release_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.release.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_volatile_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_f64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: 
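The f64 add wrappers above are what a device-scope floating-point fetch_add lowers to. A typical caller, sketched against the public <cuda/atomic> interface (kernel name and indexing are assumptions):

    #include <cuda/atomic>

    __global__ void accumulate(double* total, const double* partials, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        cuda::atomic_ref<double, cuda::thread_scope_device> t(*total);
        // Relaxed ordering: only the final numeric value is communicated.
        t.fetch_add(partials[i], cuda::std::memory_order_relaxed);
      }
    }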
__cuda_fetch_add_volatile_f64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_f64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_f64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_f64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_f64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_add_acq_rel_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_acquire_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acquire.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_relaxed_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.relaxed.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_release_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.release.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_volatile_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, 
( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_max_acq_rel_s64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acq_rel.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_acquire_s64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acquire.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_relaxed_s64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.relaxed.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_release_s64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.release.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_volatile_s64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_s64_device(__ptr, __tmp, __tmp); break; - case 
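For the u64 add wrappers, the canonical use is a monotonically increasing counter; relaxed ordering suffices because the ticket value itself is the only thing being communicated. Illustrative only, names invented:

    #include <cuda/atomic>

    __device__ unsigned long long next_ticket(unsigned long long* counter) {
      cuda::atomic_ref<unsigned long long, cuda::thread_scope_device> c(*counter);
      return c.fetch_add(1, cuda::std::memory_order_relaxed);
    }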
__ATOMIC_RELEASE: __cuda_fetch_max_release_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_s64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_s64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_max_volatile_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_s64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_s64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_s64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_max_volatile_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_s64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_max_acq_rel_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acq_rel.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_acquire_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acquire.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_relaxed_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.relaxed.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_release_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.release.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_volatile_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case 
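Likewise, the 64-bit max wrappers back the fetch_max extension; a small usage sketch (hypothetical names) for recording a high-water mark such as the largest queue depth any thread has observed:

    #include <cuda/atomic>

    __device__ void record_high_water(long long* high, long long observed) {
      cuda::atomic_ref<long long, cuda::thread_scope_device> h(*high);
      h.fetch_max(observed, cuda::std::memory_order_relaxed);
    }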
__ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_max_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_max_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_min_acq_rel_s64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acq_rel.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_acquire_s64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acquire.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_relaxed_s64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.relaxed.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_release_s64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.release.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_volatile_s64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int 
__memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_s64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_s64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_min_volatile_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_s64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_s64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_s64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_min_volatile_s64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_s64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_min_acq_rel_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acq_rel.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_acquire_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acquire.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_relaxed_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.relaxed.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_release_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.release.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_volatile_u64_device(_CUDA_A __ptr, _CUDA_B& 
__dst, _CUDA_C __op) { asm volatile("atom.min.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_min_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_min_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acq_rel_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acq_rel.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acquire_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acquire.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_relaxed_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.relaxed.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void 
__cuda_fetch_sub_release_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.release.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_volatile_f64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_f64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_f64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_sub_volatile_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_f64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_f64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_f64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_sub_volatile_f64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_f64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acq_rel_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acq_rel.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acquire_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acquire.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); 
} -template static inline _CCCL_DEVICE void __cuda_fetch_sub_relaxed_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.relaxed.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_release_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.release.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_volatile_u64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_sub_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_device_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_sub_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template -_CCCL_DEVICE _Type* __atomic_fetch_add_cuda(_Type *volatile *__ptr, ptrdiff_t __val, int __memorder, __thread_scope_device_tag) { - _Type* __ret; - uint64_t __tmp = 0; - memcpy(&__tmp, 
&__val, 8); - __tmp *= sizeof(_Type); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_device(__ptr, __tmp, __tmp); break; - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(&__ret, &__tmp, 8); - return __ret; -} -template -_CCCL_DEVICE _Type* __atomic_fetch_sub_cuda(_Type *volatile *__ptr, ptrdiff_t __val, int __memorder, __thread_scope_device_tag) { - _Type* __ret; - uint64_t __tmp = 0; - memcpy(&__tmp, &__val, 8); - __tmp = -__tmp; - __tmp *= sizeof(_Type); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_device(__ptr, __tmp, __tmp); break; - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(&__ret, &__tmp, 8); - return __ret; -} -template -_CCCL_DEVICE _Type* __atomic_fetch_add_cuda(_Type **__ptr, ptrdiff_t __val, int __memorder, __thread_scope_device_tag) { - _Type* __ret; - uint64_t __tmp = 0; - memcpy(&__tmp, &__val, 8); - __tmp *= sizeof(_Type); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_device(__ptr, __tmp, __tmp); break; - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); 
- case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(&__ret, &__tmp, 8); - return __ret; + +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_block_tag) +{ asm volatile("atom.exch.acquire.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.acquire.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_device_tag) +{ asm volatile("atom.exch.acquire.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_system_tag) +{ asm volatile("atom.exch.acquire.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_block_tag) +{ asm volatile("atom.exch.relaxed.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.relaxed.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_device_tag) +{ asm volatile("atom.exch.relaxed.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag) +{ asm volatile("atom.exch.relaxed.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_block_tag) +{ asm volatile("atom.exch.release.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.release.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, 
__atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_device_tag) +{ asm volatile("atom.exch.release.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_system_tag) +{ asm volatile("atom.exch.release.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_block_tag) +{ asm volatile("atom.exch.acq_rel.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.acq_rel.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_device_tag) +{ asm volatile("atom.exch.acq_rel.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_system_tag) +{ asm volatile("atom.exch.acq_rel.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_block_tag) +{ asm volatile("atom.exch.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_device_tag) +{ asm volatile("atom.exch.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_system_tag) +{ asm volatile("atom.exch.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.exch.acquire.cta.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.acquire.cluster.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, 
__atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.exch.acquire.gpu.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.exch.acquire.sys.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.exch.relaxed.cta.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.relaxed.cluster.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.exch.relaxed.gpu.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.exch.relaxed.sys.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.exch.release.cta.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.release.cluster.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.exch.release.gpu.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.exch.release.sys.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.exch.acq_rel.cta.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.acq_rel.cluster.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, 
_Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.exch.acq_rel.gpu.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.exch.acq_rel.sys.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.exch.cta.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.cluster.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.exch.gpu.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.exch.sys.b32 %0,[%1],%2;" : "=r"(__old) : "l"(__ptr), "r"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.exch.acquire.cta.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.acquire.cluster.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.exch.acquire.gpu.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.exch.acquire.sys.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.exch.relaxed.cta.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.relaxed.cluster.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& 
__old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.exch.relaxed.gpu.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.exch.relaxed.sys.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.exch.release.cta.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.release.cluster.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.exch.release.gpu.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.exch.release.sys.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.exch.acq_rel.cta.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.acq_rel.cluster.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.exch.acq_rel.gpu.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.exch.acq_rel.sys.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.exch.cta.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.exch.cluster.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, 
_Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.exch.gpu.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.exch.sys.b64 %0,[%1],%2;" : "=l"(__old) : "l"(__ptr), "l"(__new) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_block_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.acquire.cta.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_cluster_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.acquire.cluster.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_device_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.acquire.gpu.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_system_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.acquire.sys.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_block_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.relaxed.cta.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_cluster_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.relaxed.cluster.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_device_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.relaxed.gpu.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), 
"l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_system_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.relaxed.sys.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_block_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.release.cta.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_cluster_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.release.cluster.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_device_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.release.gpu.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_system_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.release.sys.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_block_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.acq_rel.cta.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_cluster_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.acq_rel.cluster.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_device_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.acq_rel.gpu.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : 
"memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_system_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.acq_rel.sys.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_block_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.cta.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_cluster_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.cluster.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_device_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.gpu.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); +} +template +static inline _CCCL_DEVICE void __cuda_atomic_exchange( + _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_system_tag) +{ + asm volatile(R"YYY( + .reg .b128 _d; + .reg .b128 _v; + mov.b128 {%3, %4}, _v; + atom.exch.sys.b128 _d,[%2],_v; + mov.b128 _d, {%0, %1}; +)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory"); } -template -_CCCL_DEVICE _Type* __atomic_fetch_sub_cuda(_Type **__ptr, ptrdiff_t __val, int __memorder, __thread_scope_device_tag) { - _Type* __ret; - uint64_t __tmp = 0; - memcpy(&__tmp, &__val, 8); - __tmp = -__tmp; - __tmp *= sizeof(_Type); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_device(__ptr, __tmp, __tmp); break; - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_device(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_device(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - 
   memcpy(&__ret, &__tmp, 8);
-  return __ret;
+
+template <class _Type, class _Tag, class _Sco>
+struct __cuda_atomic_bind_exchange {
+  _Type* __ptr;
+  _Type* __old;
+  _Type* __new;
+
+  template <class _Atomic_Memorder>
+  inline _CCCL_DEVICE void operator()(_Atomic_Memorder) {
+    __cuda_atomic_exchange(__ptr, *__old, *__new, _Atomic_Memorder{}, _Tag{}, _Sco{});
+  }
+};
+template <class _Type, class _Sco>
+static inline _CCCL_DEVICE void __atomic_exchange_cuda(_Type* __ptr, _Type& __old, _Type __new, int __memorder, _Sco)
+{
+  using __proxy_t   = typename __atomic_cuda_deduce_bitwise<_Type>::__type;
+  using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag;
+  __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr);
+  __proxy_t* __old_proxy = reinterpret_cast<__proxy_t*>(&__old);
+  __proxy_t* __new_proxy = reinterpret_cast<__proxy_t*>(&__new);
+  __cuda_atomic_bind_exchange<__proxy_t, __proxy_tag, _Sco> __bound_swap{__ptr_proxy, __old_proxy, __new_proxy};
+  __cuda_atomic_exchange_memory_order_dispatch(__bound_swap, __memorder, _Sco{});
+}
+template <class _Type, class _Sco>
+static inline _CCCL_DEVICE void __atomic_exchange_cuda(_Type volatile* __ptr, _Type& __old, _Type __new, int __memorder, _Sco)
+{
+  using __proxy_t   = typename __atomic_cuda_deduce_bitwise<_Type>::__type;
+  using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag;
+  __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr));
+  __proxy_t* __old_proxy = reinterpret_cast<__proxy_t*>(&__old);
+  __proxy_t* __new_proxy = reinterpret_cast<__proxy_t*>(&__new);
+  __cuda_atomic_bind_exchange<__proxy_t, __proxy_tag, _Sco> __bound_swap{__ptr_proxy, __old_proxy, __new_proxy};
+  __cuda_atomic_exchange_memory_order_dispatch(__bound_swap, __memorder, _Sco{});
 }
-static inline _CCCL_DEVICE void __cuda_membar_system() { asm volatile("membar.sys;":::"memory"); }
-static inline _CCCL_DEVICE void __cuda_fence_acq_rel_system() { asm volatile("fence.acq_rel.sys;":::"memory"); }
-static inline _CCCL_DEVICE void __cuda_fence_sc_system() { asm volatile("fence.sc.sys;":::"memory"); }
-static inline _CCCL_DEVICE void __atomic_thread_fence_cuda(int __memorder, __thread_scope_system_tag) {
+
+template <class _Fn, class _Sco>
+static inline _CCCL_DEVICE void __cuda_atomic_fetch_memory_order_dispatch(_Fn& __cuda_fetch, int __memorder, _Sco) {
   NV_DISPATCH_TARGET(
     NV_PROVIDES_SM_70, (
       switch (__memorder) {
-        case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); break;
+        case __ATOMIC_SEQ_CST: __cuda_atomic_fence(_Sco{}, __atomic_cuda_seq_cst{}); _CCCL_FALLTHROUGH();
         case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
-        case __ATOMIC_ACQUIRE: _CCCL_FALLTHROUGH();
-        case __ATOMIC_ACQ_REL: _CCCL_FALLTHROUGH();
-        case __ATOMIC_RELEASE: __cuda_fence_acq_rel_system(); break;
-        case __ATOMIC_RELAXED: break;
+        case __ATOMIC_ACQUIRE: __cuda_fetch(__atomic_cuda_acquire{}); break;
+        case __ATOMIC_ACQ_REL: __cuda_fetch(__atomic_cuda_acq_rel{}); break;
+        case __ATOMIC_RELEASE: __cuda_fetch(__atomic_cuda_release{}); break;
+        case __ATOMIC_RELAXED: __cuda_fetch(__atomic_cuda_relaxed{}); break;
         default: assert(0);
       }
     ),
     NV_IS_DEVICE, (
       switch (__memorder) {
         case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH();
+        case __ATOMIC_ACQ_REL: __cuda_atomic_membar(_Sco{}); _CCCL_FALLTHROUGH();
         case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();
-        case __ATOMIC_ACQUIRE: _CCCL_FALLTHROUGH();
-        case __ATOMIC_ACQ_REL: _CCCL_FALLTHROUGH();
-        case __ATOMIC_RELEASE: __cuda_membar_system(); break;
-        case __ATOMIC_RELAXED: break;
+        case __ATOMIC_ACQUIRE: __cuda_fetch(__atomic_cuda_volatile{}); __cuda_atomic_membar(_Sco{}); break;
+        case __ATOMIC_RELEASE: __cuda_atomic_membar(_Sco{});
__cuda_fetch(__atomic_cuda_volatile{}); break; + case __ATOMIC_RELAXED: __cuda_fetch(__atomic_cuda_volatile{}); break; default: assert(0); } ) ) } -template static inline _CCCL_DEVICE void __cuda_load_acquire_32_system(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.acquire.sys.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_load_relaxed_32_system(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.relaxed.sys.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_load_volatile_32_system(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.volatile.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_load_cuda(const volatile _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_system_tag) { - uint32_t __tmp = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_acquire_32_system(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_32_system(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_volatile_32_system(__ptr, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_32_system(__ptr, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template = 0> -_CCCL_DEVICE void __atomic_load_cuda(const _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_system_tag) { - uint32_t __tmp = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_acquire_32_system(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_32_system(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_volatile_32_system(__ptr, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_32_system(__ptr, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template static inline _CCCL_DEVICE void __cuda_load_acquire_64_system(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.acquire.sys.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_load_relaxed_64_system(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.relaxed.sys.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_load_volatile_64_system(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.volatile.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_load_cuda(const volatile _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_system_tag) { - uint64_t __tmp = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: 
__cuda_load_acquire_64_system(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_64_system(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_volatile_64_system(__ptr, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_64_system(__ptr, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template = 0> -_CCCL_DEVICE void __atomic_load_cuda(const _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_system_tag) { - uint64_t __tmp = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_acquire_64_system(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_64_system(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_load_volatile_64_system(__ptr, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_64_system(__ptr, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template static inline _CCCL_DEVICE void __cuda_store_relaxed_32_system(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.relaxed.sys.b32 [%0], %1;" :: "l"(__ptr),"r"(__src) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_store_release_32_system(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.release.sys.b32 [%0], %1;" :: "l"(__ptr),"r"(__src) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_store_volatile_32_system(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.volatile.b32 [%0], %1;" :: "l"(__ptr),"r"(__src) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_store_cuda(volatile _Type *__ptr, _Type *__val, int __memorder, __thread_scope_system_tag) { - uint32_t __tmp = 0; - memcpy(&__tmp, __val, 4); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_RELEASE: __cuda_store_release_32_system(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_32_system(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); - case __ATOMIC_SEQ_CST: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_volatile_32_system(__ptr, __tmp); break; - default: assert(0); - } - ) - ) -} -template = 0> -_CCCL_DEVICE void __atomic_store_cuda(_Type *__ptr, _Type *__val, int __memorder, __thread_scope_system_tag) { - uint32_t __tmp = 0; - memcpy(&__tmp, __val, 4); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_RELEASE: __cuda_store_release_32_system(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_32_system(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); - case __ATOMIC_SEQ_CST: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: 
__cuda_store_volatile_32_system(__ptr, __tmp); break; - default: assert(0); - } - ) - ) -} -template static inline _CCCL_DEVICE void __cuda_store_relaxed_64_system(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.relaxed.sys.b64 [%0], %1;" :: "l"(__ptr),"l"(__src) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_store_release_64_system(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.release.sys.b64 [%0], %1;" :: "l"(__ptr),"l"(__src) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_store_volatile_64_system(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.volatile.b64 [%0], %1;" :: "l"(__ptr),"l"(__src) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_store_cuda(volatile _Type *__ptr, _Type *__val, int __memorder, __thread_scope_system_tag) { - uint64_t __tmp = 0; - memcpy(&__tmp, __val, 8); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_RELEASE: __cuda_store_release_64_system(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_64_system(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); - case __ATOMIC_SEQ_CST: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_volatile_64_system(__ptr, __tmp); break; - default: assert(0); - } - ) - ) -} -template = 0> -_CCCL_DEVICE void __atomic_store_cuda(_Type *__ptr, _Type *__val, int __memorder, __thread_scope_system_tag) { - uint64_t __tmp = 0; - memcpy(&__tmp, __val, 8); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_RELEASE: __cuda_store_release_64_system(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_64_system(__ptr, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_RELEASE: _CCCL_FALLTHROUGH(); - case __ATOMIC_SEQ_CST: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_RELAXED: __cuda_store_volatile_64_system(__ptr, __tmp); break; - default: assert(0); - } - ) - ) -} -template static inline _CCCL_DEVICE void __cuda_fetch_and_acq_rel_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.acq_rel.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_acquire_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.acquire.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_relaxed_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.relaxed.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_release_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.release.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_volatile_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_and_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - 
NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_and_volatile_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_and_volatile_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_or_acq_rel_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.acq_rel.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_acquire_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.acquire.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_relaxed_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.relaxed.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_release_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.release.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_volatile_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } 
-template = 0> -_CCCL_DEVICE _Type __atomic_fetch_or_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_or_volatile_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_or_volatile_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_xor_acq_rel_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.acq_rel.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_acquire_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.acquire.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_relaxed_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.relaxed.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_release_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.release.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void 
__cuda_fetch_xor_volatile_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_xor_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_xor_volatile_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_xor_volatile_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acq_rel_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.sys.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acquire_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.sys.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_relaxed_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.sys.b32 %0,[%1],%2,%3;" : "=f"(__dst) : 
"l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.sys.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.sys.b32 %0,[%1],%2,%3;" : "=f"(__dst) : "l"(__ptr),"f"(__cmp),"f"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_f32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_f32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_f32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_f32_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_f32_system(__ptr, *__expected, __old, __desired); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_compare_exchange_volatile_f32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_f32_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_f32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_f32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_f32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_f32_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_f32_system(__ptr, *__expected, 
__old, __desired); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_compare_exchange_volatile_f32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_f32_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acq_rel_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acquire_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_relaxed_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_u32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_u32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_u32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_u32_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_u32_system(__ptr, *__expected, __old, __desired); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_compare_exchange_volatile_u32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_u32_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, 
__thread_scope_system_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_u32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_u32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_u32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_u32_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_u32_system(__ptr, *__expected, __old, __desired); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_compare_exchange_volatile_u32_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_u32_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template static inline _CCCL_DEVICE void __cuda_exchange_acq_rel_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.sys.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_acquire_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.sys.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_relaxed_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.sys.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_release_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.release.sys.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_volatile_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.sys.b32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(volatile void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_system_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_f32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case 
__ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_f32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_exchange_volatile_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_f32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_system_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_f32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_f32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_exchange_volatile_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_f32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template static inline _CCCL_DEVICE void __cuda_exchange_acq_rel_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_acquire_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_relaxed_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_release_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.release.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_volatile_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(volatile void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_system_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: 
__cuda_exchange_relaxed_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_u32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_exchange_volatile_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_system_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_u32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_exchange_volatile_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 4); -} -template static inline _CCCL_DEVICE void __cuda_fetch_add_acq_rel_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.sys.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_acquire_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acquire.sys.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_relaxed_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.relaxed.sys.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_release_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.release.sys.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_volatile_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.sys.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: 
__cuda_fetch_add_acquire_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_f32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_f32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_f32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_f32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_f32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_f32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_add_acq_rel_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_acquire_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acquire.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_relaxed_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.relaxed.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_release_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.release.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_volatile_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; 
- NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_max_acq_rel_s32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acq_rel.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_acquire_s32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acquire.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_relaxed_s32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.relaxed.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_release_s32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.release.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_volatile_s32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.sys.s32 %0,[%1],%2;" : 
"=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_s32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_s32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_max_volatile_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_s32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_s32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_s32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_max_volatile_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_s32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_max_acq_rel_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acq_rel.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_acquire_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acquire.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_relaxed_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.relaxed.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_release_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.release.sys.u32 %0,[%1],%2;" : 
"=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_volatile_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_u32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_max_volatile_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_u32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_max_volatile_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_min_acq_rel_s32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acq_rel.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_acquire_s32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acquire.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_relaxed_s32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.relaxed.sys.s32 %0,[%1],%2;" : "=r"(__dst) : 
"l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_release_s32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.release.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_volatile_s32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_s32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_s32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_min_volatile_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_s32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_s32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_s32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_min_volatile_s32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_s32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_min_acq_rel_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acq_rel.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_acquire_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acquire.sys.u32 %0,[%1],%2;" : "=r"(__dst) : 
"l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_relaxed_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.relaxed.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_release_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.release.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_volatile_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_u32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_min_volatile_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_u32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_min_volatile_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acq_rel_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acq_rel.sys.f32 %0,[%1],%2;" : "=f"(__dst) : 
"l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acquire_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acquire.sys.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_relaxed_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.relaxed.sys.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_release_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.release.sys.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_volatile_f32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.sys.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr),"f"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_f32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_f32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_sub_volatile_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_f32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_f32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_f32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_sub_volatile_f32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: 
__cuda_fetch_sub_volatile_f32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acq_rel_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acq_rel.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acquire_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acquire.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_relaxed_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.relaxed.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_release_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.release.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_volatile_u32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_u32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_sub_volatile_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case 
__ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_u32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_sub_volatile_u32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_u32_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_and_acq_rel_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.acq_rel.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_acquire_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.acquire.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_relaxed_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.relaxed.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_release_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.release.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_and_volatile_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.and.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_and_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_and_volatile_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_64_system(__ptr, __tmp, __tmp); break; 
- default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_and_volatile_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_or_acq_rel_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.acq_rel.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_acquire_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.acquire.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_relaxed_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.relaxed.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_release_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.release.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_or_volatile_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.or.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_or_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_or_volatile_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_64_system(__ptr, __tmp, __tmp); break; - 
case __ATOMIC_RELEASE: __cuda_fetch_or_release_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_or_volatile_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_xor_acq_rel_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.acq_rel.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_acquire_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.acquire.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_relaxed_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.relaxed.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_release_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.release.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_xor_volatile_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.xor.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_xor_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_xor_volatile_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: 
_CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_xor_volatile_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acq_rel_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.sys.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acquire_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.sys.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_relaxed_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.sys.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.sys.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.sys.b64 %0,[%1],%2,%3;" : "=d"(__dst) : "l"(__ptr),"d"(__cmp),"d"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_f64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_f64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_f64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_f64_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: 
__cuda_compare_exchange_volatile_f64_system(__ptr, *__expected, __old, __desired); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_compare_exchange_volatile_f64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_f64_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_f64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_f64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_f64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_f64_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_f64_system(__ptr, *__expected, __old, __desired); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_compare_exchange_volatile_f64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_f64_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acq_rel_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_acquire_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_relaxed_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile void *__ptr, _Type *__expected, const _Type 
__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_u64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_u64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_u64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_u64_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_u64_system(__ptr, *__expected, __old, __desired); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_compare_exchange_volatile_u64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_u64_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { - auto __old = *__expected; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_u64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_u64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_u64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_u64_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_u64_system(__ptr, *__expected, __old, __desired); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_compare_exchange_volatile_u64_system(__ptr, *__expected, __old, __desired); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_u64_system(__ptr, *__expected, __old, __desired); break; - default: assert(0); - } - ) - ) - return (__old == *__expected); -} -template static inline _CCCL_DEVICE void __cuda_exchange_acq_rel_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.sys.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_acquire_f64_system(_CUDA_A 
__ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.sys.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_relaxed_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.sys.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_release_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.release.sys.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_volatile_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.sys.b64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(volatile void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_system_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_f64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_f64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_exchange_volatile_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_f64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_system_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_f64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_f64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_exchange_volatile_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_f64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template static inline _CCCL_DEVICE void 
__cuda_exchange_acq_rel_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_acquire_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_relaxed_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_release_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.release.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_exchange_volatile_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(volatile void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_system_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_exchange_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template = 0> -_CCCL_DEVICE void __atomic_exchange_cuda(void *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_system_tag) { - _Type __tmp = *__val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); 
__cuda_exchange_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(__ret, &__tmp, 8); -} -template static inline _CCCL_DEVICE void __cuda_fetch_add_acq_rel_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_acquire_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acquire.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_relaxed_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.relaxed.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_release_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.release.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_volatile_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_f64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_f64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_f64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_f64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); 
_CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_f64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_f64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_add_acq_rel_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_acquire_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acquire.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_relaxed_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.relaxed.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_release_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.release.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_add_volatile_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: 
__cuda_fetch_add_relaxed_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_max_acq_rel_s64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acq_rel.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_acquire_s64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acquire.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_relaxed_s64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.relaxed.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_release_s64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.release.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_volatile_s64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_s64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_s64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_max_volatile_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_s64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_s64_system(__ptr, __tmp, __tmp); 
break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_s64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_s64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_max_volatile_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_s64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_max_acq_rel_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acq_rel.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_acquire_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.acquire.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_relaxed_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.relaxed.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_release_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.release.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_max_volatile_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.max.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_max_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch 
(__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_max_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_min_acq_rel_s64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acq_rel.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_acquire_s64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acquire.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_relaxed_s64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.relaxed.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_release_s64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.release.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_volatile_s64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_s64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_s64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_min_volatile_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_s64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return 
__tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_s64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_s64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_min_volatile_s64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_s64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_min_acq_rel_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acq_rel.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_acquire_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.acquire.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_relaxed_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.relaxed.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_release_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.release.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_min_volatile_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.min.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: 
__cuda_membar_system(); __cuda_fetch_min_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_min_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acq_rel_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acq_rel.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acquire_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acquire.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_relaxed_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.relaxed.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_release_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.release.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_volatile_f64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr),"d"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_f64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case 
__ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_f64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_sub_volatile_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_f64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; -} -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_f64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_f64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_sub_volatile_f64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_f64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; + +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_f32, __thread_scope_block_tag) +{ asm volatile("atom.add.acquire.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_f32, __thread_scope_block_tag) +{ asm volatile("atom.add.relaxed.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_f32, __thread_scope_block_tag) +{ asm volatile("atom.add.release.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_f32, __thread_scope_block_tag) +{ asm volatile("atom.add.acq_rel.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_f32, __thread_scope_block_tag) +{ asm volatile("atom.add.cta.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_f32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.acquire.cluster.f32 %0,[%1],%2;" : 
"=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_f32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.relaxed.cluster.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_f32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.release.cluster.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_f32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.acq_rel.cluster.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_f32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.cluster.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_f32, __thread_scope_device_tag) +{ asm volatile("atom.add.acquire.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_f32, __thread_scope_device_tag) +{ asm volatile("atom.add.relaxed.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_f32, __thread_scope_device_tag) +{ asm volatile("atom.add.release.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_f32, __thread_scope_device_tag) +{ asm volatile("atom.add.acq_rel.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_f32, __thread_scope_device_tag) +{ asm volatile("atom.add.gpu.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_f32, __thread_scope_system_tag) +{ asm volatile("atom.add.acquire.sys.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_f32, __thread_scope_system_tag) +{ asm volatile("atom.add.relaxed.sys.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_f32, __thread_scope_system_tag) +{ asm volatile("atom.add.release.sys.f32 %0,[%1],%2;" : 
"=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_f32, __thread_scope_system_tag) +{ asm volatile("atom.add.acq_rel.sys.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_f32, __thread_scope_system_tag) +{ asm volatile("atom.add.sys.f32 %0,[%1],%2;" : "=f"(__dst) : "l"(__ptr), "f"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_f64, __thread_scope_block_tag) +{ asm volatile("atom.add.acquire.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_f64, __thread_scope_block_tag) +{ asm volatile("atom.add.relaxed.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_f64, __thread_scope_block_tag) +{ asm volatile("atom.add.release.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_f64, __thread_scope_block_tag) +{ asm volatile("atom.add.acq_rel.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_f64, __thread_scope_block_tag) +{ asm volatile("atom.add.cta.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_f64, __thread_scope_cluster_tag) +{ asm volatile("atom.add.acquire.cluster.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_f64, __thread_scope_cluster_tag) +{ asm volatile("atom.add.relaxed.cluster.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_f64, __thread_scope_cluster_tag) +{ asm volatile("atom.add.release.cluster.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_f64, __thread_scope_cluster_tag) +{ asm volatile("atom.add.acq_rel.cluster.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_f64, __thread_scope_cluster_tag) +{ asm volatile("atom.add.cluster.f64 %0,[%1],%2;" : "=d"(__dst) : 
"l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_f64, __thread_scope_device_tag) +{ asm volatile("atom.add.acquire.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_f64, __thread_scope_device_tag) +{ asm volatile("atom.add.relaxed.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_f64, __thread_scope_device_tag) +{ asm volatile("atom.add.release.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_f64, __thread_scope_device_tag) +{ asm volatile("atom.add.acq_rel.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_f64, __thread_scope_device_tag) +{ asm volatile("atom.add.gpu.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_f64, __thread_scope_system_tag) +{ asm volatile("atom.add.acquire.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_f64, __thread_scope_system_tag) +{ asm volatile("atom.add.relaxed.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_f64, __thread_scope_system_tag) +{ asm volatile("atom.add.release.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_f64, __thread_scope_system_tag) +{ asm volatile("atom.add.acq_rel.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_f64, __thread_scope_system_tag) +{ asm volatile("atom.add.sys.f64 %0,[%1],%2;" : "=d"(__dst) : "l"(__ptr), "d"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.add.acquire.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.add.relaxed.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : 
"memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.add.release.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.add.acq_rel.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.add.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.acquire.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.relaxed.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.release.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.acq_rel.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.add.acquire.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.add.relaxed.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.add.release.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.add.acq_rel.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : 
"memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.add.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.add.acquire.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.add.relaxed.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.add.release.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.add.acq_rel.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.add.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.add.acquire.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.add.relaxed.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.add.release.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.add.acq_rel.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.add.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.add.acquire.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static 
inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.add.relaxed.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.add.release.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.add.acq_rel.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.add.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.add.acquire.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.add.relaxed.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.add.release.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.add.acq_rel.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.add.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.add.acquire.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.add.relaxed.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.add.release.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline 
_CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.add.acq_rel.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.add.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.add.acquire.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.add.relaxed.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.add.release.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.add.acq_rel.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.add.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.acquire.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.relaxed.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.release.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.acq_rel.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.add.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE 
void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.add.acquire.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.add.relaxed.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.add.release.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.add.acq_rel.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.add.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.add.acquire.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.add.relaxed.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.add.release.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.add.acq_rel.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.add.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } + +template +struct __cuda_atomic_bind_fetch_add { + _Type* __ptr; + _Type* __dst; + _Type* __op; + + template + inline _CCCL_DEVICE void operator()(_Atomic_Memorder) { + __cuda_atomic_fetch_add(__ptr, *__dst, *__op, _Atomic_Memorder{}, _Tag{}, _Sco{}); + } +}; +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + constexpr auto __skip_v = __atomic_ptr_skip_t<_Type>::__skip; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_arithmetic<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_arithmetic<_Type>::__tag; + _Type __dst{}; + 
__proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_add<__proxy_t, __proxy_tag, _Sco> __bound_add{__ptr_proxy, __dst_proxy, __op_proxy}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_add, __memorder, _Sco{}); + return __dst; +} +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{ + constexpr auto __skip_v = __atomic_ptr_skip_t<_Type>::__skip; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_arithmetic<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_arithmetic<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_add<__proxy_t, __proxy_tag, _Sco> __bound_add{__ptr_proxy, __dst_proxy, __op_proxy}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_add, __memorder, _Sco{}); + return __dst; } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acq_rel_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acq_rel.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_acquire_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.acquire.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_relaxed_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.relaxed.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_release_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.release.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template static inline _CCCL_DEVICE void __cuda_fetch_sub_volatile_u64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { __op = -__op; -asm volatile("atom.add.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); 
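// The per-order, per-scope helpers being removed here (the __cuda_fetch_sub_*_u64_system
// family and their NV_DISPATCH_TARGET switches) are replaced by the tag-dispatched
// machinery above: __atomic_fetch_add_cuda scales the operand for pointer element types
// via __atomic_ptr_skip_t, reinterprets the storage through the proxy type from
// __atomic_cuda_deduce_arithmetic, and hands the bound functor to
// __cuda_atomic_fetch_memory_order_dispatch, which turns the runtime __memorder into one
// of the memory-order tags.
// Minimal illustrative sketch (device code only; __counter and the constant operand are
// assumed, not taken from this patch):
//   int __old = __atomic_fetch_add_cuda(__counter, 1, __ATOMIC_RELAXED,
//                                       __thread_scope_device_tag{});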
__cuda_fetch_sub_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; + +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.and.acquire.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.and.relaxed.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.and.release.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.and.acq_rel.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.and.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.and.acquire.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.and.relaxed.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.and.release.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.and.acq_rel.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.and.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.and.acquire.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& 
__dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.and.relaxed.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.and.release.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.and.acq_rel.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.and.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.and.acquire.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.and.relaxed.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.and.release.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.and.acq_rel.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.and.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.and.acquire.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.and.relaxed.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.and.release.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, 
__atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.and.acq_rel.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.and.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.and.acquire.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.and.relaxed.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.and.release.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.and.acq_rel.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.and.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.and.acquire.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.and.relaxed.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.and.release.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.and.acq_rel.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.and.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, 
__atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.and.acquire.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.and.relaxed.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.and.release.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.and.acq_rel.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.and.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } + +template +struct __cuda_atomic_bind_fetch_and { + _Type* __ptr; + _Type* __dst; + _Type* __op; + + template + inline _CCCL_DEVICE void operator()(_Atomic_Memorder) { + __cuda_atomic_fetch_and(__ptr, *__dst, *__op, _Atomic_Memorder{}, _Tag{}, _Sco{}); + } +}; +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + constexpr auto __skip_v = 1; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_and<__proxy_t, __proxy_tag, _Sco> __bound_and{__ptr_proxy, __dst_proxy, __op_proxy}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_and, __memorder, _Sco{}); + return __dst; +} +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{ + constexpr auto __skip_v = 1; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_and<__proxy_t, __proxy_tag, _Sco> __bound_and{__ptr_proxy, __dst_proxy, __op_proxy}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_and, __memorder, _Sco{}); + return __dst; } -template = 0> -_CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type *__ptr, _Type __val, int __memorder, __thread_scope_system_tag) { - _Type __tmp = __val; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_u64_system(__ptr, 
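// __atomic_fetch_and_cuda mirrors the fetch_add front end but deduces its proxy type
// through __atomic_cuda_deduce_bitwise (b32/b64 operands) and applies no pointer
// scaling (__skip_v is fixed to 1).
// Minimal illustrative sketch (device code only; __mask is assumed):
//   unsigned __prev = __atomic_fetch_and_cuda(__mask, 0xFFu, __ATOMIC_ACQ_REL,
//                                             __thread_scope_block_tag{});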
__tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_sub_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - return __tmp; + +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.max.acquire.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.max.relaxed.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.max.release.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.max.acq_rel.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.max.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.max.acquire.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.max.relaxed.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.max.release.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.max.acq_rel.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : 
"memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.max.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.max.acquire.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.max.relaxed.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.max.release.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.max.acq_rel.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.max.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.max.acquire.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.max.relaxed.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.max.release.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.max.acq_rel.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.max.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.max.acquire.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static 
inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.max.relaxed.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.max.release.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.max.acq_rel.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.max.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.max.acquire.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.max.relaxed.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.max.release.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.max.acq_rel.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.max.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.max.acquire.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.max.relaxed.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.max.release.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline 
_CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.max.acq_rel.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.max.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.max.acquire.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.max.relaxed.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.max.release.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.max.acq_rel.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.max.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.max.acquire.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.max.relaxed.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.max.release.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.max.acq_rel.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.max.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void 
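// __cuda_atomic_fetch_max is emitted separately for unsigned (u32/u64) and signed
// (s32/s64) operand tags because PTX "atom.max" compares according to the stated
// operand type; the __atomic_fetch_max_cuda front end below selects the proxy type
// through __atomic_cuda_deduce_minmax.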
__cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.max.acquire.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.max.relaxed.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.max.release.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.max.acq_rel.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.max.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.max.acquire.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.max.relaxed.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.max.release.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.max.acq_rel.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.max.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.max.acquire.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.max.relaxed.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void 
__cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.max.release.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.max.acq_rel.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.max.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s64, __thread_scope_block_tag) +{ asm volatile("atom.max.acquire.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s64, __thread_scope_block_tag) +{ asm volatile("atom.max.relaxed.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s64, __thread_scope_block_tag) +{ asm volatile("atom.max.release.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s64, __thread_scope_block_tag) +{ asm volatile("atom.max.acq_rel.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s64, __thread_scope_block_tag) +{ asm volatile("atom.max.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s64, __thread_scope_cluster_tag) +{ asm volatile("atom.max.acquire.cluster.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s64, __thread_scope_cluster_tag) +{ asm volatile("atom.max.relaxed.cluster.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s64, __thread_scope_cluster_tag) +{ asm volatile("atom.max.release.cluster.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s64, __thread_scope_cluster_tag) +{ asm volatile("atom.max.acq_rel.cluster.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void 
__cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s64, __thread_scope_cluster_tag) +{ asm volatile("atom.max.cluster.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s64, __thread_scope_device_tag) +{ asm volatile("atom.max.acquire.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s64, __thread_scope_device_tag) +{ asm volatile("atom.max.relaxed.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s64, __thread_scope_device_tag) +{ asm volatile("atom.max.release.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s64, __thread_scope_device_tag) +{ asm volatile("atom.max.acq_rel.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s64, __thread_scope_device_tag) +{ asm volatile("atom.max.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s64, __thread_scope_system_tag) +{ asm volatile("atom.max.acquire.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s64, __thread_scope_system_tag) +{ asm volatile("atom.max.relaxed.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s64, __thread_scope_system_tag) +{ asm volatile("atom.max.release.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s64, __thread_scope_system_tag) +{ asm volatile("atom.max.acq_rel.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s64, __thread_scope_system_tag) +{ asm volatile("atom.max.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } + +template +struct __cuda_atomic_bind_fetch_max { + _Type* __ptr; + _Type* __dst; + _Type* __op; + + template + inline _CCCL_DEVICE void operator()(_Atomic_Memorder) { + __cuda_atomic_fetch_max(__ptr, *__dst, *__op, _Atomic_Memorder{}, _Tag{}, _Sco{}); + } +}; +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type* __ptr, _Up __op, int 
__memorder, _Sco) +{ + constexpr auto __skip_v = 1; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_minmax<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_minmax<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_max<__proxy_t, __proxy_tag, _Sco> __bound_max{__ptr_proxy, __dst_proxy, __op_proxy}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_max, __memorder, _Sco{}); + return __dst; +} +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{ + constexpr auto __skip_v = 1; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_minmax<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_minmax<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_max<__proxy_t, __proxy_tag, _Sco> __bound_max{__ptr_proxy, __dst_proxy, __op_proxy}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_max, __memorder, _Sco{}); + return __dst; } -template -_CCCL_DEVICE _Type* __atomic_fetch_add_cuda(_Type *volatile *__ptr, ptrdiff_t __val, int __memorder, __thread_scope_system_tag) { - _Type* __ret; - uint64_t __tmp = 0; - memcpy(&__tmp, &__val, 8); - __tmp *= sizeof(_Type); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_system(__ptr, __tmp, __tmp); break; - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(&__ret, &__tmp, 8); - return __ret; + +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.min.acquire.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.min.relaxed.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, 
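// Illustrative sketch of the dispatch pattern used by __atomic_fetch_max_cuda above: the PTX
// wrappers are chosen by tag types, but __memorder arrives as a runtime int, so the real code
// packs its arguments into __cuda_atomic_bind_fetch_max and lets
// __cuda_atomic_fetch_memory_order_dispatch switch on __memorder once and invoke the functor
// with the matching tag. The stand-alone analogue below uses hypothetical names (relaxed_t,
// acquire_t, dispatch_order_sketch) and only two orderings; the acquire case is merely
// approximated with a trailing fence, in the spirit of the pre-SM70 fallback visible in the
// removed code, and is not the library's actual lowering.
struct relaxed_t {};
struct acquire_t {};

__device__ inline int fetch_max_sketch(int* ptr, int op, relaxed_t)
{
  return atomicMax(ptr, op); // relaxed, device-scope max via the CUDA intrinsic
}
__device__ inline int fetch_max_sketch(int* ptr, int op, acquire_t)
{
  int old = atomicMax(ptr, op);
  __threadfence(); // approximate acquire ordering with a device-scope fence
  return old;
}

template <class Fn>
__device__ inline int dispatch_order_sketch(Fn&& fn, int memorder)
{
  switch (memorder)
  {
    case __ATOMIC_ACQUIRE:
      return fn(acquire_t{}); // the runtime constant becomes a compile-time tag here
    default:
      return fn(relaxed_t{});
  }
}

__device__ inline int fetch_max_dispatched_sketch(int* ptr, int op, int memorder)
{
  return dispatch_order_sketch([&](auto order) { return fetch_max_sketch(ptr, op, order); }, memorder);
}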
__atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.min.release.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.min.acq_rel.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_block_tag) +{ asm volatile("atom.min.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.min.acquire.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.min.relaxed.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.min.release.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.min.acq_rel.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_cluster_tag) +{ asm volatile("atom.min.cluster.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.min.acquire.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.min.relaxed.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.min.release.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.min.acq_rel.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, 
__atomic_cuda_operand_u32, __thread_scope_device_tag) +{ asm volatile("atom.min.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.min.acquire.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.min.relaxed.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.min.release.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.min.acq_rel.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u32, __thread_scope_system_tag) +{ asm volatile("atom.min.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.min.acquire.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.min.relaxed.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.min.release.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.min.acq_rel.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_block_tag) +{ asm volatile("atom.min.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.min.acquire.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, 
__thread_scope_cluster_tag) +{ asm volatile("atom.min.relaxed.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.min.release.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.min.acq_rel.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_cluster_tag) +{ asm volatile("atom.min.cluster.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.min.acquire.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.min.relaxed.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.min.release.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.min.acq_rel.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_device_tag) +{ asm volatile("atom.min.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.min.acquire.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.min.relaxed.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.min.release.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_u64, 
__thread_scope_system_tag) +{ asm volatile("atom.min.acq_rel.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_u64, __thread_scope_system_tag) +{ asm volatile("atom.min.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.min.acquire.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.min.relaxed.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.min.release.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.min.acq_rel.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_block_tag) +{ asm volatile("atom.min.cta.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.min.acquire.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.min.relaxed.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.min.release.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.min.acq_rel.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_cluster_tag) +{ asm volatile("atom.min.cluster.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s32, 
__thread_scope_device_tag) +{ asm volatile("atom.min.acquire.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.min.relaxed.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.min.release.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.min.acq_rel.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_device_tag) +{ asm volatile("atom.min.gpu.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.min.acquire.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.min.relaxed.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.min.release.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.min.acq_rel.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s32, __thread_scope_system_tag) +{ asm volatile("atom.min.sys.s32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s64, __thread_scope_block_tag) +{ asm volatile("atom.min.acquire.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s64, __thread_scope_block_tag) +{ asm volatile("atom.min.relaxed.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s64, __thread_scope_block_tag) +{ 
asm volatile("atom.min.release.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s64, __thread_scope_block_tag) +{ asm volatile("atom.min.acq_rel.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s64, __thread_scope_block_tag) +{ asm volatile("atom.min.cta.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s64, __thread_scope_cluster_tag) +{ asm volatile("atom.min.acquire.cluster.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s64, __thread_scope_cluster_tag) +{ asm volatile("atom.min.relaxed.cluster.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s64, __thread_scope_cluster_tag) +{ asm volatile("atom.min.release.cluster.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s64, __thread_scope_cluster_tag) +{ asm volatile("atom.min.acq_rel.cluster.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s64, __thread_scope_cluster_tag) +{ asm volatile("atom.min.cluster.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s64, __thread_scope_device_tag) +{ asm volatile("atom.min.acquire.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s64, __thread_scope_device_tag) +{ asm volatile("atom.min.relaxed.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s64, __thread_scope_device_tag) +{ asm volatile("atom.min.release.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s64, __thread_scope_device_tag) +{ asm volatile("atom.min.acq_rel.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s64, __thread_scope_device_tag) +{ asm 
volatile("atom.min.gpu.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_s64, __thread_scope_system_tag) +{ asm volatile("atom.min.acquire.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_s64, __thread_scope_system_tag) +{ asm volatile("atom.min.relaxed.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_s64, __thread_scope_system_tag) +{ asm volatile("atom.min.release.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_s64, __thread_scope_system_tag) +{ asm volatile("atom.min.acq_rel.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_s64, __thread_scope_system_tag) +{ asm volatile("atom.min.sys.s64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } + +template +struct __cuda_atomic_bind_fetch_min { + _Type* __ptr; + _Type* __dst; + _Type* __op; + + template + inline _CCCL_DEVICE void operator()(_Atomic_Memorder) { + __cuda_atomic_fetch_min(__ptr, *__dst, *__op, _Atomic_Memorder{}, _Tag{}, _Sco{}); + } +}; +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + constexpr auto __skip_v = 1; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_minmax<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_minmax<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_min<__proxy_t, __proxy_tag, _Sco> __bound_min{__ptr_proxy, __dst_proxy, __op_proxy}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_min, __memorder, _Sco{}); + return __dst; +} +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{ + constexpr auto __skip_v = 1; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_minmax<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_minmax<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_min<__proxy_t, __proxy_tag, _Sco> __bound_min{__ptr_proxy, __dst_proxy, __op_proxy}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_min, __memorder, _Sco{}); + return __dst; } -template -_CCCL_DEVICE _Type* __atomic_fetch_sub_cuda(_Type *volatile *__ptr, ptrdiff_t __val, int __memorder, __thread_scope_system_tag) { - _Type* __ret; - uint64_t __tmp = 0; - memcpy(&__tmp, &__val, 8); - __tmp 
= -__tmp; - __tmp *= sizeof(_Type); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_system(__ptr, __tmp, __tmp); break; - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(&__ret, &__tmp, 8); - return __ret; + +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.or.acquire.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.or.relaxed.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.or.release.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.or.acq_rel.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.or.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.or.acquire.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.or.relaxed.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.or.release.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template 
+static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.or.acq_rel.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.or.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.or.acquire.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.or.relaxed.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.or.release.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.or.acq_rel.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.or.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.or.acquire.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.or.relaxed.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.or.release.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.or.acq_rel.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.or.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void 
__cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.or.acquire.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.or.relaxed.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.or.release.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.or.acq_rel.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.or.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.or.acquire.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.or.relaxed.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.or.release.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.or.acq_rel.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.or.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.or.acquire.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.or.relaxed.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* 
__ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.or.release.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.or.acq_rel.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.or.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.or.acquire.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.or.relaxed.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.or.release.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.or.acq_rel.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.or.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } + +template +struct __cuda_atomic_bind_fetch_or { + _Type* __ptr; + _Type* __dst; + _Type* __op; + + template + inline _CCCL_DEVICE void operator()(_Atomic_Memorder) { + __cuda_atomic_fetch_or(__ptr, *__dst, *__op, _Atomic_Memorder{}, _Tag{}, _Sco{}); + } +}; +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + constexpr auto __skip_v = 1; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_or<__proxy_t, __proxy_tag, _Sco> __bound_or{__ptr_proxy, __dst_proxy, __op_proxy}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_or, __memorder, _Sco{}); + return __dst; +} +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{ + constexpr auto __skip_v = 1; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename 
__atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_or<__proxy_t, __proxy_tag, _Sco> __bound_or{__ptr_proxy, __dst_proxy, __op_proxy}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_or, __memorder, _Sco{}); + return __dst; } -template -_CCCL_DEVICE _Type* __atomic_fetch_add_cuda(_Type **__ptr, ptrdiff_t __val, int __memorder, __thread_scope_system_tag) { - _Type* __ret; - uint64_t __tmp = 0; - memcpy(&__tmp, &__val, 8); - __tmp *= sizeof(_Type); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_system(__ptr, __tmp, __tmp); break; - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(&__ret, &__tmp, 8); - return __ret; + +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.xor.acquire.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.xor.relaxed.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.xor.release.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.xor.acq_rel.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_block_tag) +{ asm volatile("atom.xor.cta.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm 
volatile("atom.xor.acquire.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.xor.relaxed.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.xor.release.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.xor.acq_rel.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_cluster_tag) +{ asm volatile("atom.xor.cluster.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.xor.acquire.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.xor.relaxed.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.xor.release.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.xor.acq_rel.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_device_tag) +{ asm volatile("atom.xor.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.xor.acquire.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.xor.relaxed.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm 
volatile("atom.xor.release.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.xor.acq_rel.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b32, __thread_scope_system_tag) +{ asm volatile("atom.xor.sys.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr), "r"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.xor.acquire.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.xor.relaxed.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.xor.release.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.xor.acq_rel.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_block_tag) +{ asm volatile("atom.xor.cta.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.xor.acquire.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.xor.relaxed.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.xor.release.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm volatile("atom.xor.acq_rel.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_cluster_tag) +{ asm 
volatile("atom.xor.cluster.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.xor.acquire.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.xor.relaxed.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.xor.release.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.xor.acq_rel.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_device_tag) +{ asm volatile("atom.xor.gpu.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.xor.acquire.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.xor.relaxed.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.xor.release.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.xor.acq_rel.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor( + _Type* __ptr, _Type& __dst, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b64, __thread_scope_system_tag) +{ asm volatile("atom.xor.sys.b64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr), "l"(__op) : "memory"); } + +template +struct __cuda_atomic_bind_fetch_xor { + _Type* __ptr; + _Type* __dst; + _Type* __op; + + template + inline _CCCL_DEVICE void operator()(_Atomic_Memorder) { + __cuda_atomic_fetch_xor(__ptr, *__dst, *__op, _Atomic_Memorder{}, _Tag{}, _Sco{}); + } +}; +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + constexpr auto __skip_v = 1; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + 
using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_xor<__proxy_t, __proxy_tag, _Sco> __bound_xor{__ptr_proxy, __dst_proxy, __op_proxy}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_xor, __memorder, _Sco{}); + return __dst; +} +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{ + constexpr auto __skip_v = 1; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + __cuda_atomic_bind_fetch_xor<__proxy_t, __proxy_tag, _Sco> __bound_xor{__ptr_proxy, __dst_proxy, __op_proxy}; + __cuda_atomic_fetch_memory_order_dispatch(__bound_xor, __memorder, _Sco{}); + return __dst; } -template -_CCCL_DEVICE _Type* __atomic_fetch_sub_cuda(_Type **__ptr, ptrdiff_t __val, int __memorder, __thread_scope_system_tag) { - _Type* __ret; - uint64_t __tmp = 0; - memcpy(&__tmp, &__val, 8); - __tmp = -__tmp; - __tmp *= sizeof(_Type); - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_u64_system(__ptr, __tmp, __tmp); break; - } - ), - NV_IS_DEVICE, ( - switch (__memorder) { - case __ATOMIC_SEQ_CST: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQ_REL: __cuda_membar_system(); _CCCL_FALLTHROUGH(); - case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH(); - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_u64_system(__ptr, __tmp, __tmp); break; - default: assert(0); - } - ) - ) - memcpy(&__ret, &__tmp, 8); - return __ret; + +template +static inline _CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + return __atomic_fetch_add_cuda(__ptr, -__op, __memorder, _Sco{}); +} +template +static inline _CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{ + return __atomic_fetch_add_cuda(__ptr, -__op, __memorder, _Sco{}); } #endif // defined(_CCCL_CUDA_COMPILER) diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h new file mode 100644 index 0000000000..861e9f7b08 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h @@ -0,0 +1,155 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library 
for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_HELPER_H +#define _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_HELPER_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +enum class __atomic_cuda_memorder +{ + _relaxed, + _release, + _acquire, + _acq_rel, + _seq_cst, + _volatile, +}; + +template <__atomic_cuda_memorder _Order> +using __atomic_cuda_memorder_tag = integral_constant<__atomic_cuda_memorder, _Order>; + +using __atomic_cuda_relaxed = __atomic_cuda_memorder_tag<__atomic_cuda_memorder::_relaxed>; +using __atomic_cuda_release = __atomic_cuda_memorder_tag<__atomic_cuda_memorder::_release>; +using __atomic_cuda_acquire = __atomic_cuda_memorder_tag<__atomic_cuda_memorder::_acquire>; +using __atomic_cuda_acq_rel = __atomic_cuda_memorder_tag<__atomic_cuda_memorder::_acq_rel>; +using __atomic_cuda_seq_cst = __atomic_cuda_memorder_tag<__atomic_cuda_memorder::_seq_cst>; +using __atomic_cuda_volatile = __atomic_cuda_memorder_tag<__atomic_cuda_memorder::_volatile>; + +template +using __atomic_cuda_mmio_tag = integral_constant; + +using __atomic_cuda_mmio_enable = __atomic_cuda_mmio_tag; +using __atomic_cuda_mmio_disable = __atomic_cuda_mmio_tag; + +enum class __atomic_cuda_operand +{ + _f, + _s, + _u, + _b, +}; + +template <__atomic_cuda_operand _Op, size_t _Size> +struct __atomic_cuda_operand_tag +{}; + +using __atomic_cuda_operand_f16 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_f, 16>; +using __atomic_cuda_operand_s16 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_s, 16>; +using __atomic_cuda_operand_u16 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_u, 16>; +using __atomic_cuda_operand_b16 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, 16>; +using __atomic_cuda_operand_f32 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_f, 32>; +using __atomic_cuda_operand_s32 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_s, 32>; +using __atomic_cuda_operand_u32 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_u, 32>; +using __atomic_cuda_operand_b32 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, 32>; +using __atomic_cuda_operand_f64 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_f, 64>; +using __atomic_cuda_operand_s64 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_s, 64>; +using __atomic_cuda_operand_u64 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_u, 64>; +using __atomic_cuda_operand_b64 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, 64>; +using __atomic_cuda_operand_f128 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_f, 128>; +using __atomic_cuda_operand_s128 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_s, 128>; +using __atomic_cuda_operand_u128 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_u, 128>; +using __atomic_cuda_operand_b128 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, 
128>; + +template +struct __atomic_cuda_operand_deduction +{ + using __type = _AtomicType; + using __tag = _OpTag; +}; + +struct __atomic_longlong2 +{ + uint64_t __x; + uint64_t __y; +}; + +template +using __atomic_cuda_deduce_bitwise = + _If, + _If, + _If, + __atomic_cuda_operand_deduction<__atomic_longlong2, __atomic_cuda_operand_b128>>>>; + +template +using __atomic_cuda_deduce_arithmetic = + _If<_CCCL_TRAIT(is_floating_point, _Type), + _If, + __atomic_cuda_operand_deduction>, + _If<_CCCL_TRAIT(is_signed, _Type), + _If, + __atomic_cuda_operand_deduction>, // There is no atom.add.s64 + _If, + __atomic_cuda_operand_deduction>>>; + +template +using __atomic_cuda_deduce_minmax = + _If<_CCCL_TRAIT(is_signed, _Type), + _If, + __atomic_cuda_operand_deduction>, + _If, + __atomic_cuda_operand_deduction>>; + +template +using __atomic_enable_if_native_bitwise = bool; + +template +using __atomic_enable_if_native_arithmetic = typename enable_if<_CCCL_TRAIT(is_scalar, _Type), bool>::type; + +template +using __atomic_enable_if_not_native_arithmetic = typename enable_if::type; + +template +using __atomic_enable_if_native_minmax = typename enable_if<_CCCL_TRAIT(is_integral, _Type), bool>::type; + +template +using __atomic_enable_if_not_native_minmax = typename enable_if::type; + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H diff --git a/libcudacxx/include/cuda/std/__atomic/functions/host.h b/libcudacxx/include/cuda/std/__atomic/functions/host.h index e6aeaa36fc..736fd0b0b0 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/host.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/host.h @@ -21,6 +21,7 @@ # pragma system_header #endif // no system header +#include #include #include #include @@ -109,30 +110,6 @@ inline bool __atomic_compare_exchange_weak_host( __atomic_failure_order_to_int(__failure)); } -template -struct __atomic_ptr_skip -{ - static constexpr auto __skip = 1; -}; - -template -struct __atomic_ptr_skip<_Tp*> -{ - static constexpr auto __skip = sizeof(_Tp); -}; - -// FIXME: Haven't figured out what the spec says about using arrays with -// atomic_fetch_add. Force a failure rather than creating bad behavior. 
-template -struct __atomic_ptr_skip<_Tp[]> -{}; -template -struct __atomic_ptr_skip<_Tp[n]> -{}; - -template -using __atomic_ptr_skip_t = __atomic_ptr_skip<__remove_cvref_t<_Tp>>; - template ::value, int> = 0> inline __remove_cv_t<_Tp> __atomic_fetch_add_host(_Tp* __a, _Td __delta, memory_order __order) { diff --git a/libcudacxx/include/cuda/std/__atomic/scopes.h b/libcudacxx/include/cuda/std/__atomic/scopes.h index 70af777d5c..22637c186e 100644 --- a/libcudacxx/include/cuda/std/__atomic/scopes.h +++ b/libcudacxx/include/cuda/std/__atomic/scopes.h @@ -44,6 +44,8 @@ struct __thread_scope_thread_tag {}; struct __thread_scope_block_tag {}; +struct __thread_scope_cluster_tag +{}; struct __thread_scope_device_tag {}; struct __thread_scope_system_tag diff --git a/libcudacxx/test/atomic_codegen/CMakeLists.txt b/libcudacxx/test/atomic_codegen/CMakeLists.txt index 856318015f..095fa41cf7 100644 --- a/libcudacxx/test/atomic_codegen/CMakeLists.txt +++ b/libcudacxx/test/atomic_codegen/CMakeLists.txt @@ -1,5 +1,4 @@ -# For every atomic API compile the TU and check if the SASS matches the expected result -add_custom_target(libcudacxx.test.atomic_codegen) +add_custom_target(libcudacxx.test.atomics.ptx) find_program(filecheck "FileCheck" REQUIRED) find_program(cuobjdump "cuobjdump" REQUIRED) @@ -7,6 +6,7 @@ find_program(bash "bash" REQUIRED) file(GLOB libcudacxx_atomic_codegen_tests "*.cu") +# For every atomic API compile the TU and check if the SASS/PTX matches the expected result foreach(test_path IN LISTS libcudacxx_atomic_codegen_tests) cmake_path(GET test_path FILENAME test_file) cmake_path(REMOVE_EXTENSION test_file LAST_ONLY OUTPUT_VARIABLE test_name) @@ -18,11 +18,11 @@ foreach(test_path IN LISTS libcudacxx_atomic_codegen_tests) ## Important for testing the local headers target_include_directories(atomic_codegen_${test_name} PRIVATE "${libcudacxx_SOURCE_DIR}/include") - add_dependencies(libcudacxx.test.atomic_codegen atomic_codegen_${test_name}) + add_dependencies(libcudacxx.test.atomics.ptx atomic_codegen_${test_name}) # Add output path to object directory add_custom_command( - TARGET libcudacxx.test.atomic_codegen + TARGET libcudacxx.test.atomics.ptx POST_BUILD COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/dump_and_check.bash $ ${test_path} SM8X ) diff --git a/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu index ff2850009f..d97636d647 100644 --- a/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu @@ -11,11 +11,11 @@ __global__ void add_relaxed_device_non_volatile(int* data, int* out, int n) ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*add_relaxed_device_non_volatile.*]]( ; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], [[[FUNCTION]]_param_1]; +; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], [[[FUNCTION]]_param_1]; ; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2]; -; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#EXPECTED]]; +; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#RESULT]]; ; SM8X-NEXT: // -; SM8X-NEXT: atom.add.relaxed.gpu.u32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]]; +; SM8X-NEXT: atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]]; ; SM8X-NEXT: // ; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; ; SM8X-NEXT: ret; diff --git a/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu 
b/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu index 12eb48622b..983c8e9fac 100644 --- a/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu @@ -13,7 +13,7 @@ __global__ void store_relaxed_device_non_volatile(int* data, int in) ; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; ; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_1]; ; SM8X-NEXT: // -; SM8X-NEXT: st.relaxed.gpu.b32 [%rd[[#ATOM]]], %r[[#INPUT]]; +; SM8X-NEXT: st.relaxed.gpu.b32 [%rd[[#ATOM]]],%r[[#INPUT]]; ; SM8X-NEXT: // ; SM8X-NEXT: ret; diff --git a/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu index d32696e826..9d1ffaefa1 100644 --- a/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu @@ -11,12 +11,12 @@ __global__ void sub_relaxed_device_non_volatile(int* data, int* out, int n) ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*sub_relaxed_device_non_volatile.*]]( ; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], [[[FUNCTION]]_param_1]; +; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], [[[FUNCTION]]_param_1]; ; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2]; -; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#EXPECTED]]; +; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#RESULT]]; ; SM8X-NEXT: neg.s32 %r[[#NEG:]], %r[[#INPUT]]; ; SM8X-NEXT: // -; SM8X-NEXT: atom.add.relaxed.gpu.u32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#NEG]]; +; SM8X-NEXT: atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#NEG]]; ; SM8X-NEXT: // ; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; ; SM8X-NEXT: ret; From 47b8f5ccdf46358b27fbf156b5dab509fc6ebdac Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Wed, 7 Aug 2024 12:40:56 -0700 Subject: [PATCH 08/33] [CUDAX] add `__launch_transform` to transform arguments to `cudax::launch` prior to launching the kernel (#2202) * add `__launch_transform` to transform arguments to `cudax::launch` prior to launching the kernel --- .../cuda/experimental/__detail/utility.cuh | 20 ++++ .../cuda/experimental/__launch/launch.cuh | 94 +++++++++++++------ .../__launch/launch_transform.cuh | 83 ++++++++++++++++ .../__utility/ensure_current_device.cuh | 2 +- cudax/test/launch/launch_smoke.cu | 56 +++++++++++ 5 files changed, 223 insertions(+), 32 deletions(-) create mode 100644 cudax/include/cuda/experimental/__launch/launch_transform.cuh diff --git a/cudax/include/cuda/experimental/__detail/utility.cuh b/cudax/include/cuda/experimental/__detail/utility.cuh index 874075b107..738a5d6244 100644 --- a/cudax/include/cuda/experimental/__detail/utility.cuh +++ b/cudax/include/cuda/experimental/__detail/utility.cuh @@ -11,8 +11,28 @@ #ifndef __CUDAX_DETAIL_UTILITY_H #define __CUDAX_DETAIL_UTILITY_H +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + namespace cuda::experimental { +namespace detail +{ +struct __ignore +{ + template + _CCCL_HOST_DEVICE constexpr __ignore(Args&&...) 
noexcept + {} +}; +} // namespace detail + struct uninit_t { explicit uninit_t() = default; diff --git a/cudax/include/cuda/experimental/__launch/launch.cuh b/cudax/include/cuda/experimental/__launch/launch.cuh index 1a49cafa40..f4aee8a173 100644 --- a/cudax/include/cuda/experimental/__launch/launch.cuh +++ b/cudax/include/cuda/experimental/__launch/launch.cuh @@ -16,6 +16,7 @@ #include #include +#include #include #if _CCCL_STD_VER >= 2017 @@ -120,18 +121,25 @@ template & conf, const Kernel& kernel, Args... args) { - [[maybe_unused]] __ensure_current_device __dev_setter(stream); + __ensure_current_device __dev_setter(stream); cudaError_t status; - if constexpr (::cuda::std::is_invocable_v, Args...>) + if constexpr (::cuda::std::is_invocable_v, as_kernel_arg_t...>) { - auto launcher = detail::kernel_launcher, Kernel, Args...>; - status = detail::launch_impl(stream, conf, launcher, conf, kernel, args...); + auto launcher = detail::kernel_launcher, Kernel, as_kernel_arg_t...>; + status = detail::launch_impl( + stream, + conf, + launcher, + conf, + kernel, + static_cast>(detail::__launch_transform(stream, args))...); } else { - static_assert(::cuda::std::is_invocable_v); - auto launcher = detail::kernel_launcher_no_config; - status = detail::launch_impl(stream, conf, launcher, kernel, args...); + static_assert(::cuda::std::is_invocable_v...>); + auto launcher = detail::kernel_launcher_no_config...>; + status = detail::launch_impl( + stream, conf, launcher, kernel, static_cast>(detail::__launch_transform(stream, args))...); } if (status != cudaSuccess) { @@ -183,18 +191,29 @@ void launch( template void launch(::cuda::stream_ref stream, const hierarchy_dimensions& dims, const Kernel& kernel, Args... args) { - [[maybe_unused]] __ensure_current_device __dev_setter(stream); + __ensure_current_device __dev_setter(stream); cudaError_t status; - if constexpr (::cuda::std::is_invocable_v, Args...>) + if constexpr (::cuda::std::is_invocable_v, as_kernel_arg_t...>) { - auto launcher = detail::kernel_launcher, Kernel, Args...>; - status = detail::launch_impl(stream, kernel_config(dims), launcher, dims, kernel, args...); + auto launcher = detail::kernel_launcher, Kernel, as_kernel_arg_t...>; + status = detail::launch_impl( + stream, + kernel_config(dims), + launcher, + dims, + kernel, + static_cast>(detail::__launch_transform(stream, args))...); } else { - static_assert(::cuda::std::is_invocable_v); - auto launcher = detail::kernel_launcher_no_config; - status = detail::launch_impl(stream, kernel_config(dims), launcher, kernel, args...); + static_assert(::cuda::std::is_invocable_v...>); + auto launcher = detail::kernel_launcher_no_config...>; + status = detail::launch_impl( + stream, + kernel_config(dims), + launcher, + kernel, + static_cast>(detail::__launch_transform(stream, args))...); } if (status != cudaSuccess) { @@ -248,10 +267,14 @@ void launch(::cuda::stream_ref stream, void (*kernel)(kernel_config, ExpArgs...), ActArgs&&... args) { - [[maybe_unused]] __ensure_current_device __dev_setter(stream); - cudaError_t status = [&](ExpArgs... 
args) { - return detail::launch_impl(stream, conf, kernel, conf, args...); - }(std::forward(args)...); + __ensure_current_device __dev_setter(stream); + cudaError_t status = detail::launch_impl( + stream, // + conf, + kernel, + conf, + static_cast>(detail::__launch_transform(stream, std::forward(args)))...); + if (status != cudaSuccess) { ::cuda::__throw_cuda_error(status, "Failed to launch a kernel"); @@ -303,10 +326,14 @@ void launch(::cuda::stream_ref stream, void (*kernel)(hierarchy_dimensions, ExpArgs...), ActArgs&&... args) { - [[maybe_unused]] __ensure_current_device __dev_setter(stream); - cudaError_t status = [&](ExpArgs... args) { - return detail::launch_impl(stream, kernel_config(dims), kernel, dims, args...); - }(std::forward(args)...); + __ensure_current_device __dev_setter(stream); + cudaError_t status = detail::launch_impl( + stream, + kernel_config(dims), + kernel, + dims, + static_cast>(detail::__launch_transform(stream, std::forward(args)))...); + if (status != cudaSuccess) { ::cuda::__throw_cuda_error(status, "Failed to launch a kernel"); @@ -320,7 +347,6 @@ void launch(::cuda::stream_ref stream, * Kernel function is a function with __global__ annotation. * Function might or might not accept the configuration as its first argument. * - * * @par Snippet * @code * #include @@ -359,10 +385,13 @@ void launch(::cuda::stream_ref stream, void (*kernel)(ExpArgs...), ActArgs&&... args) { - [[maybe_unused]] __ensure_current_device __dev_setter(stream); - cudaError_t status = [&](ExpArgs... args) { - return detail::launch_impl(stream, conf, kernel, args...); - }(std::forward(args)...); + __ensure_current_device __dev_setter(stream); + cudaError_t status = detail::launch_impl( + stream, // + conf, + kernel, + static_cast>(detail::__launch_transform(stream, std::forward(args)))...); + if (status != cudaSuccess) { ::cuda::__throw_cuda_error(status, "Failed to launch a kernel"); @@ -412,10 +441,13 @@ template void launch( ::cuda::stream_ref stream, const hierarchy_dimensions& dims, void (*kernel)(ExpArgs...), ActArgs&&... args) { - [[maybe_unused]] __ensure_current_device __dev_setter(stream); - cudaError_t status = [&](ExpArgs... args) { - return detail::launch_impl(stream, kernel_config(dims), kernel, args...); - }(std::forward(args)...); + __ensure_current_device __dev_setter(stream); + cudaError_t status = detail::launch_impl( + stream, + kernel_config(dims), + kernel, + static_cast>(detail::__launch_transform(stream, std::forward(args)))...); + if (status != cudaSuccess) { ::cuda::__throw_cuda_error(status, "Failed to launch a kernel"); diff --git a/cudax/include/cuda/experimental/__launch/launch_transform.cuh b/cudax/include/cuda/experimental/__launch/launch_transform.cuh new file mode 100644 index 0000000000..4692cf9376 --- /dev/null +++ b/cudax/include/cuda/experimental/__launch/launch_transform.cuh @@ -0,0 +1,83 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX__LAUNCH_LAUNCH_TRANSFORM +#define _CUDAX__LAUNCH_LAUNCH_TRANSFORM +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include + +#if _CCCL_STD_VER >= 2017 +namespace cuda::experimental +{ +namespace detail +{ +// Types should define overloads of __cudax_launch_transform that are find-able +// by ADL in order to customize how cudax::launch handles that type. The +// overload below, which simply returns the argument unmodified, is the overload +// that gets chosen if no other overload matches. It takes __ignore as the first +// argument to make this overload less preferred than other overloads that take +// a stream_ref as the first argument. +template +_CCCL_NODISCARD constexpr _Arg&& __cudax_launch_transform(__ignore, _Arg&& __arg) noexcept +{ + return _CUDA_VSTD::forward<_Arg>(__arg); +} + +template +using __launch_transform_direct_result_t = + decltype(__cudax_launch_transform(::cuda::stream_ref{}, _CUDA_VSTD::declval<_Arg>())); + +struct __fn +{ + template + _CCCL_NODISCARD __launch_transform_direct_result_t<_Arg> operator()(::cuda::stream_ref __stream, _Arg&& __arg) const + { + // This call is unqualified to allow ADL + return __cudax_launch_transform(__stream, _CUDA_VSTD::forward<_Arg>(__arg)); + } +}; + +template +struct __as_kernel_arg +{ + using type = _CUDA_VSTD::decay_t<__launch_transform_direct_result_t<_Arg>>; +}; + +template +struct __as_kernel_arg< + _Arg, + _CUDA_VSTD::void_t>::__as_kernel_arg>> +{ + using type = typename _CUDA_VSTD::decay_t<__launch_transform_direct_result_t<_Arg>>::__as_kernel_arg; +}; + +_CCCL_GLOBAL_CONSTANT __fn __launch_transform{}; +} // namespace detail + +template +using as_kernel_arg_t = typename detail::__as_kernel_arg<_Arg>::type; + +} // namespace cuda::experimental + +#endif // _CCCL_STD_VER >= 2017 +#endif // !_CUDAX__LAUNCH_LAUNCH_TRANSFORM diff --git a/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh b/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh index 2431d02818..839adafb96 100644 --- a/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh +++ b/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh @@ -33,7 +33,7 @@ namespace cuda::experimental //! @brief RAII helper which on construction sets the current device to the specified one or one a //! stream was created under. It sets the state back on destruction. //! -struct __ensure_current_device +struct [[maybe_unused]] __ensure_current_device { //! @brief Construct a new `__ensure_current_device` object and switch to the specified //! device. 
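Illustrative sketch only (not part of the patch): how a user-defined type can opt in to the ADL customization point added above. `my_lib`, `device_span_arg`, and `my_buffer` are hypothetical names introduced here for illustration; only `__cudax_launch_transform`, `cuda::stream_ref`, and `cudax::launch` come from the patch / CCCL. The smoke test in the following diff exercises the same mechanism with an `__as_kernel_arg` typedef; this minimal variant simply returns the kernel-argument type directly.

```cpp
#include <cstddef>

#include <cuda/stream_ref> // assumed libcu++ header providing cuda::stream_ref

namespace my_lib
{
// Trivially copyable view type that the kernel actually receives.
struct device_span_arg
{
  float* data;
  std::size_t size;
};

// Owning, host-side handle that users pass to cudax::launch.
struct my_buffer
{
  float* device_ptr;
  std::size_t count;

  // Found by ADL when cudax::launch transforms its arguments; the (decayed) result
  // type of this overload is what the kernel is actually invoked with.
  friend device_span_arg __cudax_launch_transform(::cuda::stream_ref, const my_buffer& buf) noexcept
  {
    return device_span_arg{buf.device_ptr, buf.count};
  }
};
} // namespace my_lib

// Usage, assuming `void kernel(my_lib::device_span_arg)` is a __global__ function
// and `config`/`stream` are a cudax kernel configuration and stream:
//   cudax::launch(stream, config, kernel, my_lib::my_buffer{ptr, n});
```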
diff --git a/cudax/test/launch/launch_smoke.cu b/cudax/test/launch/launch_smoke.cu index 810e65c390..29d84d2e7c 100644 --- a/cudax/test/launch/launch_smoke.cu +++ b/cudax/test/launch/launch_smoke.cu @@ -104,6 +104,50 @@ struct dynamic_smem_span } }; +struct launch_transform_to_int_convertible +{ + int value_; + + struct int_convertible + { + cudaStream_t stream_; + int value_; + + int_convertible(cudaStream_t stream, int value) noexcept + : stream_(stream) + , value_(value) + { + // Check that the constructor runs before the kernel is launched + CHECK_FALSE(kernel_run_proof); + } + + // Immovable to ensure that __launch_transform doesn't copy the returned + // object + int_convertible(int_convertible&&) = delete; + + ~int_convertible() noexcept + { + // Check that the destructor runs after the kernel is launched + CUDART(cudaStreamSynchronize(stream_)); + CHECK(kernel_run_proof); + } + + using __as_kernel_arg = int; + + // This is the value that will be passed to the kernel + explicit operator int() const + { + return value_; + } + }; + + _CCCL_NODISCARD_FRIEND int_convertible + __cudax_launch_transform(::cuda::stream_ref stream, launch_transform_to_int_convertible self) noexcept + { + return int_convertible(stream.get(), self.value_); + } +}; + // Needs a separe function for Windows extended lambda void launch_smoke_test() { @@ -127,10 +171,14 @@ void launch_smoke_test() check_kernel_run(stream); cudax::launch(stream, dims_or_conf, kernel_int_argument, 1); check_kernel_run(stream); + cudax::launch(stream, dims_or_conf, kernel_int_argument, launch_transform_to_int_convertible{1}); + check_kernel_run(stream); cudax::launch(stream, dims_or_conf, functor_int_argument(), dummy); check_kernel_run(stream); cudax::launch(stream, dims_or_conf, functor_int_argument(), 1); check_kernel_run(stream); + cudax::launch(stream, dims_or_conf, functor_int_argument(), launch_transform_to_int_convertible{1}); + check_kernel_run(stream); cudax::launch(stream, dims_or_conf, kernel_int_argument, 1U); check_kernel_run(stream); @@ -150,11 +198,15 @@ void launch_smoke_test() check_kernel_run(stream); cudax::launch(stream, config, functor_instance, ::cuda::std::move(grid_size)); check_kernel_run(stream); + cudax::launch(stream, config, functor_instance, launch_transform_to_int_convertible{grid_size}); + check_kernel_run(stream); cudax::launch(stream, config, kernel_instance, grid_size); check_kernel_run(stream); cudax::launch(stream, config, kernel_instance, ::cuda::std::move(grid_size)); check_kernel_run(stream); + cudax::launch(stream, config, kernel_instance, launch_transform_to_int_convertible{grid_size}); + check_kernel_run(stream); cudax::launch(stream, config, functor_instance, static_cast(grid_size)); check_kernel_run(stream); @@ -171,11 +223,15 @@ void launch_smoke_test() check_kernel_run(stream); cudax::launch(stream, dimensions, functor_instance, ::cuda::std::move(grid_size)); check_kernel_run(stream); + cudax::launch(stream, dimensions, functor_instance, launch_transform_to_int_convertible{grid_size}); + check_kernel_run(stream); cudax::launch(stream, dimensions, kernel_instance, grid_size); check_kernel_run(stream); cudax::launch(stream, dimensions, kernel_instance, ::cuda::std::move(grid_size)); check_kernel_run(stream); + cudax::launch(stream, dimensions, kernel_instance, launch_transform_to_int_convertible{grid_size}); + check_kernel_run(stream); cudax::launch(stream, dimensions, functor_instance, static_cast(grid_size)); check_kernel_run(stream); From 39fd05e334dda0c5e4f4f75cd7ac44591e3ffdcd Mon Sep 
17 00:00:00 2001 From: pciolkosz Date: Thu, 8 Aug 2024 00:32:06 -0700 Subject: [PATCH 09/33] Cleanup common testing headers and correct asserts in launch testing (#2204) * Cleanup common testing headers * Add test/common to cmake and fix formatting --- cudax/test/CMakeLists.txt | 1 + .../host_device.cuh} | 65 +---------------- cudax/test/common/testing.cuh | 73 +++++++++++++++++++ cudax/test/common/utility.cuh | 7 +- cudax/test/device/device_smoke.cu | 2 +- cudax/test/event/event_smoke.cu | 3 +- .../test/hierarchy/hierarchy_custom_types.cu | 2 +- cudax/test/hierarchy/hierarchy_smoke.cu | 2 +- cudax/test/launch/configuration.cu | 2 +- cudax/test/launch/launch_smoke.cu | 12 +-- cudax/test/stream/get_stream.cu | 2 +- cudax/test/stream/stream_smoke.cu | 2 +- cudax/test/utility/driver_api.cu | 2 +- cudax/test/utility/ensure_current_device.cu | 2 +- 14 files changed, 98 insertions(+), 79 deletions(-) rename cudax/test/{hierarchy/testing_common.cuh => common/host_device.cuh} (63%) create mode 100644 cudax/test/common/testing.cuh diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index 4752f8b964..cda6623668 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -26,6 +26,7 @@ function(cudax_add_catch2_test target_name_var test_name cn_target) # ARGN=test set(test_sources ${ARGN}) add_executable(${test_target} ${test_sources}) + target_include_directories(${test_target} PRIVATE "common") target_link_libraries(${test_target} PRIVATE ${cn_target} Catch2::Catch2 catch2_main) target_link_libraries(${test_target} PRIVATE ${cn_target} cudax::Thrust) target_compile_options(${test_target} PRIVATE "-DLIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE") diff --git a/cudax/test/hierarchy/testing_common.cuh b/cudax/test/common/host_device.cuh similarity index 63% rename from cudax/test/hierarchy/testing_common.cuh rename to cudax/test/common/host_device.cuh index 96dcef7369..b20ff7e923 100644 --- a/cudax/test/hierarchy/testing_common.cuh +++ b/cudax/test/common/host_device.cuh @@ -8,67 +8,10 @@ // //===----------------------------------------------------------------------===// -#ifndef __TESTING_COMMON_H__ -#define __TESTING_COMMON_H__ +#ifndef __COMMON_HOST_DEVICE_H__ +#define __COMMON_HOST_DEVICE_H__ -#include - -#include -#include -#include - -#include - -namespace cudax = cuda::experimental; - -#define CUDART(call) REQUIRE((call) == cudaSuccess) - -inline void __device__ cudax_require_impl( - bool condition, const char* condition_text, const char* filename, unsigned int linenum, const char* funcname) -{ - if (!condition) - { - // TODO do warp aggregate prints for easier readibility? 
- printf("%s:%u: %s: block: [%d,%d,%d], thread: [%d,%d,%d] Condition `%s` failed.\n", - filename, - linenum, - funcname, - blockIdx.x, - blockIdx.y, - blockIdx.z, - threadIdx.x, - threadIdx.y, - threadIdx.z, - condition_text); - __trap(); - } -} - -// TODO make it work on NVC++ -#ifdef __CUDA_ARCH__ -# define CUDAX_REQUIRE(condition) cudax_require_impl(condition, #condition, __FILE__, __LINE__, __PRETTY_FUNCTION__); -#else -# define CUDAX_REQUIRE REQUIRE -#endif - -bool constexpr __host__ __device__ operator==(const dim3& lhs, const dim3& rhs) -{ - return (lhs.x == rhs.x) && (lhs.y == rhs.y) && (lhs.z == rhs.z); -} - -namespace Catch -{ -template <> -struct StringMaker -{ - static std::string convert(dim3 const& dims) - { - std::ostringstream oss; - oss << "(" << dims.x << ", " << dims.y << ", " << dims.z << ")"; - return oss.str(); - } -}; -} // namespace Catch +#include "testing.cuh" template void __global__ lambda_launcher(const Dims dims, const Lambda lambda) @@ -155,4 +98,4 @@ void apply_each(const Fn& fn, const Tuple& tuple) tuple); } -#endif +#endif // __COMMON_HOST_DEVICE_H__ diff --git a/cudax/test/common/testing.cuh b/cudax/test/common/testing.cuh new file mode 100644 index 0000000000..ca4537fd78 --- /dev/null +++ b/cudax/test/common/testing.cuh @@ -0,0 +1,73 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __COMMON_TESTING_H__ +#define __COMMON_TESTING_H__ + +#include + +#include +#include +#include + +#include + +namespace cudax = cuda::experimental; + +#define CUDART(call) REQUIRE((call) == cudaSuccess) + +inline void __device__ cudax_require_impl( + bool condition, const char* condition_text, const char* filename, unsigned int linenum, const char* funcname) +{ + if (!condition) + { + // TODO do warp aggregate prints for easier readibility? 
+ printf("%s:%u: %s: block: [%d,%d,%d], thread: [%d,%d,%d] Condition `%s` failed.\n", + filename, + linenum, + funcname, + blockIdx.x, + blockIdx.y, + blockIdx.z, + threadIdx.x, + threadIdx.y, + threadIdx.z, + condition_text); + __trap(); + } +} + +// TODO make it work on NVC++ +#ifdef __CUDA_ARCH__ +# define CUDAX_REQUIRE(condition) cudax_require_impl(condition, #condition, __FILE__, __LINE__, __PRETTY_FUNCTION__); +#else +# define CUDAX_REQUIRE REQUIRE +#endif + +bool constexpr __host__ __device__ operator==(const dim3& lhs, const dim3& rhs) +{ + return (lhs.x == rhs.x) && (lhs.y == rhs.y) && (lhs.z == rhs.z); +} + +namespace Catch +{ +template <> +struct StringMaker +{ + static std::string convert(dim3 const& dims) + { + std::ostringstream oss; + oss << "(" << dims.x << ", " << dims.y << ", " << dims.z << ")"; + return oss.str(); + } +}; +} // namespace Catch + +#endif // __COMMON_TESTING_H__ diff --git a/cudax/test/common/utility.cuh b/cudax/test/common/utility.cuh index 64a54e1b48..991d8fd25f 100644 --- a/cudax/test/common/utility.cuh +++ b/cudax/test/common/utility.cuh @@ -8,6 +8,9 @@ // //===----------------------------------------------------------------------===// +#ifndef __COMMON_UTILITY_H__ +#define __COMMON_UTILITY_H__ + #include // cuda_runtime_api needs to come first @@ -18,8 +21,7 @@ #include // IWYU pragma: keep (needed for placement new) -// TODO unify the common testing header -#include "../hierarchy/testing_common.cuh" +#include "testing.cuh" namespace { @@ -174,3 +176,4 @@ inline void empty_driver_stack() } // namespace test } // namespace +#endif // __COMMON_UTILITY_H__ diff --git a/cudax/test/device/device_smoke.cu b/cudax/test/device/device_smoke.cu index b98d05fc3b..d13400c8db 100644 --- a/cudax/test/device/device_smoke.cu +++ b/cudax/test/device/device_smoke.cu @@ -10,8 +10,8 @@ #include -#include "../hierarchy/testing_common.cuh" #include "cuda/std/__type_traits/is_same.h" +#include namespace { diff --git a/cudax/test/event/event_smoke.cu b/cudax/test/event/event_smoke.cu index ae5286a4f7..ddf9b271d1 100644 --- a/cudax/test/event/event_smoke.cu +++ b/cudax/test/event/event_smoke.cu @@ -10,9 +10,8 @@ #include -#include "../common/utility.cuh" -#include "../hierarchy/testing_common.cuh" #include +#include namespace { diff --git a/cudax/test/hierarchy/hierarchy_custom_types.cu b/cudax/test/hierarchy/hierarchy_custom_types.cu index 5b06959eea..f35a4914ce 100644 --- a/cudax/test/hierarchy/hierarchy_custom_types.cu +++ b/cudax/test/hierarchy/hierarchy_custom_types.cu @@ -10,8 +10,8 @@ #include -#include "testing_common.cuh" #include +#include struct custom_level : public cudax::hierarchy_level { diff --git a/cudax/test/hierarchy/hierarchy_smoke.cu b/cudax/test/hierarchy/hierarchy_smoke.cu index f6f55cf9f0..b43a077b79 100644 --- a/cudax/test/hierarchy/hierarchy_smoke.cu +++ b/cudax/test/hierarchy/hierarchy_smoke.cu @@ -10,8 +10,8 @@ #include -#include "testing_common.cuh" #include +#include namespace cg = cooperative_groups; diff --git a/cudax/test/launch/configuration.cu b/cudax/test/launch/configuration.cu index 9e7f98df1b..693d00ce16 100644 --- a/cudax/test/launch/configuration.cu +++ b/cudax/test/launch/configuration.cu @@ -14,7 +14,7 @@ #include #undef cudaLaunchKernelEx -#include "../hierarchy/testing_common.cuh" +#include static cudaLaunchConfig_t expectedConfig; static bool replacementCalled = false; diff --git a/cudax/test/launch/launch_smoke.cu b/cudax/test/launch/launch_smoke.cu index 29d84d2e7c..e9c6e7730a 100644 --- a/cudax/test/launch/launch_smoke.cu +++ 
b/cudax/test/launch/launch_smoke.cu @@ -11,7 +11,7 @@ #include -#include "../hierarchy/testing_common.cuh" +#include __managed__ bool kernel_run_proof = false; @@ -37,7 +37,7 @@ struct functor_taking_config __device__ void operator()(Config conf, int grid_size) { static_assert(conf.dims.static_count(cudax::thread, cudax::block) == BlockSize); - assert(conf.dims.count(cudax::block, cudax::grid) == grid_size); + CUDAX_REQUIRE(conf.dims.count(cudax::block, cudax::grid) == grid_size); kernel_run_proof = true; } }; @@ -49,7 +49,7 @@ struct functor_taking_dims __device__ void operator()(Dimensions dims, int grid_size) { static_assert(dims.static_count(cudax::thread, cudax::block) == BlockSize); - assert(dims.count(cudax::block, cudax::grid) == grid_size); + CUDAX_REQUIRE(dims.count(cudax::block, cudax::grid) == grid_size); kernel_run_proof = true; } }; @@ -84,7 +84,7 @@ struct dynamic_smem_single { auto& dynamic_smem = cudax::dynamic_smem_ref(conf); static_assert(::cuda::std::is_same_v); - assert(__isShared(&dynamic_smem)); + CUDAX_REQUIRE(__isShared(&dynamic_smem)); kernel_run_proof = true; } }; @@ -98,8 +98,8 @@ struct dynamic_smem_span auto dynamic_smem = cudax::dynamic_smem_span(conf); static_assert(decltype(dynamic_smem)::extent == Extent); static_assert(::cuda::std::is_same_v); - assert(dynamic_smem.size() == size); - assert(__isShared(&dynamic_smem[1])); + CUDAX_REQUIRE(dynamic_smem.size() == size); + CUDAX_REQUIRE(__isShared(&dynamic_smem[1])); kernel_run_proof = true; } }; diff --git a/cudax/test/stream/get_stream.cu b/cudax/test/stream/get_stream.cu index 0654c3be39..80834bdd0f 100644 --- a/cudax/test/stream/get_stream.cu +++ b/cudax/test/stream/get_stream.cu @@ -10,8 +10,8 @@ #include -#include "../common/utility.cuh" #include +#include TEST_CASE("Can call get_stream on a cudaStream_t", "[stream]") { diff --git a/cudax/test/stream/stream_smoke.cu b/cudax/test/stream/stream_smoke.cu index cbee352080..50e55352a4 100644 --- a/cudax/test/stream/stream_smoke.cu +++ b/cudax/test/stream/stream_smoke.cu @@ -11,8 +11,8 @@ #include #include -#include "../common/utility.cuh" #include +#include constexpr auto one_thread_dims = cudax::make_hierarchy(cudax::block_dims<1>(), cudax::grid_dims<1>()); diff --git a/cudax/test/utility/driver_api.cu b/cudax/test/utility/driver_api.cu index e5fd64d14f..5955802fe1 100644 --- a/cudax/test/utility/driver_api.cu +++ b/cudax/test/utility/driver_api.cu @@ -10,7 +10,7 @@ #include -#include "../hierarchy/testing_common.cuh" +#include TEST_CASE("Call each driver api", "[utility]") { diff --git a/cudax/test/utility/ensure_current_device.cu b/cudax/test/utility/ensure_current_device.cu index 89efc7d4f6..cdf8effcd6 100644 --- a/cudax/test/utility/ensure_current_device.cu +++ b/cudax/test/utility/ensure_current_device.cu @@ -13,7 +13,7 @@ #include #include -#include "../common/utility.cuh" +#include namespace driver = cuda::experimental::detail::driver; From c9a7b6ad1b41b7b382ee6945ca7b44b687aa29b9 Mon Sep 17 00:00:00 2001 From: pciolkosz Date: Thu, 8 Aug 2024 00:41:04 -0700 Subject: [PATCH 10/33] [CUDAX] Add an API to get device_ref from stream and add comparison operator to device_ref (#2203) * Add a way to compare device_refs * Add a way to query device_ref from a stream * Fix Windows missing cast * Apply suggestions from code review Co-authored-by: Michael Schellenberger Costa * Disallow device comparision with int --------- Co-authored-by: Michael Schellenberger Costa --- .../cuda/experimental/__device/device.cuh | 8 ++++++ 
.../cuda/experimental/__device/device_ref.cuh | 28 +++++++++++++++++++ .../cuda/experimental/__event/event_ref.cuh | 8 +++--- .../cuda/experimental/__stream/stream.cuh | 13 +++++++++ cudax/test/device/device_smoke.cu | 16 +++++++++++ cudax/test/stream/stream_smoke.cu | 12 ++++++++ 6 files changed, 81 insertions(+), 4 deletions(-) diff --git a/cudax/include/cuda/experimental/__device/device.cuh b/cudax/include/cuda/experimental/__device/device.cuh index 5532e8f59b..145ce4c10e 100644 --- a/cudax/include/cuda/experimental/__device/device.cuh +++ b/cudax/include/cuda/experimental/__device/device.cuh @@ -117,6 +117,14 @@ private: device(const device&) = delete; device& operator=(device&&) = delete; device& operator=(const device&) = delete; + + friend bool operator==(const device& __lhs, int __rhs) = delete; + friend bool operator==(int __lhs, const device& __rhs) = delete; + +#if _CCCL_STD_VER <= 2017 + friend bool operator!=(const device& __lhs, int __rhs) = delete; + friend bool operator!=(int __lhs, const device& __rhs) = delete; +#endif // _CCCL_STD_VER <= 2017 }; namespace detail diff --git a/cudax/include/cuda/experimental/__device/device_ref.cuh b/cudax/include/cuda/experimental/__device/device_ref.cuh index 91e4e90caa..5c7b89779e 100644 --- a/cudax/include/cuda/experimental/__device/device_ref.cuh +++ b/cudax/include/cuda/experimental/__device/device_ref.cuh @@ -54,6 +54,34 @@ public: return __id_; } + //! @brief Compares two `device_ref`s for equality + //! + //! @note Allows comparison with `int` due to implicit conversion to + //! `device_ref`. + //! + //! @param __lhs The first `device_ref` to compare + //! @param __rhs The second `device_ref` to compare + //! @return true if `lhs` and `rhs` refer to the same device ordinal + _CCCL_NODISCARD_FRIEND constexpr bool operator==(device_ref __lhs, device_ref __rhs) noexcept + { + return __lhs.__id_ == __rhs.__id_; + } + +#if _CCCL_STD_VER <= 2017 + //! @brief Compares two `device_ref`s for inequality + //! + //! @note Allows comparison with `int` due to implicit conversion to + //! `device_ref`. + //! + //! @param __lhs The first `device_ref` to compare + //! @param __rhs The second `device_ref` to compare + //! @return true if `lhs` and `rhs` refer to different device ordinal + _CCCL_NODISCARD_FRIEND constexpr bool operator!=(device_ref __lhs, device_ref __rhs) noexcept + { + return __lhs.__id_ != __rhs.__id_; + } +#endif // _CCCL_STD_VER <= 2017 + //! @brief Retrieve the specified attribute for the device //! //! @param __attr The attribute to query. See `device::attrs` for the available diff --git a/cudax/include/cuda/experimental/__event/event_ref.cuh b/cudax/include/cuda/experimental/__event/event_ref.cuh index 3b0ccc6dbc..bf1c1b398c 100644 --- a/cudax/include/cuda/experimental/__event/event_ref.cuh +++ b/cudax/include/cuda/experimental/__event/event_ref.cuh @@ -111,8 +111,8 @@ public: //! @note Allows comparison with `cudaEvent_t` due to implicit conversion to //! `event_ref`. //! - //! @param lhs The first `event_ref` to compare - //! @param rhs The second `event_ref` to compare + //! @param __lhs The first `event_ref` to compare + //! @param __rhs The second `event_ref` to compare //! @return true if `lhs` and `rhs` refer to the same `cudaEvent_t` object. _CCCL_NODISCARD_FRIEND constexpr bool operator==(event_ref __lhs, event_ref __rhs) noexcept { @@ -124,8 +124,8 @@ public: //! @note Allows comparison with `cudaEvent_t` due to implicit conversion to //! `event_ref`. //! - //! @param lhs The first `event_ref` to compare - //! 
@param rhs The second `event_ref` to compare + //! @param __lhs The first `event_ref` to compare + //! @param __rhs The second `event_ref` to compare //! @return true if `lhs` and `rhs` refer to different `cudaEvent_t` objects. _CCCL_NODISCARD_FRIEND constexpr bool operator!=(event_ref __lhs, event_ref __rhs) noexcept { diff --git a/cudax/include/cuda/experimental/__stream/stream.cuh b/cudax/include/cuda/experimental/__stream/stream.cuh index 0ba125269b..27f0f698db 100644 --- a/cudax/include/cuda/experimental/__stream/stream.cuh +++ b/cudax/include/cuda/experimental/__stream/stream.cuh @@ -161,6 +161,19 @@ struct stream : stream_ref wait(__tmp); } + //! @brief Get device under which this stream was created. + //! + //! @throws cuda_error if device check fails + device_ref device() const + { + // Because the stream can come from_native_handle, we can't just loop over devices comparing contexts, + // lower to CUDART for this instead + __ensure_current_device __dev_setter(*this); + int result; + _CCCL_TRY_CUDA_API(cudaGetDevice, "Could not get device from a stream", &result); + return result; + } + //! @brief Construct an `stream` object from a native `cudaStream_t` handle. //! //! @param __handle The native handle diff --git a/cudax/test/device/device_smoke.cu b/cudax/test/device/device_smoke.cu index d13400c8db..f725bc7f35 100644 --- a/cudax/test/device/device_smoke.cu +++ b/cudax/test/device/device_smoke.cu @@ -35,6 +35,16 @@ TEST_CASE("Smoke", "[device]") using cudax::device; using cudax::device_ref; + SECTION("Compare") + { + CUDAX_REQUIRE(device_ref{0} == device_ref{0}); + CUDAX_REQUIRE(device_ref{0} == 0); + CUDAX_REQUIRE(0 == device_ref{0}); + CUDAX_REQUIRE(device_ref{1} != device_ref{0}); + CUDAX_REQUIRE(device_ref{1} != 2); + CUDAX_REQUIRE(1 != device_ref{2}); + } + SECTION("Attributes") { ::test_device_attribute(); @@ -272,13 +282,19 @@ TEST_CASE("global devices vector", "[device]") CUDAX_REQUIRE(cudax::devices.size() == static_cast(cudax::devices.end() - cudax::devices.begin())); CUDAX_REQUIRE(0 == cudax::devices[0].get()); + CUDAX_REQUIRE(cudax::device_ref{0} == cudax::devices[0]); + CUDAX_REQUIRE(0 == (*cudax::devices.begin()).get()); + CUDAX_REQUIRE(cudax::device_ref{0} == *cudax::devices.begin()); + CUDAX_REQUIRE(0 == cudax::devices.begin()->get()); CUDAX_REQUIRE(0 == cudax::devices.begin()[0].get()); if (cudax::devices.size() > 1) { CUDAX_REQUIRE(1 == cudax::devices[1].get()); + CUDAX_REQUIRE(cudax::device_ref{0} != cudax::devices[1].get()); + CUDAX_REQUIRE(1 == (*std::next(cudax::devices.begin())).get()); CUDAX_REQUIRE(1 == std::next(cudax::devices.begin())->get()); CUDAX_REQUIRE(1 == cudax::devices.begin()[1].get()); diff --git a/cudax/test/stream/stream_smoke.cu b/cudax/test/stream/stream_smoke.cu index 50e55352a4..90d7743810 100644 --- a/cudax/test/stream/stream_smoke.cu +++ b/cudax/test/stream/stream_smoke.cu @@ -102,3 +102,15 @@ TEST_CASE("Stream priority", "[stream]") cudax::stream stream(0, priority); CUDAX_REQUIRE(stream.priority() == priority); } + +TEST_CASE("Stream get device", "[stream]") +{ + cudax::stream dev0_stream(cudax::device_ref{0}); + CUDAX_REQUIRE(dev0_stream.device() == 0); + + cudaSetDevice(static_cast(cudax::devices.size() - 1)); + cudaStream_t stream_handle; + CUDART(cudaStreamCreate(&stream_handle)); + auto stream_cudart = cudax::stream::from_native_handle(stream_handle); + CUDAX_REQUIRE(stream_cudart.device() == *std::prev(cudax::devices.end())); +} From 3ebf8cc873500d5e22ab0a8957b2baa32035f583 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: 
Thu, 8 Aug 2024 11:52:11 -0500 Subject: [PATCH 11/33] Update devcontainer docs for WSL (#2200) * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * [pre-commit.ci] auto code formatting * Why was 6 afraid of 7? --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .devcontainer/README.md | 209 +++++++++++++++++++++++----------------- 1 file changed, 121 insertions(+), 88 deletions(-) diff --git a/.devcontainer/README.md b/.devcontainer/README.md index d854931292..17486a4065 100644 --- a/.devcontainer/README.md +++ b/.devcontainer/README.md @@ -1,26 +1,32 @@ > **Note** -> The instructions in this README are specific to Linux development environments. Instructions for Windows are coming soon! +> The instructions in this README are specific to Linux development environments (including WSL on Windows). Instructions for native Windows development (e.g., `msvc`) are coming soon! [![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/NVIDIA/cccl?quickstart=1&devcontainer_path=.devcontainer%2Fdevcontainer.json) # CCCL Dev Containers -CCCL uses [Development Containers](https://containers.dev/) to provide consistent and convenient development environments for both local development and for CI. This guide covers setup in [Visual Studio Code](#quickstart-vscode-recommended) and [Docker](#quickstart-docker-manual-approach). The guide also provides additional instructions in case you want use WSL. +CCCL uses [Dev Containers](https://containers.dev/) to provide consistent and convenient development environments for both local development and for CI. -## Table of Contents -1. [Quickstart: VSCode (Recommended)](#vscode) -2. [Quickstart: Docker (Manual Approach)](#docker) -3. [Quickstart: Using WSL](#wsl) +VSCode offers the most convenient experience with Dev Containers due to its tight native integration, however, our containers are also fully usable without VSCode by leveraging Docker directly. -## Quickstart: VSCode (Recommended) +## Table of Contents +1. [Quickstart: VSCode on Linux (Recommended)](#vscode) +2. [Quickstart: VSCode on WSL (Recommended for Windows)](#wsl) +3. [Quickstart: Docker on Linux (Manual Approach)](#docker) +## Quickstart: VSCode on Linux (Recommended) ### Prerequisites - [Visual Studio Code](https://code.visualstudio.com/) - [Remote - Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) - [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) - [Docker](https://docs.docker.com/engine/install/) - This is only for completeness because it should already be implicitly installed by the Dev Containers extension -### Steps +#### GPU Prerequisites (only needed for executing tests that require a GPU) +- Supported NVIDIA GPU +- [NVIDIA Driver](https://www.nvidia.com/Download/index.aspx?lang=en-us) +- [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) + +### Steps 1. Clone the Repository ```bash @@ -32,7 +38,7 @@ CCCL uses [Development Containers](https://containers.dev/) to provide consisten ![Shows "Reopen in Container" prompt when opening the cccl directory in VScode.](./img/reopen_in_container.png) - - Alternatively, use the Command Palette to start a Dev Container. Press `Ctrl+Shift+P` to open the Command Palette. 
Type "Remote-Containers: Reopen in Container" and select it. + - Alternatively, use `ctrl+shift+p` to open the Command Palette and type "Remote-Containers: Reopen in Container" and select it. ![Shows "Reopen in Container" in command pallete.](./img/open_in_container_manual.png) @@ -42,11 +48,14 @@ CCCL uses [Development Containers](https://containers.dev/) to provide consisten 5. VSCode will initialize the selected Dev Container. This can take a few minutes the first time. -6. Once initialized, the local `cccl/` directory is mirrored into the container to ensure any changes are persistent. +6. (Optional) Authenticate with GitHub + - After container startup, you will be asked if you would like to authenticate with GitHub. This is for access to CCCL's distributed `sccache` storage. If you are not an NVIDIA employee, you can safely ignore this step. For more information, see the [`sccache`](#sccache) section below. -7. Done! See the [contributing guide](../CONTRIBUTING.md#building-and-testing) for instructions on how to build and run tests. +7. Once initialized, the local `cccl/` directory is mirrored into the container to ensure any changes are persistent. -### (Optional) Authenticate with GitHub for `sccache` +8. Done! See the [contributing guide](../CONTRIBUTING.md#building-and-testing) for instructions on how to build and run tests. + +### (Optional) Authenticate with GitHub for `sccache` After starting the container, there will be a prompt to authenticate with GitHub. This grants access to a [`sccache`](https://github.com/mozilla/sccache) server shared with CI and greatly accelerates local build times. This is currently limited to NVIDIA employees belonging to the `NVIDIA` or `rapidsai` GitHub organizations. @@ -60,11 +69,110 @@ To manually trigger this authentication, execute the `devcontainer-utils-vault-s For more information about the sccache configuration and authentication, see the documentation at [`rapidsai/devcontainers`](https://github.com/rapidsai/devcontainers/blob/branch-23.10/USAGE.md#build-caching-with-sccache). +## Quickstart: VSCode on WSL (Recommended for Windows) + +Windows Subsystem for Linux (WSL) enables you to run a Linux environment directly in Windows. +This isn't for native Windows development (e.g., compiling with `msvc`), but effectively a more convenient option than setting up a dual-boot Linux/Windows machine. +Apart from the initial setup of WSL, the process for using CCCL's Dev Containers in WSL is effectively the same as the instructions for Linux, because WSL _is_ Linux. 
+ +### Prerequisites +- Windows OS that supports WSL 2 (Windows 11 or newer) +- [Windows Subsystem for Linux v2 (WSL 2)](https://learn.microsoft.com/en-us/windows/wsl/install) +- [Visual Studio Code](https://code.visualstudio.com/) (installed on Windows host) +- [VSCode Remote Development Extension Pack](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.vscode-remote-extensionpack) (installed on Windows host) + - Includes [Dev Containers](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) and [WSL](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-wsl) extensions +- [Docker](https://docs.docker.com/engine/install/) - (Will be installed automatically by the Remote Development extension) + +#### GPU Prerequisites (only needed for executing tests that require a GPU) +- Supported NVIDIA GPU +- [NVIDIA Driver](https://www.nvidia.com/Download/index.aspx?lang=en-us) (installed on Windows host) +- [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) (**installed inside WSL**) + +For more details see the official NVIDIA [Getting Started with CUDA on WSL guide](https://docs.nvidia.com/cuda/wsl-user-guide/index.html#getting-started-with-cuda-on-wsl-2). + +### Install WSL on your Windows host +Refer to [Microsoft's documentation](https://learn.microsoft.com/en-us/windows/wsl/install) for the full instructions to install WSL2. + +
+ Click here for the TL;DR version +1. Run `Powershell` as an administrator +![image](https://github.com/user-attachments/assets/2c985887-ca6c-46bc-9e1b-f235ccfd8513) + +2. Install WSL 2 by running: +```bash +> wsl --install +``` +3. Restart your computer +4. If this is your first time installing WSL, upon restarting, it will prompt you to create a username/password to use inside WSL. +5. Verify `wsl` was succesfully installed by opening Powershell again and run +```bash +> wsl -l -v + NAME STATE VERSION +* Ubuntu Running 2 +``` +5. Launch `wsl` and verify your Linux environment +``` +# In Powershell, start WSL, which will drop you into a terminal session running in Linux +> wsl + +# In the new terminal session, verify your Linux environment by changing to your home directory +# and displaying the current directory. This should show `/home/*YOUR USER NAME*` +> cd ~ +> pwd +/home/jhemstad +``` + +Congratulations! You now have WSL installed and can use it as you would a normal Ubuntu/Linux installation. +This is sufficient for *building* CCCL's tests, if you have a GPU on your system and you would like to use it to run the tests, continue below: + +6. (Optional) Install `nvidia-container-toolkit` +See [here](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-apt) for full instructions. + +**Important:** `nvidia-container-toolkit` needs to be installed inside WSL (not on the Windows host). The following commands should be run within the Linux environment. + +```bash +$ curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ + && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list +$ sudo apt-get update +$ sudo apt-get install -y nvidia-container-toolkit +``` + +Then configure Docker to use the `nvidia-container-toolkit`: +```bash +$ sudo nvidia-ctk runtime configure --runtime=docker +$ sudo systemctl restart docker +``` + +7. (Optional) Verify your GPU is available inside WSL +Use `nvidia-smi` inside of WSL to verify that your GPU is correctly configured and available from inside the container. +If not, verify that the NVIDIA GPU driver is correctly installed on your Windows host and `nvidia-container-toolkit` was successfully installed inside of WSL. +```bash +$ nvidia-smi +``` +
+
+### Connect VSCode to WSL
+1. Launch VSCode on your Windows host
+
+2. Connect VSCode to your WSL instance
+- Press `Ctrl + Shift + P` to open the Command Palette, type "WSL", and select "WSL: Connect to WSL"
+  - If you don't see this option, you need to install the [WSL VSCode Extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-wsl) (comes with the [Remote Development pack](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.vscode-remote-extensionpack))
+![image](https://github.com/user-attachments/assets/3e0e6af7-4251-4ce9-9204-589ad7daa12a)
+  - To verify VSCode is connected to WSL, you should see the following in the bottom left corner: ![Shows the WSL: Ubuntu status for a successful connection to WSL.](https://github.com/user-attachments/assets/26dbba61-cc96-4ac3-8200-fdb26a8e4a4b)
+
+3. VSCode is now attached to WSL, which is equivalent to running in a native Linux environment. You can now proceed as described in the [section above](#vscode-devcontainer-steps).
+
 ## Quickstart: Docker (Manual Approach)
 
 ### Prerequisites
+- [Docker](https://docs.docker.com/engine/install/)
+
+#### GPU Prerequisites (only needed for executing tests that require a GPU)
+- Supported NVIDIA GPU
+- [NVIDIA Driver](https://www.nvidia.com/Download/index.aspx?lang=en-us)
 - [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
-- [Docker](https://docs.docker.com/desktop/install/linux-install/)
 
 ### Steps
 1. Clone the repository and use the [`launch.sh`](./launch.sh) script to launch the default container environment
@@ -122,78 +230,3 @@ Click the badge above or [click here](https://codespaces.new/NVIDIA/cccl?quickst
 For more information, see the `.devcontainer/make_devcontainers.sh --help` message.
 
 **Note**: When adding or updating supported environments, modify `matrix.yaml` and then rerun this script to synchronize the `devcontainer` configurations.
-
-## Quickstart: Using WSL
-
-> [!NOTE]
-> _Make sure you have the Nvidia driver installed on your Windows host before moving further_. Type in `nvidia-smi` for verification.
-
-### Install WSL on your Windows host
-
-> [!WARNING]
-> Disclaimer: This guide was developed for WSL 2 on Windows 11.
-
-1. Launch a Windows terminal (_e.g. Powershell_) as an administrator.
-
-2. Install WSL 2 by running:
-```bash
-wsl --install
-```
-This should probably install Ubuntu distro as a default.
-
-3. Restart your computer and run `wsl -l -v` on a Windows terminal to verify installation.
-
-

-### Install prerequisites and VS Code extensions
- -4. Launch your WSL/Ubuntu terminal by running `wsl` in Powershell. - -5. Install the [WSL extension](ms-vscode-remote.remote-wsl) on VS Code. - - - `Ctrl + Shift + P` and select `WSL: Connect to WSL` (it will prompt you to install the WSL extension). - - - Make sure you are connected to WSL with VS Code by checking the bottom left corner of the VS Code window (should indicate "WSL: Ubuntu" in our case). - -6. Install the [Dev Containers extension](ms-vscode-remote.remote-containers) on VS Code. - - - In a vanilla system you should be prompted to install `Docker` at this point, accept it. If it hangs you might have to restart VS Code after that. - -7. Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html). **Make sure you install the WSL 2 version and not the native Linux one**. This builds on top of Docker so make sure you have Docker properly installed (run `docker --version`). - -8. Open `/etc/docker/daemon.json` from within your WSL system (if the file does not exist, create it) and add the following: - -```json -{ - "runtimes": { - "nvidia": { - "path": "nvidia-container-runtime", - "runtimeArgs": [] - } - } -} -``` - -then run `sudo systemctl restart docker.service`. - ---- -### Build CCCL in WSL using Dev Containers - -9. Still on your WSL terminal run `git clone https://github.com/NVIDIA/cccl.git` - - -10. Open the CCCL cloned repo in VS Code ( `Ctrl + Shift + P `, select `File: Open Folder...` and select the path where your CCCL clone is located). - -11. If prompted, choose `Reopen in Container`. - - - If you are not prompted just type `Ctrl + Shift + P` and `Dev Containers: Open Folder in Container ...`. - -12. Verify that Dev Container was configured properly by running `nvidia-smi` in your Dev Container terminal. For a proper configuration it is important for the steps in [Install prerequisites and VS Code extensions](#prereqs) to be followed in a precise order. - -From that point on, the guide aligns with our [existing Dev Containers native Linux guide](https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md) with just one minor potential alteration: - -13. If WSL was launched without the X-server enabled, when asked to "authenticate Git with your Github credentials", if you answer **Yes**, the browser might not open automatically, with the following error message. - -> Failed opening a web browser at https://github.com/login/device - exec: "xdg-open,x-www-browser,www-browser,wslview": executable file not found in $PATH - Please try entering the URL in your browser manually - -In that case type in the address manually in your web browser https://github.com/login/device and fill in the one-time code. 
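+
+For reference, the condensed end-to-end flow for working on CCCL from a WSL terminal looks roughly like this (assuming the `code` command was added to your `PATH` by VSCode's WSL integration, which is the default):
+
+```bash
+# Clone CCCL inside WSL and open it in VSCode; when prompted, choose "Reopen in Container"
+git clone https://github.com/NVIDIA/cccl.git
+cd cccl
+code .
+```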
From f95f2113c32cc228df165137ad62743a180731f0 Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Thu, 8 Aug 2024 17:33:11 -0700 Subject: [PATCH 12/33] add `cudax::distribute(numElements)` as a way to evenly distribute elements over thread blocks (#2210) --- .../__hierarchy/hierarchy_dimensions.cuh | 26 +++++++++++++++++++ cudax/test/hierarchy/hierarchy_smoke.cu | 10 +++++++ 2 files changed, 36 insertions(+) diff --git a/cudax/include/cuda/experimental/__hierarchy/hierarchy_dimensions.cuh b/cudax/include/cuda/experimental/__hierarchy/hierarchy_dimensions.cuh index 48d4b38b1d..3cbad5f0a9 100644 --- a/cudax/include/cuda/experimental/__hierarchy/hierarchy_dimensions.cuh +++ b/cudax/include/cuda/experimental/__hierarchy/hierarchy_dimensions.cuh @@ -839,6 +839,32 @@ constexpr auto hierarchy_add_level(const hierarchy_dimensions_fragment(level); } +/** + * @brief A shorthand for creating a hierarchy of CUDA threads by evenly + * distributing elements among blocks and threads. + * + * @par Snippet + * @code + * #include + * using namespace cuda::experimental; + * + * constexpr int threadsPerBlock = 256; + * auto dims = distribute(numElements); + * + * // Equivalent to: + * constexpr int threadsPerBlock = 256; + * int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + * auto dims = make_hierarchy(grid_dims(blocksPerGrid), block_dims()); + * @endcode + */ +template +constexpr auto distribute(int numElements) noexcept +{ + int blocksPerGrid = (numElements + _ThreadsPerBlock - 1) / _ThreadsPerBlock; + return ::cuda::experimental::make_hierarchy( + ::cuda::experimental::grid_dims(blocksPerGrid), ::cuda::experimental::block_dims<_ThreadsPerBlock>()); +} + } // namespace cuda::experimental #endif // _CCCL_STD_VER >= 2017 #endif // _CUDAX__HIERARCHY_HIERARCHY_DIMENSIONS diff --git a/cudax/test/hierarchy/hierarchy_smoke.cu b/cudax/test/hierarchy/hierarchy_smoke.cu index b43a077b79..fc78ca4504 100644 --- a/cudax/test/hierarchy/hierarchy_smoke.cu +++ b/cudax/test/hierarchy/hierarchy_smoke.cu @@ -512,3 +512,13 @@ TEST_CASE("Trivially constructable", "[hierarchy]") // static_assert(std::is_trivially_copyable_v(), // cudax::grid_dims<256>()))>); } + +TEST_CASE("cudax::distribute", "[hierarchy]") +{ + int numElements = 50000; + constexpr int threadsPerBlock = 256; + auto dims = cudax::distribute(numElements); + + CUDAX_REQUIRE(dims.count(cudax::thread, cudax::block) == 256); + CUDAX_REQUIRE(dims.count(cudax::block, cudax::grid) == (numElements + threadsPerBlock - 1) / threadsPerBlock); +} From 8e20c9a3cde9c725df40a91f83ea2ab66f5d40a3 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 9 Aug 2024 17:14:51 +0200 Subject: [PATCH 13/33] Rework mdspan concept emulation (#2213) It is proving difficult to handle for msvc and also the one we are using in libcu++ it much cleaner Gets #2160 compiling on MSVC --- .../cuda/std/__mdspan/default_accessor.h | 7 +- .../include/cuda/std/__mdspan/extents.h | 95 +++++----- .../include/cuda/std/__mdspan/layout_left.h | 27 ++- .../include/cuda/std/__mdspan/layout_right.h | 27 ++- .../include/cuda/std/__mdspan/layout_stride.h | 111 ++++++------ libcudacxx/include/cuda/std/__mdspan/macros.h | 162 +----------------- libcudacxx/include/cuda/std/__mdspan/mdspan.h | 160 +++++++---------- .../include/cuda/std/__mdspan/static_array.h | 20 +-- .../include/cuda/std/__mdspan/submdspan.h | 22 +-- .../views/mdspan/foo_customizations.hpp | 22 +-- 10 files changed, 216 insertions(+), 437 deletions(-) diff --git 
a/libcudacxx/include/cuda/std/__mdspan/default_accessor.h b/libcudacxx/include/cuda/std/__mdspan/default_accessor.h index 33bef7cb07..ea0924915d 100644 --- a/libcudacxx/include/cuda/std/__mdspan/default_accessor.h +++ b/libcudacxx/include/cuda/std/__mdspan/default_accessor.h @@ -72,10 +72,9 @@ struct default_accessor __MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr default_accessor() noexcept = default; - __MDSPAN_TEMPLATE_REQUIRES(class _OtherElementType, - /* requires */ (_CCCL_TRAIT(is_convertible, _OtherElementType (*)[], element_type (*)[]))) - __MDSPAN_INLINE_FUNCTION - constexpr default_accessor(default_accessor<_OtherElementType>) noexcept {} + _LIBCUDACXX_TEMPLATE(class _OtherElementType) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(is_convertible, _OtherElementType (*)[], element_type (*)[])) + __MDSPAN_INLINE_FUNCTION constexpr default_accessor(default_accessor<_OtherElementType>) noexcept {} __MDSPAN_INLINE_FUNCTION constexpr data_handle_type offset(data_handle_type __p, size_t __i) const noexcept diff --git a/libcudacxx/include/cuda/std/__mdspan/extents.h b/libcudacxx/include/cuda/std/__mdspan/extents.h index 58ab181afc..c8177542da 100644 --- a/libcudacxx/include/cuda/std/__mdspan/extents.h +++ b/libcudacxx/include/cuda/std/__mdspan/extents.h @@ -248,16 +248,13 @@ class extents __MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr extents() noexcept = default; // Converting constructor - __MDSPAN_TEMPLATE_REQUIRES( - class _OtherIndexType, - size_t... _OtherExtents, - /* requires */ - ( - /* multi-stage check to protect from invalid pack expansion when sizes don't match? */ - decltype(__detail::__check_compatible_extents( - integral_constant{}, - _CUDA_VSTD::integer_sequence{}, - _CUDA_VSTD::integer_sequence{}))::value)) + _LIBCUDACXX_TEMPLATE(class _OtherIndexType, size_t... _OtherExtents) + _LIBCUDACXX_REQUIRES( + /* multi-stage check to protect from invalid pack expansion when sizes don't match? */ + (decltype(__detail::__check_compatible_extents( + integral_constant{}, + _CUDA_VSTD::integer_sequence{}, + _CUDA_VSTD::integer_sequence{}))::value)) __MDSPAN_INLINE_FUNCTION __MDSPAN_CONDITIONAL_EXPLICIT( (((_Extents != dynamic_extent) && (_OtherExtents == dynamic_extent)) || ...) @@ -287,23 +284,23 @@ class extents } # ifdef __NVCC__ - __MDSPAN_TEMPLATE_REQUIRES( - class... _Integral, - /* requires */ ( - // TODO: check whether the other version works with newest NVCC, doesn't with 11.4 - // NVCC seems to pick up rank_dynamic from the wrong extents type??? - __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Integral, index_type) /* && ... */) - && __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Integral) /* && ... */) && - // NVCC chokes on the fold thingy here so wrote the workaround - ((sizeof...(_Integral) == __detail::__count_dynamic_extents<_Extents...>::val) - || (sizeof...(_Integral) == sizeof...(_Extents))))) + _LIBCUDACXX_TEMPLATE(class... _Integral) + _LIBCUDACXX_REQUIRES( + // TODO: check whether the other version works with newest NVCC, doesn't with 11.4 + // NVCC seems to pick up rank_dynamic from the wrong extents type??? + __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Integral, index_type) /* && ... */) + _LIBCUDACXX_AND __MDSPAN_FOLD_AND( + _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Integral) /* && ... 
*/) _LIBCUDACXX_AND + // NVCC chokes on the fold thingy here so wrote the workaround + ((sizeof...(_Integral) == __detail::__count_dynamic_extents<_Extents...>::val) + || (sizeof...(_Integral) == sizeof...(_Extents)))) # else - __MDSPAN_TEMPLATE_REQUIRES( - class... _Integral, - /* requires */ ( - __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Integral, index_type) /* && ... */) - && __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Integral) /* && ... */) - && ((sizeof...(_Integral) == rank_dynamic()) || (sizeof...(_Integral) == rank())))) + _LIBCUDACXX_TEMPLATE(class... _Integral) + _LIBCUDACXX_REQUIRES( + __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Integral, index_type) /* && ... */) + _LIBCUDACXX_AND __MDSPAN_FOLD_AND( + _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Integral) /* && ... */) + _LIBCUDACXX_AND((sizeof...(_Integral) == rank_dynamic()) || (sizeof...(_Integral) == rank()))) # endif __MDSPAN_INLINE_FUNCTION explicit constexpr extents(_Integral... __exts) noexcept @@ -337,21 +334,16 @@ class extents # ifdef __NVCC__ // NVCC seems to pick up rank_dynamic from the wrong extents type??? // NVCC chokes on the fold thingy here so wrote the workaround - __MDSPAN_TEMPLATE_REQUIRES( - class _IndexType, - size_t _Np, - /* requires */ - (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _IndexType, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _IndexType) - && ((_Np == __detail::__count_dynamic_extents<_Extents...>::val) || (_Np == sizeof...(_Extents))))) + _LIBCUDACXX_TEMPLATE(class _IndexType, size_t _Np) + _LIBCUDACXX_REQUIRES( + _CCCL_TRAIT(_CUDA_VSTD::is_convertible, _IndexType, index_type) + _LIBCUDACXX_AND _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _IndexType) + _LIBCUDACXX_AND((_Np == __detail::__count_dynamic_extents<_Extents...>::val) || (_Np == sizeof...(_Extents)))) # else - __MDSPAN_TEMPLATE_REQUIRES( - class _IndexType, - size_t _Np, - /* requires */ - (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _IndexType, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _IndexType) - && (_Np == rank() || _Np == rank_dynamic()))) + _LIBCUDACXX_TEMPLATE(class _IndexType, size_t _Np) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _IndexType, index_type) + _LIBCUDACXX_AND _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _IndexType) + _LIBCUDACXX_AND(_Np == rank() || _Np == rank_dynamic())) # endif __MDSPAN_CONDITIONAL_EXPLICIT(_Np != rank_dynamic()) __MDSPAN_INLINE_FUNCTION @@ -386,21 +378,16 @@ class extents # ifdef __NVCC__ // NVCC seems to pick up rank_dynamic from the wrong extents type??? 
// NVCC chokes on the fold thingy here so wrote the workaround - __MDSPAN_TEMPLATE_REQUIRES( - class _IndexType, - size_t _Np, - /* requires */ - (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _IndexType, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _IndexType) - && ((_Np == __detail::__count_dynamic_extents<_Extents...>::val) || (_Np == sizeof...(_Extents))))) + _LIBCUDACXX_TEMPLATE(class _IndexType, size_t _Np) + _LIBCUDACXX_REQUIRES( + _CCCL_TRAIT(_CUDA_VSTD::is_convertible, _IndexType, index_type) + _LIBCUDACXX_AND _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _IndexType) + _LIBCUDACXX_AND((_Np == __detail::__count_dynamic_extents<_Extents...>::val) || (_Np == sizeof...(_Extents)))) # else - __MDSPAN_TEMPLATE_REQUIRES( - class _IndexType, - size_t _Np, - /* requires */ - (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _IndexType, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _IndexType) - && (_Np == rank() || _Np == rank_dynamic()))) + _LIBCUDACXX_TEMPLATE(class _IndexType, size_t _Np) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _IndexType, index_type) + _LIBCUDACXX_AND _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _IndexType) + _LIBCUDACXX_AND(_Np == rank() || _Np == rank_dynamic())) # endif __MDSPAN_CONDITIONAL_EXPLICIT(_Np != rank_dynamic()) __MDSPAN_INLINE_FUNCTION diff --git a/libcudacxx/include/cuda/std/__mdspan/layout_left.h b/libcudacxx/include/cuda/std/__mdspan/layout_left.h index b0cdde455c..9d0842515c 100644 --- a/libcudacxx/include/cuda/std/__mdspan/layout_left.h +++ b/libcudacxx/include/cuda/std/__mdspan/layout_left.h @@ -121,8 +121,8 @@ class layout_left::mapping : __extents(__exts) {} - __MDSPAN_TEMPLATE_REQUIRES(class _OtherExtents, - /* requires */ (_CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, _OtherExtents))) + _LIBCUDACXX_TEMPLATE(class _OtherExtents) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, _OtherExtents)) __MDSPAN_CONDITIONAL_EXPLICIT((!_CUDA_VSTD::is_convertible<_OtherExtents, extents_type>::value)) // needs two () due // to comma __MDSPAN_INLINE_FUNCTION constexpr mapping( @@ -135,9 +135,9 @@ class layout_left::mapping */ } - __MDSPAN_TEMPLATE_REQUIRES(class _OtherExtents, - /* requires */ (_CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, _OtherExtents) - && (extents_type::rank() <= 1))) + _LIBCUDACXX_TEMPLATE(class _OtherExtents) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, _OtherExtents) + _LIBCUDACXX_AND(extents_type::rank() <= 1)) __MDSPAN_CONDITIONAL_EXPLICIT((!_CUDA_VSTD::is_convertible<_OtherExtents, extents_type>::value)) // needs two () due // to comma __MDSPAN_INLINE_FUNCTION constexpr mapping( @@ -150,8 +150,8 @@ class layout_left::mapping */ } - __MDSPAN_TEMPLATE_REQUIRES(class _OtherExtents, - /* requires */ (_CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, _OtherExtents))) + _LIBCUDACXX_TEMPLATE(class _OtherExtents) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, _OtherExtents)) __MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0)) __MDSPAN_INLINE_FUNCTION constexpr mapping( layout_stride::mapping<_OtherExtents> const& __other) // NOLINT(google-explicit-constructor) @@ -190,11 +190,10 @@ class layout_left::mapping //-------------------------------------------------------------------------------- - __MDSPAN_TEMPLATE_REQUIRES( - class... 
_Indices, - /* requires */ ((sizeof...(_Indices) == extents_type::rank()) - && __MDSPAN_FOLD_AND((_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Indices, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Indices))))) + _LIBCUDACXX_TEMPLATE(class... _Indices) + _LIBCUDACXX_REQUIRES((sizeof...(_Indices) == extents_type::rank()) _LIBCUDACXX_AND __MDSPAN_FOLD_AND( + (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Indices, index_type) + && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Indices)))) _CCCL_HOST_DEVICE constexpr index_type operator()(_Indices... __idxs) const noexcept { // Immediately cast incoming indices to `index_type` @@ -227,8 +226,8 @@ class layout_left::mapping return true; } - __MDSPAN_TEMPLATE_REQUIRES(class _Ext = _Extents, - /* requires */ (_Ext::rank() > 0)) + _LIBCUDACXX_TEMPLATE(class _Ext = _Extents) + _LIBCUDACXX_REQUIRES((_Ext::rank() > 0)) __MDSPAN_INLINE_FUNCTION constexpr index_type stride(rank_type __i) const noexcept { diff --git a/libcudacxx/include/cuda/std/__mdspan/layout_right.h b/libcudacxx/include/cuda/std/__mdspan/layout_right.h index efe215f114..4dfd4a1e38 100644 --- a/libcudacxx/include/cuda/std/__mdspan/layout_right.h +++ b/libcudacxx/include/cuda/std/__mdspan/layout_right.h @@ -126,8 +126,8 @@ class layout_right::mapping : __extents(__exts) {} - __MDSPAN_TEMPLATE_REQUIRES(class _OtherExtents, - /* requires */ (_CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, _OtherExtents))) + _LIBCUDACXX_TEMPLATE(class _OtherExtents) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, _OtherExtents)) __MDSPAN_CONDITIONAL_EXPLICIT((!_CUDA_VSTD::is_convertible<_OtherExtents, extents_type>::value)) // needs two () due // to comma __MDSPAN_INLINE_FUNCTION constexpr mapping( @@ -140,9 +140,9 @@ class layout_right::mapping */ } - __MDSPAN_TEMPLATE_REQUIRES(class _OtherExtents, - /* requires */ (_CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, _OtherExtents) - && (extents_type::rank() <= 1))) + _LIBCUDACXX_TEMPLATE(class _OtherExtents) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, _OtherExtents) + _LIBCUDACXX_AND(extents_type::rank() <= 1)) __MDSPAN_CONDITIONAL_EXPLICIT((!_CUDA_VSTD::is_convertible<_OtherExtents, extents_type>::value)) // needs two () due // to comma __MDSPAN_INLINE_FUNCTION constexpr mapping( @@ -155,8 +155,8 @@ class layout_right::mapping */ } - __MDSPAN_TEMPLATE_REQUIRES(class _OtherExtents, - /* requires */ (_CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, _OtherExtents))) + _LIBCUDACXX_TEMPLATE(class _OtherExtents) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, _OtherExtents)) __MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0)) __MDSPAN_INLINE_FUNCTION constexpr mapping( layout_stride::mapping<_OtherExtents> const& __other) // NOLINT(google-explicit-constructor) @@ -195,11 +195,10 @@ class layout_right::mapping //-------------------------------------------------------------------------------- - __MDSPAN_TEMPLATE_REQUIRES( - class... _Indices, - /* requires */ ((sizeof...(_Indices) == extents_type::rank()) - && __MDSPAN_FOLD_AND((_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Indices, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Indices))))) + _LIBCUDACXX_TEMPLATE(class... 
_Indices) + _LIBCUDACXX_REQUIRES((sizeof...(_Indices) == extents_type::rank()) _LIBCUDACXX_AND __MDSPAN_FOLD_AND( + (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Indices, index_type) + && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Indices)))) _CCCL_HOST_DEVICE constexpr index_type operator()(_Indices... __idxs) const noexcept { return __compute_offset(__rank_count<0, extents_type::rank()>(), static_cast(__idxs)...); @@ -230,8 +229,8 @@ class layout_right::mapping return true; } - __MDSPAN_TEMPLATE_REQUIRES(class _Ext = _Extents, - /* requires */ (_Ext::rank() > 0)) + _LIBCUDACXX_TEMPLATE(class _Ext = _Extents) + _LIBCUDACXX_REQUIRES((_Ext::rank() > 0)) __MDSPAN_INLINE_FUNCTION constexpr index_type stride(rank_type __i) const noexcept { diff --git a/libcudacxx/include/cuda/std/__mdspan/layout_stride.h b/libcudacxx/include/cuda/std/__mdspan/layout_stride.h index 1818adff8a..d0a1ecad4b 100644 --- a/libcudacxx/include/cuda/std/__mdspan/layout_stride.h +++ b/libcudacxx/include/cuda/std/__mdspan/layout_stride.h @@ -93,8 +93,8 @@ struct layout_right namespace __detail { template -constexpr bool __is_mapping_of = - _CUDA_VSTD::is_same, _Mapping>::value; +_LIBCUDACXX_INLINE_VAR constexpr bool __is_mapping_of = + is_same, _Mapping>::value; # if __MDSPAN_USE_CONCEPTS && __MDSPAN_HAS_CXX_20 template @@ -298,17 +298,13 @@ struct layout_stride __MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping() noexcept = default; __MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping(mapping const&) noexcept = default; - __MDSPAN_TEMPLATE_REQUIRES( + // nvcc cannot deduce this constructor when using _LIBCUDACXX_REQUIRES + template < class _IntegralTypes, - /* requires */ ( - // MSVC 19.32 does not like using index_type here, requires the typename _Extents::index_type - // error C2641: cannot deduce template arguments for '_CUDA_VSTD::layout_stride::mapping' - _CCCL_TRAIT(_CUDA_VSTD::is_convertible, const remove_const_t<_IntegralTypes>&, typename _Extents::index_type) - && _CCCL_TRAIT( - _CUDA_VSTD::is_nothrow_constructible, typename _Extents::index_type, const remove_const_t<_IntegralTypes>&))) - __MDSPAN_INLINE_FUNCTION - constexpr mapping(extents_type const& __e, - _CUDA_VSTD::array<_IntegralTypes, extents_type::rank()> const& __s) noexcept + enable_if_t<_CCCL_TRAIT(is_convertible, const remove_const_t<_IntegralTypes>&, index_type), int> = 0, + enable_if_t<_CCCL_TRAIT(is_nothrow_constructible, index_type, const remove_const_t<_IntegralTypes>&), int> = 0> + __MDSPAN_INLINE_FUNCTION constexpr mapping( + extents_type const& __e, _CUDA_VSTD::array<_IntegralTypes, extents_type::rank()> const& __s) noexcept # ifndef _CCCL_HAS_NO_ATTRIBUTE_NO_UNIQUE_ADDRESS : __members{ # else @@ -331,17 +327,13 @@ struct layout_stride */ } - __MDSPAN_TEMPLATE_REQUIRES( + // nvcc cannot deduce this constructor when using _LIBCUDACXX_REQUIRES + template < class _IntegralTypes, - /* requires */ ( - // MSVC 19.32 does not like using index_type here, requires the typename _Extents::index_type - // error C2641: cannot deduce template arguments for '_CUDA_VSTD::layout_stride::mapping' - _CCCL_TRAIT(_CUDA_VSTD::is_convertible, const remove_const_t<_IntegralTypes>&, typename _Extents::index_type) - && _CCCL_TRAIT( - _CUDA_VSTD::is_nothrow_constructible, typename _Extents::index_type, const remove_const_t<_IntegralTypes>&))) - __MDSPAN_INLINE_FUNCTION - constexpr mapping(extents_type const& __e, - _CUDA_VSTD::span<_IntegralTypes, extents_type::rank()> const& __s) noexcept + enable_if_t<_CCCL_TRAIT(is_convertible, const 
remove_const_t<_IntegralTypes>&, index_type), int> = 0, + enable_if_t<_CCCL_TRAIT(is_nothrow_constructible, index_type, const remove_const_t<_IntegralTypes>&), int> = 0> + __MDSPAN_INLINE_FUNCTION constexpr mapping( + extents_type const& __e, _CUDA_VSTD::span<_IntegralTypes, extents_type::rank()> const& __s) noexcept # ifndef _CCCL_HAS_NO_ATTRIBUTE_NO_UNIQUE_ADDRESS : __members{ # else @@ -365,25 +357,25 @@ struct layout_stride } # if !(__MDSPAN_USE_CONCEPTS && __MDSPAN_HAS_CXX_20) - __MDSPAN_TEMPLATE_REQUIRES( - class _StridedLayoutMapping, - /* requires */ ( - _CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, typename _StridedLayoutMapping::extents_type) - && __detail::__is_mapping_of - && _StridedLayoutMapping::is_always_unique() && _StridedLayoutMapping::is_always_strided())) + _LIBCUDACXX_TEMPLATE(class _StridedLayoutMapping) + _LIBCUDACXX_REQUIRES( + _CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, typename _StridedLayoutMapping::extents_type) + _LIBCUDACXX_AND __detail::__is_mapping_of + _LIBCUDACXX_AND(_StridedLayoutMapping::is_always_unique()) + _LIBCUDACXX_AND(_StridedLayoutMapping::is_always_strided())) # else template requires(__detail::__layout_mapping_alike<_StridedLayoutMapping> - && _CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, typename _StridedLayoutMapping::extents_type) + && _CCCL_TRAIT(is_constructible, extents_type, typename _StridedLayoutMapping::extents_type) && _StridedLayoutMapping::is_always_unique() && _StridedLayoutMapping::is_always_strided()) # endif __MDSPAN_CONDITIONAL_EXPLICIT( - (!_CUDA_VSTD::is_convertible::value) + (!is_convertible::value) && (__detail::__is_mapping_of || __detail::__is_mapping_of || __detail::__is_mapping_of) ) // needs two () due to comma - __MDSPAN_INLINE_FUNCTION constexpr mapping( - _StridedLayoutMapping const& __other) noexcept // NOLINT(google-explicit-constructor) + __MDSPAN_INLINE_FUNCTION + constexpr mapping(_StridedLayoutMapping const& __other) noexcept // NOLINT(google-explicit-constructor) # ifndef _CCCL_HAS_NO_ATTRIBUTE_NO_UNIQUE_ADDRESS : __members{ # else @@ -440,12 +432,11 @@ struct layout_stride return __span_size; } - __MDSPAN_TEMPLATE_REQUIRES( - class... _Indices, - /* requires */ ( - sizeof...(_Indices) == _Extents::rank() - && __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Indices, index_type) /*&& ...*/) - && __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Indices) /*&& ...*/))) + _LIBCUDACXX_TEMPLATE(class... _Indices) + _LIBCUDACXX_REQUIRES( + (sizeof...(_Indices) == _Extents::rank()) + _LIBCUDACXX_AND __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_convertible, _Indices, index_type) /*&& ...*/) + _LIBCUDACXX_AND __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_nothrow_constructible, index_type, _Indices) /*&& ...*/)) __MDSPAN_FORCE_INLINE_FUNCTION constexpr index_type operator()(_Indices... 
__idxs) const noexcept { @@ -480,8 +471,8 @@ struct layout_stride return true; } - __MDSPAN_TEMPLATE_REQUIRES(class _Ext = _Extents, - /* requires */ (_Ext::rank() > 0)) + _LIBCUDACXX_TEMPLATE(class _Ext = _Extents) + _LIBCUDACXX_REQUIRES((_Ext::rank() > 0)) __MDSPAN_INLINE_FUNCTION constexpr index_type stride(rank_type __r) const noexcept { @@ -489,11 +480,11 @@ struct layout_stride } # if !(__MDSPAN_USE_CONCEPTS && __MDSPAN_HAS_CXX_20) - __MDSPAN_TEMPLATE_REQUIRES( - class _StridedLayoutMapping, - /* requires */ (__detail::__is_mapping_of - && (extents_type::rank() == _StridedLayoutMapping::extents_type::rank()) - && _StridedLayoutMapping::is_always_strided())) + _LIBCUDACXX_TEMPLATE(class _StridedLayoutMapping) + _LIBCUDACXX_REQUIRES( + __detail::__is_mapping_of _LIBCUDACXX_AND( + extents_type::rank() == _StridedLayoutMapping::extents_type::rank()) + _LIBCUDACXX_AND(_StridedLayoutMapping::is_always_strided())) # else template requires( @@ -515,30 +506,30 @@ struct layout_stride } // This one is not technically part of the proposal. Just here to make implementation a bit more optimal hopefully - __MDSPAN_TEMPLATE_REQUIRES(class _OtherExtents, - /* requires */ ((extents_type::rank() == _OtherExtents::rank()))) - __MDSPAN_INLINE_FUNCTION - friend constexpr bool operator==(mapping const& __lhs, mapping<_OtherExtents> const& __rhs) noexcept + _LIBCUDACXX_TEMPLATE(class _OtherExtents) + _LIBCUDACXX_REQUIRES((extents_type::rank() == _OtherExtents::rank())) + __MDSPAN_INLINE_FUNCTION friend constexpr bool + operator==(mapping const& __lhs, mapping<_OtherExtents> const& __rhs) noexcept { return __impl::_eq_impl(__lhs, __rhs); } # if !__MDSPAN_HAS_CXX_20 - __MDSPAN_TEMPLATE_REQUIRES( - class _StridedLayoutMapping, - /* requires */ (__detail::__is_mapping_of - && (extents_type::rank() == _StridedLayoutMapping::extents_type::rank()) - && _StridedLayoutMapping::is_always_strided())) - __MDSPAN_INLINE_FUNCTION - friend constexpr bool operator!=(const mapping& __x, const _StridedLayoutMapping& __y) noexcept + _LIBCUDACXX_TEMPLATE(class _StridedLayoutMapping) + _LIBCUDACXX_REQUIRES( + __detail::__is_mapping_of _LIBCUDACXX_AND( + extents_type::rank() == _StridedLayoutMapping::extents_type::rank()) + _LIBCUDACXX_AND(_StridedLayoutMapping::is_always_strided())) + __MDSPAN_INLINE_FUNCTION friend constexpr bool + operator!=(const mapping& __x, const _StridedLayoutMapping& __y) noexcept { return not(__x == __y); } - __MDSPAN_TEMPLATE_REQUIRES(class _OtherExtents, - /* requires */ ((extents_type::rank() == _OtherExtents::rank()))) - __MDSPAN_INLINE_FUNCTION - friend constexpr bool operator!=(mapping const& __lhs, mapping<_OtherExtents> const& __rhs) noexcept + _LIBCUDACXX_TEMPLATE(class _OtherExtents) + _LIBCUDACXX_REQUIRES((extents_type::rank() == _OtherExtents::rank())) + __MDSPAN_INLINE_FUNCTION friend constexpr bool + operator!=(mapping const& __lhs, mapping<_OtherExtents> const& __rhs) noexcept { return __impl::_not_eq_impl(__lhs, __rhs); } diff --git a/libcudacxx/include/cuda/std/__mdspan/macros.h b/libcudacxx/include/cuda/std/__mdspan/macros.h index 0eba30a718..0aa54e0330 100644 --- a/libcudacxx/include/cuda/std/__mdspan/macros.h +++ b/libcudacxx/include/cuda/std/__mdspan/macros.h @@ -54,6 +54,7 @@ # pragma system_header #endif // no system header +#include #include #include #include @@ -249,167 +250,6 @@ // end Preprocessor helpers }}}1 //============================================================================== -//============================================================================== -// 
{{{1 - -// These compatibility macros don't help with partial ordering, but they should do the trick -// for what we need to do with concepts in mdspan -# ifdef __MDSPAN_USE_CONCEPTS -# define __MDSPAN_CLOSE_ANGLE_REQUIRES(REQ) \ - > \ - requires REQ -# define __MDSPAN_FUNCTION_REQUIRES(PAREN_PREQUALS, FNAME, PAREN_PARAMS, QUALS, REQ) \ - __MDSPAN_PP_REMOVE_PARENS(PAREN_PREQUALS) \ - FNAME PAREN_PARAMS QUALS \ - requires REQ /**/ -# else -# define __MDSPAN_CLOSE_ANGLE_REQUIRES(REQ) , typename _CUDA_VSTD::enable_if<(REQ), int>::type = 0 > -# define __MDSPAN_FUNCTION_REQUIRES(PAREN_PREQUALS, FNAME, PAREN_PARAMS, QUALS, REQ) \ - __MDSPAN_TEMPLATE_REQUIRES( \ - class __function_requires_ignored = void, (_CUDA_VSTD::is_void<__function_requires_ignored>::value && REQ)) \ - __MDSPAN_PP_REMOVE_PARENS(PAREN_PREQUALS) FNAME PAREN_PARAMS QUALS /**/ -# endif - -# if defined(__MDSPAN_COMPILER_MSVC) -# define __MDSPAN_TEMPLATE_REQUIRES(...) \ - __MDSPAN_PP_CAT(__MDSPAN_PP_CAT(__MDSPAN_TEMPLATE_REQUIRES_, __MDSPAN_PP_COUNT(__VA_ARGS__))(__VA_ARGS__), ) \ - /**/ -# else -# define __MDSPAN_TEMPLATE_REQUIRES(...) \ - __MDSPAN_PP_EVAL(__MDSPAN_PP_CAT(__MDSPAN_TEMPLATE_REQUIRES_, __MDSPAN_PP_COUNT(__VA_ARGS__)), __VA_ARGS__) \ - /**/ -# endif - -# define __MDSPAN_TEMPLATE_REQUIRES_2(TP1, REQ) template end Concept emulation }}}1 -//============================================================================== - //============================================================================== // {{{1 diff --git a/libcudacxx/include/cuda/std/__mdspan/mdspan.h b/libcudacxx/include/cuda/std/__mdspan/mdspan.h index 1103663025..27e6a57a94 100644 --- a/libcudacxx/include/cuda/std/__mdspan/mdspan.h +++ b/libcudacxx/include/cuda/std/__mdspan/mdspan.h @@ -168,22 +168,21 @@ class mdspan __MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mdspan() requires( // Directly using rank_dynamic()>0 here doesn't work for nvcc - (extents_type::rank_dynamic() > 0) && _CCCL_TRAIT(_CUDA_VSTD::is_default_constructible, data_handle_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_default_constructible, mapping_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_default_constructible, accessor_type)) + (extents_type::rank_dynamic() > 0) && _CCCL_TRAIT(is_default_constructible, data_handle_type) + && _CCCL_TRAIT(is_default_constructible, mapping_type) + && _CCCL_TRAIT(is_default_constructible, accessor_type)) = default; # endif __MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mdspan(const mdspan&) = default; __MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mdspan(mdspan&&) = default; - __MDSPAN_TEMPLATE_REQUIRES( - class... _SizeTypes, - /* requires */ ( - __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SizeTypes, index_type) /* && ... */) - && __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _SizeTypes) /* && ... */) - && ((sizeof...(_SizeTypes) == rank()) || (sizeof...(_SizeTypes) == rank_dynamic())) - && _CCCL_TRAIT(_CUDA_VSTD::is_constructible, mapping_type, extents_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_default_constructible, accessor_type))) + _LIBCUDACXX_TEMPLATE(class... _SizeTypes) + _LIBCUDACXX_REQUIRES( + __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_convertible, _SizeTypes, index_type) /* && ... */) + _LIBCUDACXX_AND __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_nothrow_constructible, index_type, _SizeTypes) /* && ... 
*/) + _LIBCUDACXX_AND((sizeof...(_SizeTypes) == rank()) || (sizeof...(_SizeTypes) == rank_dynamic())) + _LIBCUDACXX_AND _CCCL_TRAIT(is_constructible, mapping_type, extents_type) + _LIBCUDACXX_AND _CCCL_TRAIT(is_default_constructible, accessor_type)) __MDSPAN_INLINE_FUNCTION explicit constexpr mdspan(data_handle_type __p, _SizeTypes... __dynamic_extents) // TODO @proposal-bug shouldn't I be allowed to do `move(__p)` here? @@ -193,15 +192,11 @@ class mdspan accessor_type())) {} - __MDSPAN_TEMPLATE_REQUIRES( - class _SizeType, - size_t _Np, - /* requires */ - (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SizeType, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _SizeType) - && ((_Np == rank()) || (_Np == rank_dynamic())) - && _CCCL_TRAIT(_CUDA_VSTD::is_constructible, mapping_type, extents_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_default_constructible, accessor_type))) + _LIBCUDACXX_TEMPLATE(class _SizeType, size_t _Np) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(is_convertible, _SizeType, index_type) _LIBCUDACXX_AND _CCCL_TRAIT( + is_nothrow_constructible, index_type, _SizeType) _LIBCUDACXX_AND((_Np == rank()) || (_Np == rank_dynamic())) + _LIBCUDACXX_AND _CCCL_TRAIT(is_constructible, mapping_type, extents_type) + _LIBCUDACXX_AND _CCCL_TRAIT(is_default_constructible, accessor_type)) __MDSPAN_CONDITIONAL_EXPLICIT(_Np != rank_dynamic()) __MDSPAN_INLINE_FUNCTION constexpr mdspan(data_handle_type __p, const _CUDA_VSTD::array<_SizeType, _Np>& __dynamic_extents) @@ -209,15 +204,11 @@ class mdspan __map_acc_pair_t(mapping_type(extents_type(__dynamic_extents)), accessor_type())) {} - __MDSPAN_TEMPLATE_REQUIRES( - class _SizeType, - size_t _Np, - /* requires */ - (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SizeType, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _SizeType) - && ((_Np == rank()) || (_Np == rank_dynamic())) - && _CCCL_TRAIT(_CUDA_VSTD::is_constructible, mapping_type, extents_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_default_constructible, accessor_type))) + _LIBCUDACXX_TEMPLATE(class _SizeType, size_t _Np) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(is_convertible, _SizeType, index_type) _LIBCUDACXX_AND _CCCL_TRAIT( + is_nothrow_constructible, index_type, _SizeType) _LIBCUDACXX_AND((_Np == rank()) || (_Np == rank_dynamic())) + _LIBCUDACXX_AND _CCCL_TRAIT(is_constructible, mapping_type, extents_type) + _LIBCUDACXX_AND _CCCL_TRAIT(is_default_constructible, accessor_type)) __MDSPAN_CONDITIONAL_EXPLICIT(_Np != rank_dynamic()) __MDSPAN_INLINE_FUNCTION constexpr mdspan(data_handle_type __p, _CUDA_VSTD::span<_SizeType, _Np> __dynamic_extents) @@ -225,23 +216,16 @@ class mdspan __map_acc_pair_t(mapping_type(extents_type(_CUDA_VSTD::as_const(__dynamic_extents))), accessor_type())) {} - __MDSPAN_FUNCTION_REQUIRES( - (__MDSPAN_INLINE_FUNCTION constexpr), - mdspan, - (data_handle_type __p, const extents_type& __exts), - , - /* requires */ - (_CCCL_TRAIT(_CUDA_VSTD::is_default_constructible, accessor_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_constructible, mapping_type, extents_type))) + _LIBCUDACXX_TEMPLATE(bool _Is_default_constructible = _CCCL_TRAIT(is_default_constructible, accessor_type)) + _LIBCUDACXX_REQUIRES( + _Is_default_constructible _LIBCUDACXX_AND _CCCL_TRAIT(is_constructible, mapping_type, extents_type)) + __MDSPAN_INLINE_FUNCTION constexpr mdspan(data_handle_type __p, const extents_type& __exts) : __members(_CUDA_VSTD::move(__p), __map_acc_pair_t(mapping_type(__exts), accessor_type())) {} - __MDSPAN_FUNCTION_REQUIRES( - (__MDSPAN_INLINE_FUNCTION constexpr), 
- mdspan, - (data_handle_type __p, const mapping_type& __m), - , - /* requires */ (_CCCL_TRAIT(_CUDA_VSTD::is_default_constructible, accessor_type))) + _LIBCUDACXX_TEMPLATE(bool _Is_default_constructible = _CCCL_TRAIT(is_default_constructible, accessor_type)) + _LIBCUDACXX_REQUIRES(_Is_default_constructible) + __MDSPAN_INLINE_FUNCTION constexpr mdspan(data_handle_type __p, const mapping_type& __m) : __members(_CUDA_VSTD::move(__p), __map_acc_pair_t(__m, accessor_type())) {} @@ -250,23 +234,17 @@ class mdspan : __members(_CUDA_VSTD::move(__p), __map_acc_pair_t(__m, __a)) {} - __MDSPAN_TEMPLATE_REQUIRES( - class _OtherElementType, - class _OtherExtents, - class _OtherLayoutPolicy, - class _OtherAccessor, - /* requires */ - (_CCCL_TRAIT( - _CUDA_VSTD::is_constructible, mapping_type, typename _OtherLayoutPolicy::template mapping<_OtherExtents>) - && _CCCL_TRAIT(_CUDA_VSTD::is_constructible, accessor_type, _OtherAccessor))) + _LIBCUDACXX_TEMPLATE(class _OtherElementType, class _OtherExtents, class _OtherLayoutPolicy, class _OtherAccessor) + _LIBCUDACXX_REQUIRES( + _CCCL_TRAIT(is_constructible, mapping_type, typename _OtherLayoutPolicy::template mapping<_OtherExtents>) + _LIBCUDACXX_AND _CCCL_TRAIT(is_constructible, accessor_type, _OtherAccessor)) __MDSPAN_INLINE_FUNCTION constexpr mdspan(const mdspan<_OtherElementType, _OtherExtents, _OtherLayoutPolicy, _OtherAccessor>& __other) : __members(__other.__ptr_ref(), __map_acc_pair_t(__other.__mapping_ref(), __other.__accessor_ref())) { - static_assert( - _CCCL_TRAIT(_CUDA_VSTD::is_constructible, data_handle_type, typename _OtherAccessor::data_handle_type), - "Incompatible data_handle_type for mdspan construction"); - static_assert(_CCCL_TRAIT(_CUDA_VSTD::is_constructible, extents_type, _OtherExtents), + static_assert(_CCCL_TRAIT(is_constructible, data_handle_type, typename _OtherAccessor::data_handle_type), + "Incompatible data_handle_type for mdspan construction"); + static_assert(_CCCL_TRAIT(is_constructible, extents_type, _OtherExtents), "Incompatible extents for mdspan construction"); /* * TODO: Check precondition @@ -287,12 +265,11 @@ class mdspan // [mdspan.basic.mapping], mdspan mapping domain multidimensional index to access codomain element # if __MDSPAN_USE_BRACKET_OPERATOR - __MDSPAN_TEMPLATE_REQUIRES( - class... _SizeTypes, - /* requires */ ( - __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SizeTypes, index_type) /* && ... */) - && __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _SizeTypes) /* && ... */) - && (rank() == sizeof...(_SizeTypes)))) + _LIBCUDACXX_TEMPLATE(class... _SizeTypes) + _LIBCUDACXX_REQUIRES( + __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_convertible, _SizeTypes, index_type) /* && ... */) + _LIBCUDACXX_AND __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_nothrow_constructible, index_type, _SizeTypes) /* && ... */) + _LIBCUDACXX_AND(rank() == sizeof...(_SizeTypes))) __MDSPAN_FORCE_INLINE_FUNCTION constexpr reference operator[](_SizeTypes... 
__indices) const { @@ -300,20 +277,18 @@ class mdspan } # endif - __MDSPAN_TEMPLATE_REQUIRES( - class _SizeType, - /* requires */ (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SizeType, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _SizeType))) + _LIBCUDACXX_TEMPLATE(class _SizeType) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(is_convertible, _SizeType, index_type) + _LIBCUDACXX_AND _CCCL_TRAIT(is_nothrow_constructible, index_type, _SizeType)) __MDSPAN_FORCE_INLINE_FUNCTION constexpr reference operator[](const _CUDA_VSTD::array<_SizeType, rank()>& __indices) const { return __impl::template __callop(*this, __indices); } - __MDSPAN_TEMPLATE_REQUIRES( - class _SizeType, - /* requires */ (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SizeType, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _SizeType))) + _LIBCUDACXX_TEMPLATE(class _SizeType) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(is_convertible, _SizeType, index_type) + _LIBCUDACXX_AND _CCCL_TRAIT(is_nothrow_constructible, index_type, _SizeType)) __MDSPAN_FORCE_INLINE_FUNCTION constexpr reference operator[](_CUDA_VSTD::span<_SizeType, rank()> __indices) const { @@ -321,10 +296,9 @@ class mdspan } # if !__MDSPAN_USE_BRACKET_OPERATOR - __MDSPAN_TEMPLATE_REQUIRES(class _Index, - /* requires */ (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Index, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Index) - && extents_type::rank() == 1)) + _LIBCUDACXX_TEMPLATE(class _Index) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(is_convertible, _Index, index_type) _LIBCUDACXX_AND _CCCL_TRAIT( + is_nothrow_constructible, index_type, _Index) _LIBCUDACXX_AND(extents_type::rank() == 1)) __MDSPAN_FORCE_INLINE_FUNCTION constexpr reference operator[](_Index __idx) const { @@ -333,32 +307,29 @@ class mdspan # endif # if __MDSPAN_USE_PAREN_OPERATOR - __MDSPAN_TEMPLATE_REQUIRES( - class... _SizeTypes, - /* requires */ ( - __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SizeTypes, index_type) /* && ... */) - && __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _SizeTypes) /* && ... */) - && extents_type::rank() == sizeof...(_SizeTypes))) + _LIBCUDACXX_TEMPLATE(class... _SizeTypes) + _LIBCUDACXX_REQUIRES( + __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_convertible, _SizeTypes, index_type) /* && ... */) + _LIBCUDACXX_AND __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_nothrow_constructible, index_type, _SizeTypes) /* && ... */) + _LIBCUDACXX_AND(extents_type::rank() == sizeof...(_SizeTypes))) __MDSPAN_FORCE_INLINE_FUNCTION constexpr reference operator()(_SizeTypes... 
__indices) const { return __accessor_ref().access(__ptr_ref(), __mapping_ref()(__indices...)); } - __MDSPAN_TEMPLATE_REQUIRES( - class _SizeType, - /* requires */ (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SizeType, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _SizeType))) + _LIBCUDACXX_TEMPLATE(class _SizeType) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(is_convertible, _SizeType, index_type) + _LIBCUDACXX_AND _CCCL_TRAIT(is_nothrow_constructible, index_type, _SizeType)) __MDSPAN_FORCE_INLINE_FUNCTION constexpr reference operator()(const _CUDA_VSTD::array<_SizeType, rank()>& __indices) const { return __impl::template __callop(*this, __indices); } - __MDSPAN_TEMPLATE_REQUIRES( - class _SizeType, - /* requires */ (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SizeType, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _SizeType))) + _LIBCUDACXX_TEMPLATE(class _SizeType) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(is_convertible, _SizeType, index_type) + _LIBCUDACXX_AND _CCCL_TRAIT(is_nothrow_constructible, index_type, _SizeType)) __MDSPAN_FORCE_INLINE_FUNCTION constexpr reference operator()(_CUDA_VSTD::span<_SizeType, rank()> __indices) const { @@ -470,17 +441,18 @@ class mdspan }; # if defined(__MDSPAN_USE_CLASS_TEMPLATE_ARGUMENT_DEDUCTION) -__MDSPAN_TEMPLATE_REQUIRES( - class _ElementType, - class... _SizeTypes, - /* requires */ __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_integral, _SizeTypes) /* && ... */) && (sizeof...(_SizeTypes) > 0)) +_LIBCUDACXX_TEMPLATE(class _ElementType, class... _SizeTypes) +_LIBCUDACXX_REQUIRES(__MDSPAN_FOLD_AND(_CCCL_TRAIT(is_integral, _SizeTypes) /* && ... */) + _LIBCUDACXX_AND(sizeof...(_SizeTypes) > 0)) _CCCL_HOST_DEVICE explicit mdspan(_ElementType*, _SizeTypes...) -> mdspan<_ElementType, dextents>; -__MDSPAN_TEMPLATE_REQUIRES(class _Pointer, (_CCCL_TRAIT(is_pointer, _CUDA_VSTD::remove_reference_t<_Pointer>))) +_LIBCUDACXX_TEMPLATE(class _Pointer) +_LIBCUDACXX_REQUIRES(_CCCL_TRAIT(is_pointer, _CUDA_VSTD::remove_reference_t<_Pointer>)) _CCCL_HOST_DEVICE mdspan(_Pointer&&) -> mdspan<_CUDA_VSTD::remove_pointer_t<_CUDA_VSTD::remove_reference_t<_Pointer>>, extents>; -__MDSPAN_TEMPLATE_REQUIRES(class _CArray, (_CCCL_TRAIT(is_array, _CArray) && (rank_v<_CArray> == 1))) +_LIBCUDACXX_TEMPLATE(class _CArray) +_LIBCUDACXX_REQUIRES(_CCCL_TRAIT(is_array, _CArray) _LIBCUDACXX_AND(rank_v<_CArray> == 1)) _CCCL_HOST_DEVICE mdspan(_CArray&) -> mdspan<_CUDA_VSTD::remove_all_extents_t<_CArray>, extents>>; diff --git a/libcudacxx/include/cuda/std/__mdspan/static_array.h b/libcudacxx/include/cuda/std/__mdspan/static_array.h index de511fe2e6..886f782065 100644 --- a/libcudacxx/include/cuda/std/__mdspan/static_array.h +++ b/libcudacxx/include/cuda/std/__mdspan/static_array.h @@ -162,21 +162,17 @@ class __partially_static_array_impl< {} __MDSPAN_INLINE_FUNCTION constexpr explicit __partially_static_array_impl( - _CUDA_VSTD::array<_Tp, sizeof...(_Idxs)> const& __vals) noexcept + array<_Tp, sizeof...(_Idxs)> const& __vals) noexcept : __partially_static_array_impl(__construct_psa_from_all_exts_values_tag, _CUDA_VSTD::get<_Idxs>(__vals)...) {} - // clang-format off - __MDSPAN_FUNCTION_REQUIRES( - (__MDSPAN_INLINE_FUNCTION constexpr explicit), - __partially_static_array_impl, - (_CUDA_VSTD::array<_Tp, __size_dynamic> const &__vals), noexcept, - /* requires */ - (sizeof...(_Idxs) != __size_dynamic) - ): __partially_static_array_impl( - __construct_psa_from_dynamic_exts_values_tag, - _CUDA_VSTD::get<_IdxsDynamicIdxs>(__vals)...) 
{} - // clang-format on + _LIBCUDACXX_TEMPLATE(bool _SizeMatches = (sizeof...(_Idxs) != __size_dynamic)) + _LIBCUDACXX_REQUIRES(_SizeMatches) + __MDSPAN_INLINE_FUNCTION constexpr explicit __partially_static_array_impl( + array<_Tp, __size_dynamic> const& __vals) noexcept + __partially_static_array_impl(__construct_psa_from_dynamic_exts_values_tag, + _CUDA_VSTD::get<_IdxsDynamicIdxs>(__vals)...) + {} template : true_type //============================================================================== -__MDSPAN_TEMPLATE_REQUIRES( - class _ET, - class _EXT, - class _LP, - class _AP, - class... _SliceSpecs, - /* requires */ - ((_CCCL_TRAIT(_CUDA_VSTD::is_same, _LP, layout_left) || _CCCL_TRAIT(_CUDA_VSTD::is_same, _LP, layout_right) - || __detail::_is_layout_stride<_LP>::value) - && __MDSPAN_FOLD_AND((_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SliceSpecs, size_t) - || _CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SliceSpecs, tuple) - || _CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SliceSpecs, full_extent_t)) /* && ... */) - && sizeof...(_SliceSpecs) == _EXT::rank())) +_LIBCUDACXX_TEMPLATE(class _ET, class _EXT, class _LP, class _AP, class... _SliceSpecs) +_LIBCUDACXX_REQUIRES( + (_CCCL_TRAIT(_CUDA_VSTD::is_same, _LP, layout_left) || _CCCL_TRAIT(_CUDA_VSTD::is_same, _LP, layout_right) + || __detail::_is_layout_stride<_LP>::value) + _LIBCUDACXX_AND __MDSPAN_FOLD_AND( + (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SliceSpecs, size_t) + || _CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SliceSpecs, tuple) + || _CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SliceSpecs, full_extent_t)) /* && ... */) + _LIBCUDACXX_AND(sizeof...(_SliceSpecs) == _EXT::rank())) __MDSPAN_INLINE_FUNCTION __MDSPAN_DEDUCE_RETURN_TYPE_SINGLE_LINE( (constexpr submdspan(mdspan<_ET, _EXT, _LP, _AP> const& __src, _SliceSpecs... 
__slices) noexcept), diff --git a/libcudacxx/test/libcudacxx/std/containers/views/mdspan/foo_customizations.hpp b/libcudacxx/test/libcudacxx/std/containers/views/mdspan/foo_customizations.hpp index 32e7c1cd84..fd84ddcc51 100644 --- a/libcudacxx/test/libcudacxx/std/containers/views/mdspan/foo_customizations.hpp +++ b/libcudacxx/test/libcudacxx/std/containers/views/mdspan/foo_customizations.hpp @@ -93,8 +93,8 @@ class layout_foo::mapping : __extents(__exts) {} - __MDSPAN_TEMPLATE_REQUIRES(class OtherExtents, - /* requires */ (_CCCL_TRAIT(cuda::std::is_constructible, extents_type, OtherExtents))) + _LIBCUDACXX_TEMPLATE(class OtherExtents) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(cuda::std::is_constructible, extents_type, OtherExtents)) __MDSPAN_CONDITIONAL_EXPLICIT((!cuda::std::is_convertible::value)) // needs two () due to // comma __MDSPAN_INLINE_FUNCTION constexpr mapping( @@ -107,18 +107,18 @@ class layout_foo::mapping */ } - __MDSPAN_TEMPLATE_REQUIRES(class OtherExtents, - /* requires */ (_CCCL_TRAIT(cuda::std::is_constructible, extents_type, OtherExtents))) - __MDSPAN_CONDITIONAL_EXPLICIT((!cuda::std::is_convertible::value)) // needs two () due to - // comma + _LIBCUDACXX_TEMPLATE(class OtherExtents) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(cuda::std::is_constructible, extents_type, OtherExtents)) + __MDSPAN_CONDITIONAL_EXPLICIT((!cuda::std::is_convertible::value)) // needs two () due + // to comma __MDSPAN_INLINE_FUNCTION constexpr mapping( cuda::std::layout_right::mapping const& other) noexcept // NOLINT(google-explicit-constructor) : __extents(other.extents()) {} - __MDSPAN_TEMPLATE_REQUIRES(class OtherExtents, - /* requires */ (_CCCL_TRAIT(cuda::std::is_constructible, extents_type, OtherExtents) - && (extents_type::rank() <= 1))) + _LIBCUDACXX_TEMPLATE(class OtherExtents) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(cuda::std::is_constructible, extents_type, OtherExtents) + && (extents_type::rank() <= 1)) __MDSPAN_CONDITIONAL_EXPLICIT((!cuda::std::is_convertible::value)) // needs two () due to // comma __MDSPAN_INLINE_FUNCTION constexpr mapping( @@ -126,8 +126,8 @@ class layout_foo::mapping : __extents(other.extents()) {} - __MDSPAN_TEMPLATE_REQUIRES(class OtherExtents, - /* requires */ (_CCCL_TRAIT(cuda::std::is_constructible, extents_type, OtherExtents))) + _LIBCUDACXX_TEMPLATE(class OtherExtents) + _LIBCUDACXX_REQUIRES(_CCCL_TRAIT(cuda::std::is_constructible, extents_type, OtherExtents)) __MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0)) __MDSPAN_INLINE_FUNCTION constexpr mapping( cuda::std::layout_stride::mapping const& other) // NOLINT(google-explicit-constructor) From 74739348a20a0efd4189d5545643192cdc1830d6 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Fri, 9 Aug 2024 08:36:41 -0700 Subject: [PATCH 14/33] Un-doc functions taking debug_synchronous (#2209) * undoc functions taking debug_synchronous --- cub/cub/device/device_adjacent_difference.cuh | 8 +++++ cub/cub/device/device_histogram.cuh | 16 ++++++++++ cub/cub/device/device_merge_sort.cuh | 12 +++++++ cub/cub/device/device_partition.cuh | 6 ++++ cub/cub/device/device_reduce.cuh | 14 ++++++++ cub/cub/device/device_run_length_encode.cuh | 4 +++ cub/cub/device/device_scan.cuh | 28 ++++++++++++++++ .../device/device_segmented_radix_sort.cuh | 16 ++++++++++ cub/cub/device/device_segmented_reduce.cuh | 12 +++++++ cub/cub/device/device_segmented_sort.cuh | 32 +++++++++++++++++++ cub/cub/device/device_select.cuh | 12 +++++++ cub/cub/device/device_spmv.cuh | 2 ++ .../dispatch/dispatch_adjacent_difference.cuh | 8 +++-- 
.../device/dispatch/dispatch_histogram.cuh | 20 ++++++++---- cub/cub/device/dispatch/dispatch_reduce.cuh | 16 +++++++--- .../dispatch/dispatch_reduce_by_key.cuh | 6 ++-- cub/cub/device/dispatch/dispatch_rle.cuh | 6 ++-- cub/cub/device/dispatch/dispatch_scan.cuh | 8 +++-- .../device/dispatch/dispatch_scan_by_key.cuh | 8 +++-- .../dispatch/dispatch_segmented_sort.cuh | 10 ++++-- .../device/dispatch/dispatch_select_if.cuh | 6 ++-- .../device/dispatch/dispatch_spmv_orig.cuh | 14 +++++--- .../dispatch/dispatch_three_way_partition.cuh | 6 ++-- .../dispatch/dispatch_unique_by_key.cuh | 8 +++-- 24 files changed, 244 insertions(+), 34 deletions(-) diff --git a/cub/cub/device/device_adjacent_difference.cuh b/cub/cub/device/device_adjacent_difference.cuh index 750f7a974c..53d8cc2cfb 100644 --- a/cub/cub/device/device_adjacent_difference.cuh +++ b/cub/cub/device/device_adjacent_difference.cuh @@ -267,6 +267,7 @@ public: d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractLeftCopy( void* d_temp_storage, @@ -282,6 +283,7 @@ public: return SubtractLeftCopy(d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Subtracts the left element of each adjacent pair of elements residing within device-accessible memory. @@ -394,6 +396,7 @@ public: d_temp_storage, temp_storage_bytes, d_input, d_input, num_items, difference_op, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractLeft( void* d_temp_storage, @@ -408,6 +411,7 @@ public: return SubtractLeft(d_temp_storage, temp_storage_bytes, d_input, num_items, difference_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Subtracts the right element of each adjacent pair of elements residing within device-accessible memory. @@ -539,6 +543,7 @@ public: d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractRightCopy( void* d_temp_storage, @@ -554,6 +559,7 @@ public: return SubtractRightCopy(d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Subtracts the right element of each adjacent pair of elements residing within device-accessible memory. 
@@ -655,6 +661,7 @@ public: d_temp_storage, temp_storage_bytes, d_input, d_input, num_items, difference_op, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractRight( void* d_temp_storage, @@ -669,6 +676,7 @@ public: return SubtractRight(d_temp_storage, temp_storage_bytes, d_input, num_items, difference_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END diff --git a/cub/cub/device/device_histogram.cuh b/cub/cub/device/device_histogram.cuh index 989342d0a7..46f4bee557 100644 --- a/cub/cub/device/device_histogram.cuh +++ b/cub/cub/device/device_histogram.cuh @@ -206,6 +206,7 @@ struct DeviceHistogram stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven( void* d_temp_storage, @@ -232,6 +233,7 @@ struct DeviceHistogram num_samples, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes an intensity histogram from a sequence of data samples using equal-width bins. @@ -384,6 +386,7 @@ struct DeviceHistogram stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven( void* d_temp_storage, @@ -414,6 +417,7 @@ struct DeviceHistogram row_stride_bytes, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using @@ -584,6 +588,7 @@ struct DeviceHistogram stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange( void* d_temp_storage, @@ -1008,6 +1017,7 @@ struct DeviceHistogram return HistogramRange( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_samples, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. @@ -1147,6 +1157,7 @@ struct DeviceHistogram stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange( void* d_temp_storage, @@ -1175,6 +1186,7 @@ struct DeviceHistogram row_stride_bytes, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples @@ -1334,6 +1346,7 @@ struct DeviceHistogram stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, @@ -262,6 +263,7 @@ public: return SortPairs( d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS /** * @brief Sorts items using a merge sorting method. 
@@ -409,6 +411,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, @@ -581,6 +586,7 @@ public: return SortKeys( d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS private: // Internal version without NVTX range @@ -723,6 +729,7 @@ public: d_temp_storage, temp_storage_bytes, d_input_keys, d_output_keys, num_items, compare_op, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysCopy( void* d_temp_storage, @@ -739,6 +746,7 @@ public: return SortKeysCopy( d_temp_storage, temp_storage_bytes, d_input_keys, d_output_keys, num_items, compare_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS /** * @brief Sorts items using a merge sorting method. @@ -849,6 +857,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs( void* d_temp_storage, @@ -865,6 +874,7 @@ public: return StableSortPairs( d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS /** * @brief Sorts items using a merge sorting method. @@ -966,6 +976,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys( void* d_temp_storage, @@ -981,6 +992,7 @@ public: return StableSortKeys( d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS /** * @brief Sorts items using a merge sorting method. diff --git a/cub/cub/device/device_partition.cuh b/cub/cub/device/device_partition.cuh index 5c24a0ec20..08a2ae531f 100644 --- a/cub/cub/device/device_partition.cuh +++ b/cub/cub/device/device_partition.cuh @@ -206,6 +206,7 @@ struct DevicePartition stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged( void* d_temp_storage, @@ -223,6 +224,7 @@ struct DevicePartition return Flagged( d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! 
Uses the ``select_op`` functor to split the corresponding items from ``d_in`` into @@ -367,6 +369,7 @@ struct DevicePartition stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t If(void* d_temp_storage, @@ -384,6 +387,7 @@ struct DevicePartition return If( d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS private: template (num_items), reduction_op, init, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Reduce( void* d_temp_storage, @@ -222,6 +223,7 @@ struct DeviceReduce return Reduce( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, init, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide sum using the addition (``+``) operator. @@ -328,6 +330,7 @@ struct DeviceReduce stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Sum(void* d_temp_storage, @@ -342,6 +345,7 @@ struct DeviceReduce return Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide minimum using the less-than (``<``) operator. @@ -452,6 +456,7 @@ struct DeviceReduce stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Min(void* d_temp_storage, @@ -466,6 +471,7 @@ struct DeviceReduce return Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Finds the first device-wide minimum using the less-than (``<``) operator, also returning the index of that item. @@ -585,6 +591,7 @@ struct DeviceReduce d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMin(), initial_value, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMin( void* d_temp_storage, @@ -599,6 +606,7 @@ struct DeviceReduce return ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide maximum using the greater-than (``>``) operator. @@ -707,6 +715,7 @@ struct DeviceReduce stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Max(void* d_temp_storage, @@ -721,6 +730,7 @@ struct DeviceReduce return Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Finds the first device-wide maximum using the greater-than (``>``) @@ -844,6 +854,7 @@ struct DeviceReduce d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMax(), initial_value, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMax( void* d_temp_storage, @@ -858,6 +869,7 @@ struct DeviceReduce return ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! 
Fuses transform and reduce operations @@ -1183,6 +1195,7 @@ struct DeviceReduce stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template ( d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Enumerates the starting offsets and lengths of all non-trivial runs @@ -382,6 +384,7 @@ struct DeviceRunLengthEncode stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template ( d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index c8a36f0255..29f3cf6c1e 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -194,6 +194,7 @@ struct DeviceScan d_temp_storage, temp_storage_bytes, d_in, d_out, Sum(), detail::InputValue(init_value), num_items, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( void* d_temp_storage, @@ -209,6 +210,7 @@ struct DeviceScan return ExclusiveSum( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide exclusive prefix sum in-place. @@ -283,6 +285,7 @@ struct DeviceScan return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( void* d_temp_storage, @@ -296,6 +299,7 @@ struct DeviceScan return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide exclusive prefix scan using the specified @@ -426,6 +430,7 @@ struct DeviceScan stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, @@ -443,6 +448,7 @@ struct DeviceScan return ExclusiveScan( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide exclusive prefix scan using the specified @@ -550,6 +556,7 @@ struct DeviceScan return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, @@ -566,6 +573,7 @@ struct DeviceScan return ExclusiveScan( d_temp_storage, temp_storage_bytes, d_data, scan_op, init_value, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide exclusive prefix scan using the specified @@ -704,6 +712,7 @@ struct DeviceScan stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template ( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide exclusive prefix scan using the specified binary ``scan_op`` functor. 
@@ -835,6 +845,7 @@ struct DeviceScan return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, @@ -851,6 +862,7 @@ struct DeviceScan return ExclusiveScan( d_temp_storage, temp_storage_bytes, d_data, scan_op, init_value, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @} end member group //! @name Inclusive scans @@ -949,6 +961,7 @@ struct DeviceScan d_temp_storage, temp_storage_bytes, d_in, d_out, Sum(), NullType(), num_items, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( void* d_temp_storage, @@ -964,6 +977,7 @@ struct DeviceScan return InclusiveSum( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide inclusive prefix sum in-place. @@ -1037,6 +1051,7 @@ struct DeviceScan return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( void* d_temp_storage, @@ -1050,6 +1065,7 @@ struct DeviceScan return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide inclusive prefix scan using the specified binary ``scan_op`` functor. @@ -1266,6 +1282,7 @@ struct DeviceScan stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, @@ -1282,6 +1299,7 @@ struct DeviceScan return InclusiveScan( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide inclusive prefix scan using the specified binary ``scan_op`` functor. @@ -1379,6 +1397,7 @@ struct DeviceScan return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, num_items, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, @@ -1393,6 +1412,7 @@ struct DeviceScan return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, scan_op, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide exclusive prefix sum-by-key with key equality @@ -1530,6 +1550,7 @@ struct DeviceScan stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template ( d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items, equality_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide exclusive prefix scan-by-key using the @@ -1729,6 +1751,7 @@ struct DeviceScan stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template ( d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items, equality_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! 
Computes a device-wide inclusive prefix scan-by-key using the @@ -2081,6 +2107,7 @@ struct DeviceScan stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template ( d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, scan_op, num_items, equality_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @} end member group }; diff --git a/cub/cub/device/device_segmented_radix_sort.cuh b/cub/cub/device/device_segmented_radix_sort.cuh index cff9c22bce..eb6eecdcf3 100644 --- a/cub/cub/device/device_segmented_radix_sort.cuh +++ b/cub/cub/device/device_segmented_radix_sort.cuh @@ -264,6 +264,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, @@ -298,6 +299,7 @@ public: end_bit, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of key-value pairs into ascending order. (``~N`` auxiliary storage required) @@ -473,6 +475,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, @@ -503,6 +506,7 @@ public: end_bit, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of key-value pairs into descending order. (``~2N`` auxiliary storage required). @@ -678,6 +682,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, @@ -712,6 +717,7 @@ public: end_bit, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of key-value pairs into descending order. (``~N`` auxiliary storage required). @@ -891,6 +897,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, @@ -921,6 +928,7 @@ public: end_bit, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @} end member group //! @name Keys-only @@ -1083,6 +1091,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, @@ -1113,6 +1122,7 @@ public: end_bit, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of keys into ascending order. (``~N`` auxiliary storage required). @@ -1280,6 +1290,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, @@ -1308,6 +1319,7 @@ public: end_bit, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of keys into descending order. (``~2N`` auxiliary storage required). @@ -1466,6 +1478,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, @@ -1496,6 +1509,7 @@ public: end_bit, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of keys into descending order. (``~N`` auxiliary storage required). 
@@ -1661,6 +1675,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, @@ -1689,6 +1704,7 @@ public: end_bit, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @} end member group }; diff --git a/cub/cub/device/device_segmented_reduce.cuh b/cub/cub/device/device_segmented_reduce.cuh index 90a1729685..6a0875734e 100644 --- a/cub/cub/device/device_segmented_reduce.cuh +++ b/cub/cub/device/device_segmented_reduce.cuh @@ -272,6 +272,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Sum(void* d_temp_storage, @@ -436,6 +439,7 @@ public: return Sum( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide segmented minimum using the less-than (``<``) operator. @@ -558,6 +562,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Min(void* d_temp_storage, @@ -575,6 +580,7 @@ public: return Min( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Finds the first device-wide minimum in each segment using the @@ -726,6 +732,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMin( void* d_temp_storage, @@ -743,6 +750,7 @@ public: return ArgMin( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Computes a device-wide segmented maximum using the greater-than (``>``) operator. @@ -859,6 +867,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Max(void* d_temp_storage, @@ -876,6 +885,7 @@ public: return Max( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! 
Finds the first device-wide maximum in each segment using the @@ -1030,6 +1040,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMax( void* d_temp_storage, @@ -1047,6 +1058,7 @@ public: return ArgMax( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END diff --git a/cub/cub/device/device_segmented_sort.cuh b/cub/cub/device/device_segmented_sort.cuh index 2aeb145c5d..67a22c5e54 100644 --- a/cub/cub/device/device_segmented_sort.cuh +++ b/cub/cub/device/device_segmented_sort.cuh @@ -306,6 +306,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, @@ -332,6 +333,7 @@ public: d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS private: // Internal version without NVTX range @@ -501,6 +503,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, @@ -527,6 +530,7 @@ public: d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS private: // Internal version without NVTX range @@ -698,6 +702,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, @@ -715,6 +720,7 @@ public: return SortKeys( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS private: // Internal version without NVTX range @@ -887,6 +893,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, @@ -904,6 +911,7 @@ public: return SortKeysDescending( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of keys into ascending order. Approximately @@ -1041,6 +1049,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys( void* d_temp_storage, @@ -1067,6 +1076,7 @@ public: d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of keys into descending order. @@ -1204,6 +1214,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeysDescending( void* d_temp_storage, @@ -1230,6 +1241,7 @@ public: d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of keys into ascending order. 
@@ -1369,6 +1381,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys( void* d_temp_storage, @@ -1386,6 +1399,7 @@ public: return StableSortKeys( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of keys into descending order. @@ -1524,6 +1538,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeysDescending( void* d_temp_storage, @@ -1541,6 +1556,7 @@ public: return StableSortKeysDescending( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS private: // Internal version without NVTX range @@ -1741,6 +1757,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, @@ -1771,6 +1788,7 @@ public: d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS private: // Internal version without NVTX range @@ -1967,6 +1985,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, @@ -1997,6 +2016,7 @@ public: d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS private: // Internal version without NVTX range @@ -2193,6 +2213,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, @@ -2219,6 +2240,7 @@ public: d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS private: // Internal version without NVTX range @@ -2414,6 +2436,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, @@ -2440,6 +2463,7 @@ public: d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of key-value pairs into ascending order. @@ -2599,6 +2623,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs( void* d_temp_storage, @@ -2629,6 +2654,7 @@ public: d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of key-value pairs into descending order. @@ -2788,6 +2814,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairsDescending( void* d_temp_storage, @@ -2818,6 +2845,7 @@ public: d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of key-value pairs into ascending order. 
@@ -2983,6 +3011,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs( void* d_temp_storage, @@ -3009,6 +3038,7 @@ public: d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sorts segments of key-value pairs into descending order. @@ -3173,6 +3203,7 @@ public: stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairsDescending( void* d_temp_storage, @@ -3199,6 +3230,7 @@ public: d_end_offsets, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @} end member group }; diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh index 3113d6ca82..703a912829 100644 --- a/cub/cub/device/device_select.cuh +++ b/cub/cub/device/device_select.cuh @@ -203,6 +203,7 @@ struct DeviceSelect stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged( void* d_temp_storage, @@ -220,6 +221,7 @@ struct DeviceSelect return Flagged( d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Uses the ``d_flags`` sequence to selectively compact the items in `d_data``. @@ -339,6 +341,7 @@ struct DeviceSelect stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged( void* d_temp_storage, @@ -355,6 +358,7 @@ struct DeviceSelect return Flagged( d_temp_storage, temp_storage_bytes, d_data, d_flags, d_num_selected_out, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Uses the ``select_op`` functor to selectively copy items from ``d_in`` into ``d_out``. @@ -494,6 +498,7 @@ struct DeviceSelect stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t If(void* d_temp_storage, @@ -511,6 +516,7 @@ struct DeviceSelect return If( d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Uses the ``select_op`` functor to selectively compact items in ``d_data``. @@ -642,6 +648,7 @@ struct DeviceSelect stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t If(void* d_temp_storage, @@ -658,6 +665,7 @@ struct DeviceSelect return If( d_temp_storage, temp_storage_bytes, d_data, d_num_selected_out, num_items, select_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Uses the ``select_op`` functor applied to ``d_flags`` to selectively copy the @@ -1003,6 +1011,7 @@ struct DeviceSelect stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Unique( void* d_temp_storage, @@ -1019,6 +1028,7 @@ struct DeviceSelect return Unique( d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! 
Given an input sequence ``d_keys_in`` and ``d_values_in`` with runs of key-value pairs with consecutive @@ -1320,6 +1330,7 @@ struct DeviceSelect stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template ::Dispatch(d_temp_storage, temp_storage_bytes, spmv_params, stream); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t CsrMV( void* d_temp_storage, @@ -237,6 +238,7 @@ struct DeviceSpmv num_nonzeros, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS //! @} end member group }; diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index dd161cf934..d4ae6ecddd 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -169,6 +169,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy , stream(stream) {} +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_DEPRECATED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchAdjacentDifference( void* d_temp_storage, @@ -189,6 +190,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } +#endif // DOXYGEN_SHOULD_SKIP_THIS /// Invocation template @@ -250,7 +252,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy init_grid_size, init_block_size, reinterpret_cast(stream)); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, init_block_size, 0, stream) .doit(DeviceAdjacentDifferenceInitKernel, @@ -280,7 +282,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, reinterpret_cast(stream)); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, 0, stream) @@ -354,6 +356,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Dispatch( void* d_temp_storage, @@ -369,6 +372,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy return Dispatch(d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index 05a82ec200..c623cda9a2 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -440,7 +440,7 @@ struct dispatch_histogram histogram_init_grid_dims, histogram_init_block_threads, (long long) stream); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke histogram_init_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -464,7 +464,7 @@ struct dispatch_histogram (long long) stream, pixels_per_thread, histogram_sweep_sm_occupancy); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke histogram_sweep_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(sweep_grid_dims, block_threads, 0, stream) @@ -657,9 +657,9 @@ public: ::cuda::std::is_same::value), // CommonT, // uint64_t> // -#else +#else // ^^^ CUB_IS_INT128_ENABLED ^^^ / vvv !CUB_IS_INT128_ENABLED vvv uint64_t -#endif +#endif // !CUB_IS_INT128_ENABLED 
>; // Alias template that excludes __[u]int128 from the integral types @@ -669,9 +669,9 @@ public: ::cuda::std::_If<::cuda::std::is_same::value&& ::cuda::std::is_same::value, ::cuda::std::false_type, ::cuda::std::is_integral>; -#else +#else // ^^^ CUB_IS_INT128_ENABLED ^^^ / vvv !CUB_IS_INT128_ENABLED vvv ::cuda::std::is_integral; -#endif +#endif // !CUB_IS_INT128_ENABLED union ScaleT { @@ -1036,6 +1036,7 @@ public: return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t DispatchRange( void* d_temp_storage, @@ -1066,6 +1067,7 @@ public: stream, is_byte_sample); } +#endif // DOXYGEN_SHOULD_SKIP_THIS /** * Dispatch routine for HistogramRange, specialized for 8-bit sample types @@ -1200,6 +1202,7 @@ public: return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t DispatchRange( void* d_temp_storage, @@ -1230,6 +1233,7 @@ public: stream, is_byte_sample); } +#endif // DOXYGEN_SHOULD_SKIP_THIS /** * Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit @@ -1416,6 +1420,7 @@ public: return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t DispatchEven( void* d_temp_storage, @@ -1448,6 +1453,7 @@ public: stream, is_byte_sample); } +#endif // DOXYGEN_SHOULD_SKIP_THIS /** * Dispatch routine for HistogramEven, specialized for 8-bit sample types @@ -1586,6 +1592,7 @@ public: return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t DispatchEven( void* d_temp_storage, size_t& temp_storage_bytes, @@ -1617,6 +1624,7 @@ public: stream, is_byte_sample); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index 262bcc2623..3b3c0c903e 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -609,6 +609,7 @@ struct DispatchReduce : SelectedPolicy , transform_op(transform_op) {} +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchReduce( void* d_temp_storage, @@ -633,6 +634,7 @@ struct DispatchReduce : SelectedPolicy { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } +#endif // DOXYGEN_SHOULD_SKIP_THIS //--------------------------------------------------------------------------- // Small-problem (single tile) invocation @@ -673,7 +675,7 @@ struct DispatchReduce : SelectedPolicy ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream, ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke single_reduce_sweep_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -795,7 +797,7 @@ struct DispatchReduce : SelectedPolicy (long long) stream, ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD, reduce_config.sm_occupancy); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke DeviceReduceKernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -823,7 +825,7 @@ struct DispatchReduce : SelectedPolicy ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream, ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke 
DeviceReduceSingleTileKernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -977,6 +979,7 @@ struct DispatchReduce : SelectedPolicy return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -993,6 +996,7 @@ struct DispatchReduce : SelectedPolicy return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, init, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; /** @@ -1151,6 +1155,7 @@ struct DispatchSegmentedReduce : SelectedPolicy , ptx_version(ptx_version) {} +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchSegmentedReduce( void* d_temp_storage, @@ -1179,6 +1184,7 @@ struct DispatchSegmentedReduce : SelectedPolicy { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } +#endif // DOXYGEN_SHOULD_SKIP_THIS //--------------------------------------------------------------------------- // Chained policy invocation @@ -1231,7 +1237,7 @@ struct DispatchSegmentedReduce : SelectedPolicy (long long) stream, ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD, segmented_reduce_config.sm_occupancy); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke DeviceReduceKernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -1379,6 +1385,7 @@ struct DispatchSegmentedReduce : SelectedPolicy return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -1407,6 +1414,7 @@ struct DispatchSegmentedReduce : SelectedPolicy init, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index 8d5c3fb699..07dd492a53 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -348,7 +348,7 @@ struct DispatchReduceByKey #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -405,7 +405,7 @@ struct DispatchReduceByKey (long long) stream, items_per_thread, reduce_by_key_sm_occupancy); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke reduce_by_key_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) @@ -550,6 +550,7 @@ struct DispatchReduceByKey return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -580,6 +581,7 @@ struct DispatchReduceByKey num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index 9c6c32a95c..917b5df37b 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -354,7 +354,7 @@ struct DeviceRleDispatch init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG 
// Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -415,7 +415,7 @@ struct DeviceRleDispatch (long long) stream, items_per_thread, device_rle_kernel_sm_occupancy); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke device_rle_sweep_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) @@ -543,6 +543,7 @@ struct DeviceRleDispatch return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, @@ -568,6 +569,7 @@ struct DeviceRleDispatch num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index ac82b5cd2b..56c2be9611 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -330,6 +330,7 @@ struct DispatchScan : SelectedPolicy , ptx_version(ptx_version) {} +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchScan( void* d_temp_storage, @@ -354,6 +355,7 @@ struct DispatchScan : SelectedPolicy { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } +#endif // DOXYGEN_SHOULD_SKIP_THIS template CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel) @@ -426,7 +428,7 @@ struct DispatchScan : SelectedPolicy #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -478,7 +480,7 @@ struct DispatchScan : SelectedPolicy (long long) stream, Policy::ITEMS_PER_THREAD, scan_sm_occupancy); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke scan_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream) @@ -591,6 +593,7 @@ struct DispatchScan : SelectedPolicy return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -607,6 +610,7 @@ struct DispatchScan : SelectedPolicy return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value, num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh index eac364d77e..032554773a 100644 --- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -339,6 +339,7 @@ struct DispatchScanByKey : SelectedPolicy , ptx_version(ptx_version) {} +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchScanByKey( void* d_temp_storage, @@ -367,6 +368,7 @@ struct DispatchScanByKey : SelectedPolicy { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } +#endif // DOXYGEN_SHOULD_SKIP_THIS template CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(InitKernel 
init_kernel, ScanKernel scan_kernel) @@ -436,7 +438,7 @@ struct DispatchScanByKey : SelectedPolicy int init_grid_size = cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -489,7 +491,7 @@ struct DispatchScanByKey : SelectedPolicy (long long) stream, Policy::ITEMS_PER_THREAD, scan_sm_occupancy); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke scan_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream) @@ -631,6 +633,7 @@ struct DispatchScanByKey : SelectedPolicy return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -659,6 +662,7 @@ struct DispatchScanByKey : SelectedPolicy num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index 84c81f34a9..702df00df3 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -548,7 +548,7 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN cudaError_t DeviceSegmentedSortCont static_cast(blocks_in_grid), LargeSegmentPolicyT::BLOCK_THREADS, (long long) stream); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( blocks_in_grid, LargeSegmentPolicyT::BLOCK_THREADS, 0, stream) @@ -596,7 +596,7 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN cudaError_t DeviceSegmentedSortCont static_cast(small_and_medium_blocks_in_grid), SmallAndMediumPolicyT::BLOCK_THREADS, (long long) stream); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( small_and_medium_blocks_in_grid, SmallAndMediumPolicyT::BLOCK_THREADS, 0, stream) @@ -1131,6 +1131,7 @@ struct DispatchSegmentedSort : SelectedPolicy , stream(stream) {} +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchSegmentedSort( void* d_temp_storage, @@ -1157,6 +1158,7 @@ struct DispatchSegmentedSort : SelectedPolicy { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } +#endif // DOXYGEN_SHOULD_SKIP_THIS template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() @@ -1438,6 +1440,7 @@ struct DispatchSegmentedSort : SelectedPolicy return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -1466,6 +1469,7 @@ struct DispatchSegmentedSort : SelectedPolicy is_overwrite_okay, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS private: CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE int GetNumPasses(int radix_bits) @@ -1651,7 +1655,7 @@ private: (long long) stream, LargeSegmentPolicyT::ITEMS_PER_THREAD, LargeSegmentPolicyT::RADIX_BITS); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke fallback kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(blocks_in_grid, threads_in_block, 0, stream) diff --git 
a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index 24c25b3679..50a2022184 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -467,7 +467,7 @@ struct DispatchSelectIf : SelectedPolicy #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog( "Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke scan_init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -530,7 +530,7 @@ struct DispatchSelectIf : SelectedPolicy items_per_thread, range_select_sm_occupancy); } -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke select_if_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) @@ -656,6 +656,7 @@ struct DispatchSelectIf : SelectedPolicy return CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -684,6 +685,7 @@ struct DispatchSelectIf : SelectedPolicy num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh index 95e51ebdba..0519dcc739 100644 --- a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh +++ b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -631,7 +631,7 @@ struct DispatchSpmv blocks_in_grid, threads_in_block, (long long) stream); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(blocks_in_grid, threads_in_block, 0, stream) .doit(spmv_empty_matrix_kernel, spmv_params); @@ -668,7 +668,7 @@ struct DispatchSpmv degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke spmv_search_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -795,7 +795,7 @@ struct DispatchSpmv search_grid_size, search_block_size, (long long) stream); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke spmv_search_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(search_grid_size, search_block_size, 0, stream) @@ -825,7 +825,7 @@ struct DispatchSpmv (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke spmv_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(spmv_grid_size, spmv_config.block_threads, 0, stream) @@ -863,7 +863,7 @@ struct DispatchSpmv (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke segment_fixup_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -893,6 +893,7 @@ struct DispatchSpmv return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template (stream)); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke three_way_partition_init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -345,7 +345,7 @@ struct DispatchThreeWayPartitionIf items_per_thread, range_select_sm_occupancy); } -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke 
select_if_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) @@ -452,6 +452,7 @@ struct DispatchThreeWayPartitionIf return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -482,6 +483,7 @@ struct DispatchThreeWayPartitionIf num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh index 1d097a93a0..c943034221 100644 --- a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh @@ -322,6 +322,7 @@ struct DispatchUniqueByKey : SelectedPolicy , stream(stream) {} +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchUniqueByKey( void* d_temp_storage, @@ -348,6 +349,7 @@ struct DispatchUniqueByKey : SelectedPolicy { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } +#endif // DOXYGEN_SHOULD_SKIP_THIS /****************************************************************************** * Dispatch entrypoints @@ -425,7 +427,7 @@ struct DispatchUniqueByKey : SelectedPolicy #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -488,7 +490,7 @@ struct DispatchUniqueByKey : SelectedPolicy items_per_thread, scan_sm_occupancy); } -#endif +#endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke select_if_kernel error = @@ -629,6 +631,7 @@ struct DispatchUniqueByKey : SelectedPolicy return error; } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -657,6 +660,7 @@ struct DispatchUniqueByKey : SelectedPolicy num_items, stream); } +#endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END From a3a5f9c227c63c8f328ed2c181f0e935cc713eda Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Fri, 9 Aug 2024 10:46:24 -0700 Subject: [PATCH 15/33] CUDA `vector_add` sample project (#2160) --------- Co-authored-by: pciolkosz Co-authored-by: Michael Schellenberger Costa --- cudax/CMakeLists.txt | 13 +- cudax/cmake/cudaxBuildCompilerTargets.cmake | 2 +- cudax/cmake/cudaxBuildTargetList.cmake | 1 + .../cuda/experimental/__detail/utility.cuh | 19 ++- .../cuda/experimental/__launch/param_kind.cuh | 85 ++++++++++ cudax/include/cuda/experimental/launch.cuh | 3 + cudax/samples/CMakeLists.txt | 76 +++++++++ cudax/samples/cmake/CPM.cmake | 33 ++++ cudax/samples/vector_add/param_kind.cuh | 85 ++++++++++ cudax/samples/vector_add/vector.cuh | 151 ++++++++++++++++++ cudax/samples/vector_add/vector_add.cu | 127 +++++++++++++++ 11 files changed, 589 insertions(+), 6 deletions(-) create mode 100644 cudax/include/cuda/experimental/__launch/param_kind.cuh create mode 100755 cudax/samples/CMakeLists.txt create mode 100755 cudax/samples/cmake/CPM.cmake create mode 100644 cudax/samples/vector_add/param_kind.cuh create mode 100644 cudax/samples/vector_add/vector.cuh create mode 100644 cudax/samples/vector_add/vector_add.cu diff --git 
a/cudax/CMakeLists.txt b/cudax/CMakeLists.txt index 4886562aca..f875cf8ebf 100644 --- a/cudax/CMakeLists.txt +++ b/cudax/CMakeLists.txt @@ -11,7 +11,7 @@ if (cudax_TOPLEVEL_PROJECT) cmake_minimum_required(VERSION 3.21) endif() -project(cudax LANGUAGES CUDA) +project(cudax LANGUAGES CUDA CXX) option(cudax_ENABLE_INSTALL_RULES "Enable installation of CUDA Experimental." ${cudax_TOPLEVEL_PROJECT}) if (cudax_ENABLE_INSTALL_RULES) @@ -25,6 +25,7 @@ endif() option(cudax_ENABLE_HEADER_TESTING "Test that CUDA Experimental's public headers compile." ON) option(cudax_ENABLE_TESTING "Build CUDA Experimental's tests." ON) +option(cudax_ENABLE_SAMPLES "Build CUDA Experimental's samples." ON) include(cmake/cudaxBuildCompilerTargets.cmake) include(cmake/cudaxBuildTargetList.cmake) @@ -41,3 +42,13 @@ if (cudax_ENABLE_TESTING) enable_testing() # Must be in root directory add_subdirectory(test) endif() + +if (cudax_ENABLE_SAMPLES) + include(ExternalProject) + ExternalProject_Add(cudax_samples + PREFIX samples + SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/samples" + BUILD_ALWAYS ON + INSTALL_COMMAND cmake -E echo "Skipping install step.") + add_dependencies(cudax.all cudax_samples) +endif() diff --git a/cudax/cmake/cudaxBuildCompilerTargets.cmake b/cudax/cmake/cudaxBuildCompilerTargets.cmake index 73aa9e376e..53cf7b8af4 100644 --- a/cudax/cmake/cudaxBuildCompilerTargets.cmake +++ b/cudax/cmake/cudaxBuildCompilerTargets.cmake @@ -9,7 +9,7 @@ include("${cudax_SOURCE_DIR}/cmake/AppendOptionIfAvailable.cmake") function(cudax_build_compiler_targets) - set(cxx_compile_definitions) + set(cxx_compile_definitions LIBCUDACXX_ENABLE_EXCEPTIONS) set(cxx_compile_options) set(cuda_compile_options) diff --git a/cudax/cmake/cudaxBuildTargetList.cmake b/cudax/cmake/cudaxBuildTargetList.cmake index 63284dbe4a..2be17393dc 100644 --- a/cudax/cmake/cudaxBuildTargetList.cmake +++ b/cudax/cmake/cudaxBuildTargetList.cmake @@ -176,6 +176,7 @@ function(cudax_build_target_list) file(GLOB_RECURSE all_sources RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${cudax_SOURCE_DIR}/include/cuda/experimental/*.hpp" + "${cudax_SOURCE_DIR}/include/cuda/experimental/*.cuh" ) add_custom_target(cudax.all SOURCES ${all_sources}) diff --git a/cudax/include/cuda/experimental/__detail/utility.cuh b/cudax/include/cuda/experimental/__detail/utility.cuh index 738a5d6244..1263ea880f 100644 --- a/cudax/include/cuda/experimental/__detail/utility.cuh +++ b/cudax/include/cuda/experimental/__detail/utility.cuh @@ -25,12 +25,23 @@ namespace cuda::experimental { namespace detail { -struct __ignore +// This is a helper type that can be used to ignore function arguments. +struct [[maybe_unused]] __ignore { - template - _CCCL_HOST_DEVICE constexpr __ignore(Args&&...) noexcept + __ignore() = default; + + template + _CCCL_HOST_DEVICE constexpr __ignore(_Arg&&) noexcept {} }; + +// Classes can inherit from this type to become immovable. 
+struct __immovable +{ + __immovable() = default; + __immovable(__immovable&&) = delete; + __immovable& operator=(__immovable&&) = delete; +}; } // namespace detail struct uninit_t @@ -38,7 +49,7 @@ struct uninit_t explicit uninit_t() = default; }; -inline constexpr uninit_t uninit{}; +_CCCL_GLOBAL_CONSTANT uninit_t uninit{}; } // namespace cuda::experimental #endif // __CUDAX_DETAIL_UTILITY_H diff --git a/cudax/include/cuda/experimental/__launch/param_kind.cuh b/cudax/include/cuda/experimental/__launch/param_kind.cuh new file mode 100644 index 0000000000..d50ebe49d3 --- /dev/null +++ b/cudax/include/cuda/experimental/__launch/param_kind.cuh @@ -0,0 +1,85 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX__LAUNCH_PARAM_KIND +#define _CUDAX__LAUNCH_PARAM_KIND + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include + +namespace cuda::experimental +{ +namespace detail +{ +enum class __param_kind : unsigned +{ + _in = 1, + _out = 2, + _inout = 3 +}; + +_CCCL_NODISCARD _CCCL_HOST_DEVICE inline constexpr __param_kind operator&(__param_kind __a, __param_kind __b) noexcept +{ + return __param_kind(unsigned(__a) & unsigned(__b)); +} + +template +struct _CCCL_NODISCARD __box +{ + ::cuda::std::__maybe_const<_Kind == __param_kind::_in, _Ty>& __val; +}; + +struct __in_t +{ + template + __box<_Ty, __param_kind::_in> operator()(const _Ty& __v) const noexcept + { + return {__v}; + } +}; + +struct __out_t +{ + template + __box<_Ty, __param_kind::_out> operator()(_Ty& __v) const noexcept + { + return {__v}; + } +}; + +struct __inout_t +{ + template + __box<_Ty, __param_kind::_inout> operator()(_Ty& __v) const noexcept + { + return {__v}; + } +}; + +} // namespace detail + +_CCCL_GLOBAL_CONSTANT detail::__in_t in{}; +_CCCL_GLOBAL_CONSTANT detail::__out_t out{}; +_CCCL_GLOBAL_CONSTANT detail::__inout_t inout{}; + +} // namespace cuda::experimental + +#endif // _CUDAX__LAUNCH_PARAM_KIND diff --git a/cudax/include/cuda/experimental/launch.cuh b/cudax/include/cuda/experimental/launch.cuh index 69048248ef..0bac26aa01 100644 --- a/cudax/include/cuda/experimental/launch.cuh +++ b/cudax/include/cuda/experimental/launch.cuh @@ -11,6 +11,9 @@ #ifndef __CUDAX_LAUNCH___ #define __CUDAX_LAUNCH___ +#include #include +#include +#include #endif // __CUDAX_LAUNCH___ diff --git a/cudax/samples/CMakeLists.txt b/cudax/samples/CMakeLists.txt new file mode 100755 index 0000000000..df0985c1ad --- /dev/null +++ b/cudax/samples/CMakeLists.txt @@ -0,0 +1,76 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +cmake_minimum_required(VERSION 3.14 FATAL_ERROR) + +project(CUDAX_SAMPLES CUDA CXX) + +# This example uses the CMake Package Manager (CPM) to simplify fetching CCCL from GitHub +# For more information, see https://github.com/cpm-cmake/CPM.cmake +include(cmake/CPM.cmake) + +# We define these as variables so they can be overridden in CI to pull from a PR instead of CCCL `main` +# In your project, these variables are unnecessary and you can just use the values directly +set(CCCL_REPOSITORY "nvidia/cccl" CACHE STRING "GitHub repository to fetch CCCL from") +set(CCCL_TAG "main" CACHE STRING "Git tag/branch to fetch from CCCL repository") + +# This will automatically clone CCCL from GitHub and make the exported cmake targets available +CPMAddPackage( + NAME CCCL + GITHUB_REPOSITORY ${CCCL_REPOSITORY} + GIT_TAG ${CCCL_TAG} + GIT_SHALLOW ON + OPTIONS "CCCL_ENABLE_UNSTABLE ON" +) + +# Default to building for the GPU on the current system +if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES 86) +endif() + +# Creates a cmake executable target for the main program +add_executable(vector_add vector_add/vector_add.cu) + +# "Links" the CCCL::cudax CMake target to the `vector_add` executable. This +# configures everything needed to use CCCL's headers, including setting up +# include paths, compiler flags, etc. +target_link_libraries(vector_add + PUBLIC + CCCL::cudax + CCCL::CCCL + CCCL::Thrust + CCCL::libcudacxx + INTERFACE cudax.compiler_interface +) + +# TODO: These are temporary until the main branch catches up with the latest changes +target_compile_definitions(vector_add PUBLIC LIBCUDACXX_ENABLE_EXCEPTIONS) + +if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") + # mdspan on windows only works in C++20 mode + target_compile_features(vector_add PUBLIC cxx_std_20) + + # cudax requires dim3 to be usable from a constexpr context, and the CUDART headers require + # __cplusplus to be defined for this to work: + target_compile_options(vector_add PRIVATE + $<$:/Zc:__cplusplus /Zc:preprocessor> + $<$:-Xcompiler=/Zc:__cplusplus -Xcompiler=/Zc:preprocessor> + ) +endif() + +# This is only relevant for internal testing and not needed by end users. +include(CTest) +enable_testing() +add_test(NAME vector_add COMMAND vector_add) diff --git a/cudax/samples/cmake/CPM.cmake b/cudax/samples/cmake/CPM.cmake new file mode 100755 index 0000000000..a3086b791b --- /dev/null +++ b/cudax/samples/cmake/CPM.cmake @@ -0,0 +1,33 @@ +set(CPM_DOWNLOAD_VERSION 0.38.1) + +if(CPM_SOURCE_CACHE) + set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") +elseif(DEFINED ENV{CPM_SOURCE_CACHE}) + set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") +else() + set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake") +endif() + +# Expand relative path.
This is important if the provided path contains a tilde (~) +get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE) + +function(download_cpm) + message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}") + file(DOWNLOAD + https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake + ${CPM_DOWNLOAD_LOCATION} + ) +endfunction() + +if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION})) + download_cpm() +else() + # resume download if it previously failed + file(READ ${CPM_DOWNLOAD_LOCATION} check) + if("${check}" STREQUAL "") + download_cpm() + endif() + unset(check) +endif() + +include(${CPM_DOWNLOAD_LOCATION}) diff --git a/cudax/samples/vector_add/param_kind.cuh b/cudax/samples/vector_add/param_kind.cuh new file mode 100644 index 0000000000..d50ebe49d3 --- /dev/null +++ b/cudax/samples/vector_add/param_kind.cuh @@ -0,0 +1,85 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX__LAUNCH_PARAM_KIND +#define _CUDAX__LAUNCH_PARAM_KIND + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include + +namespace cuda::experimental +{ +namespace detail +{ +enum class __param_kind : unsigned +{ + _in = 1, + _out = 2, + _inout = 3 +}; + +_CCCL_NODISCARD _CCCL_HOST_DEVICE inline constexpr __param_kind operator&(__param_kind __a, __param_kind __b) noexcept +{ + return __param_kind(unsigned(__a) & unsigned(__b)); +} + +template +struct _CCCL_NODISCARD __box +{ + ::cuda::std::__maybe_const<_Kind == __param_kind::_in, _Ty>& __val; +}; + +struct __in_t +{ + template + __box<_Ty, __param_kind::_in> operator()(const _Ty& __v) const noexcept + { + return {__v}; + } +}; + +struct __out_t +{ + template + __box<_Ty, __param_kind::_out> operator()(_Ty& __v) const noexcept + { + return {__v}; + } +}; + +struct __inout_t +{ + template + __box<_Ty, __param_kind::_inout> operator()(_Ty& __v) const noexcept + { + return {__v}; + } +}; + +} // namespace detail + +_CCCL_GLOBAL_CONSTANT detail::__in_t in{}; +_CCCL_GLOBAL_CONSTANT detail::__out_t out{}; +_CCCL_GLOBAL_CONSTANT detail::__inout_t inout{}; + +} // namespace cuda::experimental + +#endif // _CUDAX__LAUNCH_PARAM_KIND diff --git a/cudax/samples/vector_add/vector.cuh b/cudax/samples/vector_add/vector.cuh new file mode 100644 index 0000000000..7eef87f038 --- /dev/null +++ b/cudax/samples/vector_add/vector.cuh @@ -0,0 +1,151 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX__CONTAINER_VECTOR +#define _CUDAX__CONTAINER_VECTOR + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include +#include +#include + +#include + +#include "param_kind.cuh" + +#if _CCCL_STD_VER >= 2017 +namespace cuda::experimental +{ +using ::cuda::std::span; +using ::thrust::device_vector; +using ::thrust::host_vector; + +template +class vector +{ +public: + vector() = default; + explicit vector(size_t __n) + : __h_(__n) + {} + + _Ty& operator[](size_t __i) noexcept + { + __dirty_ = true; + return __h_[__i]; + } + + const _Ty& operator[](size_t __i) const noexcept + { + return __h_[__i]; + } + +private: + void sync_host_to_device(stream_ref __str, detail::__param_kind __p) const + { + if (__dirty_) + { + if (__p == detail::__param_kind::_out) + { + // There's no need to copy the data from host to device if the data is + // only going to be written to. We can just allocate the device memory. + __d_.resize(__h_.size()); + } + else + { + // TODO: use a memcpy async here + __d_ = __h_; + } + __dirty_ = false; + } + } + + void sync_device_to_host(stream_ref __str, detail::__param_kind __p) const + { + if (__p != detail::__param_kind::_in) + { + // TODO: use a memcpy async here + __str.wait(); // wait for the kernel to finish executing + __h_ = __d_; + } + } + + template + class __action //: private detail::__immovable + { + using __cv_vector = ::cuda::std::__maybe_const<_Kind == detail::__param_kind::_in, vector>; + + public: + explicit __action(stream_ref __str, __cv_vector& __v) noexcept + : __str_(__str) + , __v_(__v) + { + __v_.sync_host_to_device(__str_, _Kind); + } + + __action(__action&&) = delete; + + ~__action() + { + __v_.sync_device_to_host(__str_, _Kind); + } + + using __as_kernel_arg = ::cuda::std::span<_Ty>; + + operator ::cuda::std::span<_Ty>() + { + return {__v_.__d_.data().get(), __v_.__d_.size()}; + } + + private: + stream_ref __str_; + __cv_vector& __v_; + }; + + _CCCL_NODISCARD_FRIEND __action + __cudax_launch_transform(stream_ref __str, vector& __v) noexcept + { + return __action{__str, __v}; + } + + _CCCL_NODISCARD_FRIEND __action + __cudax_launch_transform(stream_ref __str, const vector& __v) noexcept + { + return __action{__str, __v}; + } + + template + _CCCL_NODISCARD_FRIEND __action<_Kind> + __cudax_launch_transform(stream_ref __str, detail::__box __b) noexcept + { + return __action<_Kind>{__str, __b.__val}; + } + + mutable host_vector<_Ty> __h_; + mutable device_vector<_Ty> __d_{}; + mutable bool __dirty_ = true; +}; + +} // namespace cuda::experimental + +#endif +#endif diff --git a/cudax/samples/vector_add/vector_add.cu b/cudax/samples/vector_add/vector_add.cu new file mode 100644 index 0000000000..784997e23d --- /dev/null +++ b/cudax/samples/vector_add/vector_add.cu @@ -0,0 +1,127 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * Vector addition: C = A + B. + * + * This sample is a very basic sample that implements element by element + * vector addition. It is the same as the sample illustrating Chapter 2 + * of the programming guide with some additions like error checking. + */ + +#include + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +#include + +#include +#include + +#include "vector.cuh" + +namespace cudax = cuda::experimental; +using cudax::in; +using cudax::out; + +/** + * CUDA Kernel Device code + * + * Computes the vector addition of A and B into C. The 3 vectors have the same + * number of elements numElements. + */ +__global__ void vectorAdd(cudax::span A, cudax::span B, cudax::span C) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < A.size()) + { + C[i] = A[i] + B[i] + 0.0f; + } +} + +/** + * Host main routine + */ +int main(void) +try +{ + // A CUDA stream on which to execute the vector addition kernel + cudax::stream stream(cudax::devices[0]); + + // Print the vector length to be used, and compute its size + int numElements = 50000; + printf("[Vector addition of %d elements]\n", numElements); + + // Allocate the host vectors + cudax::vector A(numElements); // input + cudax::vector B(numElements); // input + cudax::vector C(numElements); // output + + // Initialize the host input vectors + for (int i = 0; i < numElements; ++i) + { + A[i] = rand() / (float) RAND_MAX; + B[i] = rand() / (float) RAND_MAX; + } + + // Define the kernel launch parameters + constexpr int threadsPerBlock = 256; + auto dims = cudax::distribute(numElements); + + // Launch the vectorAdd kernel + printf("CUDA kernel launch with %d blocks of %d threads\n", dims.count(cudax::block, cudax::grid), threadsPerBlock); + cudax::launch(stream, dims, vectorAdd, in(A), in(B), out(C)); + + printf("waiting for the stream to finish\n"); + stream.wait(); + + printf("verifying the results\n"); + // Verify that the result vector is correct + for (int i = 0; i < numElements; ++i) + { + if (fabs(A[i] + B[i] - C[i]) > 1e-5) + { + fprintf(stderr, "Result verification failed at element %d!\n", i); + exit(EXIT_FAILURE); + } + } + + printf("Test PASSED\n"); + + printf("Done\n"); + return 0; +} +catch (const std::exception& e) +{ + printf("caught an exception: \"%s\"\n", e.what()); +} +catch (...)
+{ + printf("caught an unknown exception\n"); +} From 6ee3415a8d0eea82d0d6f9915aa249a6ceb13e24 Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Mon, 12 Aug 2024 10:23:40 -0700 Subject: [PATCH 16/33] avoid constraint recursion in the `resource` concept (#2215) drive-by: avoid potential overload ambiguity in `__launch_transform` --- .../__launch/launch_transform.cuh | 34 +++++++++---------- .../cuda_managed_memory_resource.h | 6 ++-- .../cuda_pinned_memory_resource.h | 6 ++-- .../include/cuda/__memory_resource/resource.h | 24 +++++++++++-- .../equality.pass.cpp | 7 ++++ .../cuda_memory_resource/equality.pass.cpp | 7 ++++ .../equality.pass.cpp | 7 ++++ 7 files changed, 66 insertions(+), 25 deletions(-) diff --git a/cudax/include/cuda/experimental/__launch/launch_transform.cuh b/cudax/include/cuda/experimental/__launch/launch_transform.cuh index 4692cf9376..b131ccdfaa 100644 --- a/cudax/include/cuda/experimental/__launch/launch_transform.cuh +++ b/cudax/include/cuda/experimental/__launch/launch_transform.cuh @@ -32,17 +32,7 @@ namespace cuda::experimental namespace detail { // Types should define overloads of __cudax_launch_transform that are find-able -// by ADL in order to customize how cudax::launch handles that type. The -// overload below, which simply returns the argument unmodified, is the overload -// that gets chosen if no other overload matches. It takes __ignore as the first -// argument to make this overload less preferred than other overloads that take -// a stream_ref as the first argument. -template -_CCCL_NODISCARD constexpr _Arg&& __cudax_launch_transform(__ignore, _Arg&& __arg) noexcept -{ - return _CUDA_VSTD::forward<_Arg>(__arg); -} - +// by ADL in order to customize how cudax::launch handles that type. template using __launch_transform_direct_result_t = decltype(__cudax_launch_transform(::cuda::stream_ref{}, _CUDA_VSTD::declval<_Arg>())); @@ -50,25 +40,35 @@ using __launch_transform_direct_result_t = struct __fn { template - _CCCL_NODISCARD __launch_transform_direct_result_t<_Arg> operator()(::cuda::stream_ref __stream, _Arg&& __arg) const + _CCCL_NODISCARD decltype(auto) operator()(::cuda::stream_ref __stream, _Arg&& __arg) const { - // This call is unqualified to allow ADL - return __cudax_launch_transform(__stream, _CUDA_VSTD::forward<_Arg>(__arg)); + if constexpr (::cuda::std::_IsValidExpansion<__launch_transform_direct_result_t, _Arg>::value) + { + // This call is unqualified to allow ADL + return __cudax_launch_transform(__stream, _CUDA_VSTD::forward<_Arg>(__arg)); + } + else + { + return _CUDA_VSTD::forward<_Arg>(__arg); + } } }; +template +using __launch_transform_result_t = decltype(__fn{}(::cuda::stream_ref{}, _CUDA_VSTD::declval<_Arg>())); + template struct __as_kernel_arg { - using type = _CUDA_VSTD::decay_t<__launch_transform_direct_result_t<_Arg>>; + using type = _CUDA_VSTD::decay_t<__launch_transform_result_t<_Arg>>; }; template struct __as_kernel_arg< _Arg, - _CUDA_VSTD::void_t>::__as_kernel_arg>> + _CUDA_VSTD::void_t>::__as_kernel_arg>> { - using type = typename _CUDA_VSTD::decay_t<__launch_transform_direct_result_t<_Arg>>::__as_kernel_arg; + using type = typename _CUDA_VSTD::decay_t<__launch_transform_result_t<_Arg>>::__as_kernel_arg; }; _CCCL_GLOBAL_CONSTANT __fn __launch_transform{}; diff --git a/libcudacxx/include/cuda/__memory_resource/cuda_managed_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/cuda_managed_memory_resource.h index 75ba16bd05..a8a42841de 100644 --- 
a/libcudacxx/include/cuda/__memory_resource/cuda_managed_memory_resource.h +++ b/libcudacxx/include/cuda/__memory_resource/cuda_managed_memory_resource.h @@ -80,7 +80,7 @@ class cuda_managed_memory_resource { // We need to ensure that the provided alignment matches the minimal provided alignment _LIBCUDACXX_ASSERT(__is_valid_alignment(__alignment), - "Invalid alignment passed to cuda_memory_resource::deallocate."); + "Invalid alignment passed to cuda_managed_memory_resource::deallocate."); _CCCL_ASSERT_CUDA_API(::cudaFree, "cuda_managed_memory_resource::deallocate failed", __ptr); (void) __alignment; } @@ -102,8 +102,8 @@ class cuda_managed_memory_resource } # endif // _CCCL_STD_VER <= 2017 - //! @brief Equality comparison between a \c cuda_memory_resource and another resource - //! @param __lhs The \c cuda_memory_resource + //! @brief Equality comparison between a \c cuda_managed_memory_resource and another resource + //! @param __lhs The \c cuda_managed_memory_resource //! @param __rhs The resource to compare to //! @return If the underlying types are equality comparable, returns the result of equality comparison of both //! resources. Otherwise, returns false. diff --git a/libcudacxx/include/cuda/__memory_resource/cuda_pinned_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/cuda_pinned_memory_resource.h index ac7fd07b96..f8fc3a25ce 100644 --- a/libcudacxx/include/cuda/__memory_resource/cuda_pinned_memory_resource.h +++ b/libcudacxx/include/cuda/__memory_resource/cuda_pinned_memory_resource.h @@ -82,7 +82,7 @@ class cuda_pinned_memory_resource { // We need to ensure that the provided alignment matches the minimal provided alignment _LIBCUDACXX_ASSERT(__is_valid_alignment(__alignment), - "Invalid alignment passed to cuda_memory_resource::deallocate."); + "Invalid alignment passed to cuda_pinned_memory_resource::deallocate."); _CCCL_ASSERT_CUDA_API(::cudaFreeHost, "cuda_pinned_memory_resource::deallocate failed", __ptr); (void) __alignment; } @@ -104,8 +104,8 @@ class cuda_pinned_memory_resource } # endif // _CCCL_STD_VER <= 2017 - //! @brief Equality comparison between a \c cuda_memory_resource and another resource - //! @param __lhs The \c cuda_memory_resource + //! @brief Equality comparison between a \c cuda_pinned_memory_resource and another resource + //! @param __lhs The \c cuda_pinned_memory_resource //! @param __rhs The resource to compare to //! @return If the underlying types are equality comparable, returns the result of equality comparison of both //! resources. Otherwise, returns false. 
diff --git a/libcudacxx/include/cuda/__memory_resource/resource.h b/libcudacxx/include/cuda/__memory_resource/resource.h index 8328d9809c..0692269b80 100644 --- a/libcudacxx/include/cuda/__memory_resource/resource.h +++ b/libcudacxx/include/cuda/__memory_resource/resource.h @@ -25,6 +25,7 @@ # include # include +# include # include # include # include @@ -99,10 +100,29 @@ template _LIBCUDACXX_CONCEPT async_resource_with = async_resource<_Resource> && _CUDA_VSTD::__all_of...>; +template +struct __different_resource__ +{ + template + static constexpr bool __value(_OtherResource*) noexcept + { + return resource<_OtherResource>; + } +}; + +template <> +struct __different_resource__ +{ + static constexpr bool __value(void*) noexcept + { + return false; + } +}; + template _LIBCUDACXX_CONCEPT __different_resource = - (!_CUDA_VSTD::same_as<_CUDA_VSTD::decay_t<_Resource>, _CUDA_VSTD::decay_t<_OtherResource>>) - && resource<_OtherResource>; + __different_resource__<_CUDA_VSTD::convertible_to<_OtherResource const&, _Resource const&>>::__value( + static_cast<_OtherResource*>(nullptr)); _LIBCUDACXX_END_NAMESPACE_CUDA_MR diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_managed_memory_resource/equality.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_managed_memory_resource/equality.pass.cpp index 80fb2ab57e..f2e14578f7 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_managed_memory_resource/equality.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_managed_memory_resource/equality.pass.cpp @@ -56,6 +56,13 @@ struct async_resource : public resource static_assert(cuda::mr::async_resource>, ""); static_assert(cuda::mr::async_resource>, ""); +// test for cccl#2214: https://github.com/NVIDIA/cccl/issues/2214 +struct derived_managed_resource : cuda::mr::cuda_managed_memory_resource +{ + using cuda::mr::cuda_managed_memory_resource::cuda_managed_memory_resource; +}; +static_assert(cuda::mr::resource, ""); + void test() { cuda::mr::cuda_managed_memory_resource first{}; diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/equality.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/equality.pass.cpp index 94d659f90f..50fd7476ba 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/equality.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/equality.pass.cpp @@ -66,6 +66,13 @@ static_assert(cuda::mr::async_resource static_assert(cuda::mr::async_resource_with, cuda::mr::device_accessible>, ""); +// test for cccl#2214: https://github.com/NVIDIA/cccl/issues/2214 +struct derived_resource : cuda::mr::cuda_memory_resource +{ + using cuda::mr::cuda_memory_resource::cuda_memory_resource; +}; +static_assert(cuda::mr::resource, ""); + // Ensure that we can only void test() diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_pinned_memory_resource/equality.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_pinned_memory_resource/equality.pass.cpp index 7cab309a33..dd480cc9f7 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_pinned_memory_resource/equality.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_pinned_memory_resource/equality.pass.cpp @@ -56,6 +56,13 @@ struct async_resource : public resource static_assert(cuda::mr::async_resource>, ""); static_assert(cuda::mr::async_resource>, ""); +// test for cccl#2214: https://github.com/NVIDIA/cccl/issues/2214 +struct 
derived_pinned_resource : cuda::mr::cuda_pinned_memory_resource +{ + using cuda::mr::cuda_pinned_memory_resource::cuda_pinned_memory_resource; +}; +static_assert(cuda::mr::resource, ""); + void test() { cuda::mr::cuda_pinned_memory_resource first{}; From aaf134000d9d368b134dc27f1e881ff36a694fa1 Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Tue, 13 Aug 2024 03:41:26 -0700 Subject: [PATCH 17/33] fix `cuda_memory_resource` test for properly aligned memory (#2227) --- .../memory_resource/cuda_memory_resource/allocate.pass.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/allocate.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/allocate.pass.cpp index 2c88483e6c..073de36074 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/allocate.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/allocate.pass.cpp @@ -47,7 +47,8 @@ void test() ensure_device_ptr(ptr); // also check the alignment - const auto alignment = reinterpret_cast(ptr); + const auto address = reinterpret_cast(ptr); + const auto alignment = address & (~address + 1ULL); assert(alignment >= desired_alignment); res.deallocate(ptr, 42, desired_alignment); } From 098fb29af4555e601fe703234dddadcbb52b0713 Mon Sep 17 00:00:00 2001 From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com> Date: Tue, 13 Aug 2024 09:33:10 -0700 Subject: [PATCH 18/33] Fix including `` when bad CUDA bfloat/half macros are used. (#2226) * Add test for bad macros being defined * Fix failing upon inclusion when bad macros are defined * Rather use explicit specializations and some evil hackery to get the complex interop to work * Fix typos * Inline everything * Move workarounds together * Use conversion functions instead of explicit specializations * Drop unneeded conversions --------- Co-authored-by: Michael Schellenberger Costa --- .../include/cuda/std/__complex/nvbf16.h | 87 ++++++++++++++++--- .../include/cuda/std/__complex/nvfp16.h | 87 ++++++++++++++++--- .../half_bfloat/complex.bad_macros.pass.cpp | 51 +++++++++++ 3 files changed, 203 insertions(+), 22 deletions(-) create mode 100644 libcudacxx/test/libcudacxx/cuda/complex/half_bfloat/complex.bad_macros.pass.cpp diff --git a/libcudacxx/include/cuda/std/__complex/nvbf16.h b/libcudacxx/include/cuda/std/__complex/nvbf16.h index d90c30e221..612ebba335 100644 --- a/libcudacxx/include/cuda/std/__complex/nvbf16.h +++ b/libcudacxx/include/cuda/std/__complex/nvbf16.h @@ -63,6 +63,39 @@ struct __libcpp_complex_overload_traits<__nv_bfloat16, false, false> typedef complex<__nv_bfloat16> _ComplexType; }; +// This is a workaround against the user defining macros __CUDA_NO_BFLOAT16_CONVERSIONS__ __CUDA_NO_BFLOAT16_OPERATORS__ +template <> +struct __complex_can_implicitly_construct<__nv_bfloat16, float> : true_type +{}; + +template <> +struct __complex_can_implicitly_construct<__nv_bfloat16, double> : true_type +{}; + +template <> +struct __complex_can_implicitly_construct : true_type +{}; + +template <> +struct __complex_can_implicitly_construct : true_type +{}; + +template +inline _LIBCUDACXX_INLINE_VISIBILITY __nv_bfloat16 __convert_to_bfloat16(const _Tp& __value) noexcept +{ + return __value; +} + +inline _LIBCUDACXX_INLINE_VISIBILITY __nv_bfloat16 __convert_to_bfloat16(const float& __value) noexcept +{ + return __float2bfloat16(__value); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY __nv_bfloat16 __convert_to_bfloat16(const double& 
__value) noexcept +{ + return __double2bfloat16(__value); +} + template <> class _LIBCUDACXX_TEMPLATE_VIS _CCCL_ALIGNAS(alignof(__nv_bfloat162)) complex<__nv_bfloat16> { @@ -80,14 +113,14 @@ class _LIBCUDACXX_TEMPLATE_VIS _CCCL_ALIGNAS(alignof(__nv_bfloat162)) complex<__ template ::value, int> = 0> _LIBCUDACXX_INLINE_VISIBILITY complex(const complex<_Up>& __c) - : __repr_(static_cast(__c.real()), static_cast(__c.imag())) + : __repr_(__convert_to_bfloat16(__c.real()), __convert_to_bfloat16(__c.imag())) {} template ::value, int> = 0, __enable_if_t<_CCCL_TRAIT(is_constructible, value_type, _Up), int> = 0> _LIBCUDACXX_INLINE_VISIBILITY explicit complex(const complex<_Up>& __c) - : __repr_(static_cast(__c.real()), static_cast(__c.imag())) + : __repr_(__convert_to_bfloat16(__c.real()), __convert_to_bfloat16(__c.imag())) {} _LIBCUDACXX_INLINE_VISIBILITY complex& operator=(const value_type& __re) @@ -100,8 +133,8 @@ class _LIBCUDACXX_TEMPLATE_VIS _CCCL_ALIGNAS(alignof(__nv_bfloat162)) complex<__ template _LIBCUDACXX_INLINE_VISIBILITY complex& operator=(const complex<_Up>& __c) { - __repr_.x = __c.real(); - __repr_.y = __c.imag(); + __repr_.x = __convert_to_bfloat16(__c.real()); + __repr_.y = __convert_to_bfloat16(__c.imag()); return *this; } @@ -155,24 +188,24 @@ class _LIBCUDACXX_TEMPLATE_VIS _CCCL_ALIGNAS(alignof(__nv_bfloat162)) complex<__ _LIBCUDACXX_INLINE_VISIBILITY complex& operator+=(const value_type& __re) { - __repr_.x += __re; + __repr_.x = __hadd(__repr_.x, __re); return *this; } _LIBCUDACXX_INLINE_VISIBILITY complex& operator-=(const value_type& __re) { - __repr_.x -= __re; + __repr_.x = __hsub(__repr_.x, __re); return *this; } _LIBCUDACXX_INLINE_VISIBILITY complex& operator*=(const value_type& __re) { - __repr_.x *= __re; - __repr_.y *= __re; + __repr_.x = __hmul(__repr_.x, __re); + __repr_.y = __hmul(__repr_.y, __re); return *this; } _LIBCUDACXX_INLINE_VISIBILITY complex& operator/=(const value_type& __re) { - __repr_.x /= __re; - __repr_.y /= __re; + __repr_.x = __hdiv(__repr_.x, __re); + __repr_.y = __hdiv(__repr_.y, __re); return *this; } @@ -195,9 +228,41 @@ class _LIBCUDACXX_TEMPLATE_VIS _CCCL_ALIGNAS(alignof(__nv_bfloat162)) complex<__ } }; +template <> // complex +template <> // complex<__half> +inline _LIBCUDACXX_INLINE_VISIBILITY complex::complex(const complex<__nv_bfloat16>& __c) + : __re_(__bfloat162float(__c.real())) + , __im_(__bfloat162float(__c.imag())) +{} + +template <> // complex +template <> // complex<__half> +inline _LIBCUDACXX_INLINE_VISIBILITY complex::complex(const complex<__nv_bfloat16>& __c) + : __re_(__bfloat162float(__c.real())) + , __im_(__bfloat162float(__c.imag())) +{} + +template <> // complex +template <> // complex<__nv_bfloat16> +inline _LIBCUDACXX_INLINE_VISIBILITY complex& complex::operator=(const complex<__nv_bfloat16>& __c) +{ + __re_ = __bfloat162float(__c.real()); + __im_ = __bfloat162float(__c.imag()); + return *this; +} + +template <> // complex +template <> // complex<__nv_bfloat16> +inline _LIBCUDACXX_INLINE_VISIBILITY complex& complex::operator=(const complex<__nv_bfloat16>& __c) +{ + __re_ = __bfloat162float(__c.real()); + __im_ = __bfloat162float(__c.imag()); + return *this; +} + inline _LIBCUDACXX_INLINE_VISIBILITY __nv_bfloat16 arg(__nv_bfloat16 __re) { - return _CUDA_VSTD::atan2f(__nv_bfloat16(0), __re); + return _CUDA_VSTD::atan2(__int2bfloat16_rn(0), __re); } // We have performance issues with some trigonometric functions with __nv_bfloat16 diff --git a/libcudacxx/include/cuda/std/__complex/nvfp16.h 
b/libcudacxx/include/cuda/std/__complex/nvfp16.h index 7bd0ea0277..b3154a4b23 100644 --- a/libcudacxx/include/cuda/std/__complex/nvfp16.h +++ b/libcudacxx/include/cuda/std/__complex/nvfp16.h @@ -60,6 +60,39 @@ struct __libcpp_complex_overload_traits<__half, false, false> typedef complex<__half> _ComplexType; }; +// This is a workaround against the user defining macros __CUDA_NO_HALF_CONVERSIONS__ __CUDA_NO_HALF_OPERATORS__ +template <> +struct __complex_can_implicitly_construct<__half, float> : true_type +{}; + +template <> +struct __complex_can_implicitly_construct<__half, double> : true_type +{}; + +template <> +struct __complex_can_implicitly_construct : true_type +{}; + +template <> +struct __complex_can_implicitly_construct : true_type +{}; + +template +inline _LIBCUDACXX_INLINE_VISIBILITY __half __convert_to_half(const _Tp& __value) noexcept +{ + return __value; +} + +inline _LIBCUDACXX_INLINE_VISIBILITY __half __convert_to_half(const float& __value) noexcept +{ + return __float2half(__value); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY __half __convert_to_half(const double& __value) noexcept +{ + return __double2half(__value); +} + template <> class _LIBCUDACXX_TEMPLATE_VIS _CCCL_ALIGNAS(alignof(__half2)) complex<__half> { @@ -77,14 +110,14 @@ class _LIBCUDACXX_TEMPLATE_VIS _CCCL_ALIGNAS(alignof(__half2)) complex<__half> template ::value, int> = 0> _LIBCUDACXX_INLINE_VISIBILITY complex(const complex<_Up>& __c) - : __repr_(static_cast(__c.real()), static_cast(__c.imag())) + : __repr_(__convert_to_half(__c.real()), __convert_to_half(__c.imag())) {} template ::value, int> = 0, __enable_if_t<_CCCL_TRAIT(is_constructible, value_type, _Up), int> = 0> _LIBCUDACXX_INLINE_VISIBILITY explicit complex(const complex<_Up>& __c) - : __repr_(static_cast(__c.real()), static_cast(__c.imag())) + : __repr_(__convert_to_half(__c.real()), __convert_to_half(__c.imag())) {} _LIBCUDACXX_INLINE_VISIBILITY complex& operator=(const value_type& __re) @@ -97,8 +130,8 @@ class _LIBCUDACXX_TEMPLATE_VIS _CCCL_ALIGNAS(alignof(__half2)) complex<__half> template _LIBCUDACXX_INLINE_VISIBILITY complex& operator=(const complex<_Up>& __c) { - __repr_.x = __c.real(); - __repr_.y = __c.imag(); + __repr_.x = __convert_to_half(__c.real()); + __repr_.y = __convert_to_half(__c.imag()); return *this; } @@ -152,24 +185,24 @@ class _LIBCUDACXX_TEMPLATE_VIS _CCCL_ALIGNAS(alignof(__half2)) complex<__half> _LIBCUDACXX_INLINE_VISIBILITY complex& operator+=(const value_type& __re) { - __repr_.x += __re; + __repr_.x = __hadd(__repr_.x, __re); return *this; } _LIBCUDACXX_INLINE_VISIBILITY complex& operator-=(const value_type& __re) { - __repr_.x -= __re; + __repr_.x = __hsub(__repr_.x, __re); return *this; } _LIBCUDACXX_INLINE_VISIBILITY complex& operator*=(const value_type& __re) { - __repr_.x *= __re; - __repr_.y *= __re; + __repr_.x = __hmul(__repr_.x, __re); + __repr_.y = __hmul(__repr_.y, __re); return *this; } _LIBCUDACXX_INLINE_VISIBILITY complex& operator/=(const value_type& __re) { - __repr_.x /= __re; - __repr_.y /= __re; + __repr_.x = __hdiv(__repr_.x, __re); + __repr_.y = __hdiv(__repr_.y, __re); return *this; } @@ -192,9 +225,41 @@ class _LIBCUDACXX_TEMPLATE_VIS _CCCL_ALIGNAS(alignof(__half2)) complex<__half> } }; +template <> // complex +template <> // complex<__half> +inline _LIBCUDACXX_INLINE_VISIBILITY complex::complex(const complex<__half>& __c) + : __re_(__half2float(__c.real())) + , __im_(__half2float(__c.imag())) +{} + +template <> // complex +template <> // complex<__half> +inline _LIBCUDACXX_INLINE_VISIBILITY 
complex::complex(const complex<__half>& __c) + : __re_(__half2float(__c.real())) + , __im_(__half2float(__c.imag())) +{} + +template <> // complex +template <> // complex<__half> +inline _LIBCUDACXX_INLINE_VISIBILITY complex& complex::operator=(const complex<__half>& __c) +{ + __re_ = __half2float(__c.real()); + __im_ = __half2float(__c.imag()); + return *this; +} + +template <> // complex +template <> // complex<__half> +inline _LIBCUDACXX_INLINE_VISIBILITY complex& complex::operator=(const complex<__half>& __c) +{ + __re_ = __half2float(__c.real()); + __im_ = __half2float(__c.imag()); + return *this; +} + inline _LIBCUDACXX_INLINE_VISIBILITY __half arg(__half __re) { - return _CUDA_VSTD::atan2f(__half(0), __re); + return _CUDA_VSTD::atan2(__int2half_rn(0), __re); } // We have performance issues with some trigonometric functions with __half diff --git a/libcudacxx/test/libcudacxx/cuda/complex/half_bfloat/complex.bad_macros.pass.cpp b/libcudacxx/test/libcudacxx/cuda/complex/half_bfloat/complex.bad_macros.pass.cpp new file mode 100644 index 0000000000..0bd9da2fad --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/complex/half_bfloat/complex.bad_macros.pass.cpp @@ -0,0 +1,51 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#define __CUDA_NO_HALF_CONVERSIONS__ 1 +#define __CUDA_NO_HALF_OPERATORS__ 1 +#define __CUDA_NO_BFLOAT16_CONVERSIONS__ 1 +#define __CUDA_NO_BFLOAT16_OPERATORS__ 1 +#define __CUDA_NO_HALF2_OPERATORS__ 1 +#define __CUDA_NO_BFLOAT162_OPERATORS__ 1 + +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ void test_assignment(cuda::std::complex v = {}) +{ + cuda::std::complex converting(v); + + cuda::std::complex assigning{}; + assigning = v; +} + +__host__ __device__ void test() +{ +#ifdef _LIBCUDACXX_HAS_NVFP16 + test_assignment<__half, float>(); + test_assignment<__half, double>(); + test_assignment(); + test_assignment(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + test_assignment<__nv_bfloat16, float>(); + test_assignment<__nv_bfloat16, double>(); + test_assignment(); + test_assignment(); +#endif // _LIBCUDACXX_HAS_NVBF16 +} + +int main(int arg, char** argv) +{ + test(); + return 0; +} From d7c83fe654dd0e879b043c046ebe451614eb44eb Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 14 Aug 2024 00:49:20 +0800 Subject: [PATCH 19/33] add license & fix long_description (#2211) --- python/cuda/README.md | 8 ++++++++ python/cuda/setup.py | 10 +++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/python/cuda/README.md b/python/cuda/README.md index 58e2f908bb..e57f06e6b4 100644 --- a/python/cuda/README.md +++ b/python/cuda/README.md @@ -1,3 +1,11 @@ +# `cuda.cooperative`: Experimental CUDA Core Compute Library for Python + +## Documentation + +Please visit the documentation here: https://nvidia.github.io/cccl/python.html. 
+ +## Local development + ```bash pip3 install -e .[test] pytest -v ./tests/device/ diff --git a/python/cuda/setup.py b/python/cuda/setup.py index e76d585162..f7eff80bda 100644 --- a/python/cuda/setup.py +++ b/python/cuda/setup.py @@ -25,6 +25,10 @@ del __version__ +with open("README.md") as f: + long_description = f.read() + + class CustomBuildCommand(build_py): def run(self): self.run_command('package_cccl') @@ -62,6 +66,8 @@ def run(self): name="cuda-cooperative", version=ver, description="Experimental Core Library for CUDA Python", + long_description=long_description, + long_description_content_type="text/markdown", author="NVIDIA Corporation", classifiers=[ "Programming Language :: Python :: 3 :: Only", @@ -85,5 +91,7 @@ def run(self): 'build_py': CustomBuildCommand, 'bdist_wheel': CustomWheelBuild, }, - include_package_data=True + include_package_data=True, + license="Apache-2.0 with LLVM exception", + license_files = ('../../LICENSE',), ) From 64d28d1c3caaf22bd5a044db9317b2c8c6c70d7a Mon Sep 17 00:00:00 2001 From: Georgii Evtushenko Date: Tue, 13 Aug 2024 17:02:57 -0700 Subject: [PATCH 20/33] Extract reduction kernels into NVRTC-compilable header (#2231) --- cub/cub/agent/agent_reduce.cuh | 8 +- cub/cub/device/dispatch/dispatch_reduce.cuh | 228 +---------------- cub/cub/device/dispatch/kernels/reduce.cuh | 268 ++++++++++++++++++++ cub/test/catch2_test_nvrtc.cu | 1 + 4 files changed, 273 insertions(+), 232 deletions(-) create mode 100644 cub/cub/device/dispatch/kernels/reduce.cuh diff --git a/cub/cub/agent/agent_reduce.cuh b/cub/cub/agent/agent_reduce.cuh index 3492bd5f41..94b90774e5 100644 --- a/cub/cub/agent/agent_reduce.cuh +++ b/cub/cub/agent/agent_reduce.cuh @@ -53,8 +53,6 @@ #include -#include - _CCCL_SUPPRESS_DEPRECATED_PUSH #include _CCCL_SUPPRESS_DEPRECATED_POP @@ -147,7 +145,7 @@ struct AgentReduce // Wrap the native input pointer with CacheModifiedInputIterator // or directly use the supplied input iterator type using WrappedInputIteratorT = - ::cuda::std::_If::value, + ::cuda::std::_If<::cuda::std::is_pointer::value, CacheModifiedInputIterator, InputIteratorT>; @@ -160,8 +158,8 @@ struct AgentReduce // Can vectorize according to the policy if the input iterator is a native // pointer to a primitive type static constexpr bool ATTEMPT_VECTORIZATION = - (VECTOR_LOAD_LENGTH > 1) && (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) && (std::is_pointer::value) - && Traits::PRIMITIVE; + (VECTOR_LOAD_LENGTH > 1) && (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) + && (::cuda::std::is_pointer::value) && Traits::PRIMITIVE; static constexpr CacheLoadModifier LOAD_MODIFIER = AgentReducePolicy::LOAD_MODIFIER; diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index 3b3c0c903e..e3e3844a3f 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -45,6 +45,7 @@ #endif // no system header #include +#include #include #include #include @@ -66,233 +67,6 @@ _CCCL_SUPPRESS_DEPRECATED_POP CUB_NAMESPACE_BEGIN -namespace detail -{ -namespace reduce -{ - -/** - * All cub::DeviceReduce::* algorithms are using the same implementation. Some of them, however, - * should use initial value only for empty problems. If this struct is used as initial value with - * one of the `DeviceReduce` algorithms, the `init` value wrapped by this struct will only be used - * for empty problems; it will not be incorporated into the aggregate of non-empty problems. 
- */ -template -struct empty_problem_init_t -{ - T init; - - _CCCL_HOST_DEVICE operator T() const - { - return init; - } -}; - -/** - * @brief Applies initial value to the block aggregate and stores the result to the output iterator. - * - * @param d_out Iterator to the output aggregate - * @param reduction_op Binary reduction functor - * @param init Initial value - * @param block_aggregate Aggregate value computed by the block - */ -template -_CCCL_HOST_DEVICE void -finalize_and_store_aggregate(OutputIteratorT d_out, ReductionOpT reduction_op, InitT init, AccumT block_aggregate) -{ - *d_out = reduction_op(init, block_aggregate); -} - -/** - * @brief Ignores initial value and stores the block aggregate to the output iterator. - * - * @param d_out Iterator to the output aggregate - * @param block_aggregate Aggregate value computed by the block - */ -template -_CCCL_HOST_DEVICE void -finalize_and_store_aggregate(OutputIteratorT d_out, ReductionOpT, empty_problem_init_t, AccumT block_aggregate) -{ - *d_out = block_aggregate; -} -} // namespace reduce -} // namespace detail - -/****************************************************************************** - * Kernel entry points - *****************************************************************************/ - -/** - * @brief Reduce region kernel entry point (multi-block). Computes privatized - * reductions, one per thread block. - * - * @tparam ChainedPolicyT - * Chained tuning policy - * - * @tparam InputIteratorT - * Random-access input iterator type for reading input items @iterator - * - * @tparam OffsetT - * Signed integer type for global offsets - * - * @tparam ReductionOpT - * Binary reduction functor type having member - * `auto operator()(const T &a, const U &b)` - * - * @tparam InitT - * Initial value type - * - * @tparam AccumT - * Accumulator type - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output aggregate - * - * @param[in] num_items - * Total number of input data items - * - * @param[in] even_share - * Even-share descriptor for mapping an equal number of tiles onto each - * thread block - * - * @param[in] reduction_op - * Binary reduction functor - */ -template -CUB_DETAIL_KERNEL_ATTRIBUTES -__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) void DeviceReduceKernel( - InputIteratorT d_in, - AccumT* d_out, - OffsetT num_items, - GridEvenShare even_share, - ReductionOpT reduction_op, - TransformOpT transform_op) -{ - // Thread block type for reducing input tiles - using AgentReduceT = - AgentReduce; - - // Shared memory storage - __shared__ typename AgentReduceT::TempStorage temp_storage; - - // Consume input tiles - AccumT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op, transform_op).ConsumeTiles(even_share); - - // Output result - if (threadIdx.x == 0) - { - detail::uninitialized_copy_single(d_out + blockIdx.x, block_aggregate); - } -} - -/** - * @brief Reduce a single tile kernel entry point (single-block). Can be used - * to aggregate privatized thread block reductions from a previous - * multi-block reduction pass. 
- * - * @tparam ChainedPolicyT - * Chained tuning policy - * - * @tparam InputIteratorT - * Random-access input iterator type for reading input items @iterator - * - * @tparam OutputIteratorT - * Output iterator type for recording the reduced aggregate @iterator - * - * @tparam OffsetT - * Signed integer type for global offsets - * - * @tparam ReductionOpT - * Binary reduction functor type having member - * `T operator()(const T &a, const U &b)` - * - * @tparam InitT - * Initial value type - * - * @tparam AccumT - * Accumulator type - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output aggregate - * - * @param[in] num_items - * Total number of input data items - * - * @param[in] reduction_op - * Binary reduction functor - * - * @param[in] init - * The initial value of the reduction - */ -template -CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__( - int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), - 1) void DeviceReduceSingleTileKernel(InputIteratorT d_in, - OutputIteratorT d_out, - OffsetT num_items, - ReductionOpT reduction_op, - InitT init, - TransformOpT transform_op) -{ - // Thread block type for reducing input tiles - using AgentReduceT = - AgentReduce; - - // Shared memory storage - __shared__ typename AgentReduceT::TempStorage temp_storage; - - // Check if empty problem - if (num_items == 0) - { - if (threadIdx.x == 0) - { - *d_out = init; - } - - return; - } - - // Consume input tiles - AccumT block_aggregate = - AgentReduceT(temp_storage, d_in, reduction_op, transform_op).ConsumeRange(OffsetT(0), num_items); - - // Output result - if (threadIdx.x == 0) - { - detail::reduce::finalize_and_store_aggregate(d_out, reduction_op, init, block_aggregate); - } -} - /// Normalize input iterator to segment offset template _CCCL_DEVICE _CCCL_FORCEINLINE void NormalizeReductionOutput(T& /*val*/, OffsetT /*base_offset*/, IteratorT /*itr*/) diff --git a/cub/cub/device/dispatch/kernels/reduce.cuh b/cub/cub/device/dispatch/kernels/reduce.cuh new file mode 100644 index 0000000000..174b262c39 --- /dev/null +++ b/cub/cub/device/dispatch/kernels/reduce.cuh @@ -0,0 +1,268 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +CUB_NAMESPACE_BEGIN + +namespace detail +{ +namespace reduce +{ + +/** + * All cub::DeviceReduce::* algorithms are using the same implementation. Some of them, however, + * should use initial value only for empty problems. If this struct is used as initial value with + * one of the `DeviceReduce` algorithms, the `init` value wrapped by this struct will only be used + * for empty problems; it will not be incorporated into the aggregate of non-empty problems. + */ +template +struct empty_problem_init_t +{ + T init; + + _CCCL_HOST_DEVICE operator T() const + { + return init; + } +}; + +/** + * @brief Applies initial value to the block aggregate and stores the result to the output iterator. + * + * @param d_out Iterator to the output aggregate + * @param reduction_op Binary reduction functor + * @param init Initial value + * @param block_aggregate Aggregate value computed by the block + */ +template +_CCCL_HOST_DEVICE void +finalize_and_store_aggregate(OutputIteratorT d_out, ReductionOpT reduction_op, InitT init, AccumT block_aggregate) +{ + *d_out = reduction_op(init, block_aggregate); +} + +/** + * @brief Ignores initial value and stores the block aggregate to the output iterator. + * + * @param d_out Iterator to the output aggregate + * @param block_aggregate Aggregate value computed by the block + */ +template +_CCCL_HOST_DEVICE void +finalize_and_store_aggregate(OutputIteratorT d_out, ReductionOpT, empty_problem_init_t, AccumT block_aggregate) +{ + *d_out = block_aggregate; +} +} // namespace reduce +} // namespace detail + +/** + * @brief Reduce region kernel entry point (multi-block). Computes privatized + * reductions, one per thread block. 
+ * + * @tparam ChainedPolicyT + * Chained tuning policy + * + * @tparam InputIteratorT + * Random-access input iterator type for reading input items @iterator + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @tparam ReductionOpT + * Binary reduction functor type having member + * `auto operator()(const T &a, const U &b)` + * + * @tparam InitT + * Initial value type + * + * @tparam AccumT + * Accumulator type + * + * @param[in] d_in + * Pointer to the input sequence of data items + * + * @param[out] d_out + * Pointer to the output aggregate + * + * @param[in] num_items + * Total number of input data items + * + * @param[in] even_share + * Even-share descriptor for mapping an equal number of tiles onto each + * thread block + * + * @param[in] reduction_op + * Binary reduction functor + */ +template +CUB_DETAIL_KERNEL_ATTRIBUTES +__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) void DeviceReduceKernel( + InputIteratorT d_in, + AccumT* d_out, + OffsetT num_items, + GridEvenShare even_share, + ReductionOpT reduction_op, + TransformOpT transform_op) +{ + // Thread block type for reducing input tiles + using AgentReduceT = + AgentReduce; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + // Consume input tiles + AccumT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op, transform_op).ConsumeTiles(even_share); + + // Output result + if (threadIdx.x == 0) + { + detail::uninitialized_copy_single(d_out + blockIdx.x, block_aggregate); + } +} + +/** + * @brief Reduce a single tile kernel entry point (single-block). Can be used + * to aggregate privatized thread block reductions from a previous + * multi-block reduction pass. + * + * @tparam ChainedPolicyT + * Chained tuning policy + * + * @tparam InputIteratorT + * Random-access input iterator type for reading input items @iterator + * + * @tparam OutputIteratorT + * Output iterator type for recording the reduced aggregate @iterator + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @tparam ReductionOpT + * Binary reduction functor type having member + * `T operator()(const T &a, const U &b)` + * + * @tparam InitT + * Initial value type + * + * @tparam AccumT + * Accumulator type + * + * @param[in] d_in + * Pointer to the input sequence of data items + * + * @param[out] d_out + * Pointer to the output aggregate + * + * @param[in] num_items + * Total number of input data items + * + * @param[in] reduction_op + * Binary reduction functor + * + * @param[in] init + * The initial value of the reduction + */ +template +CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__( + int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), + 1) void DeviceReduceSingleTileKernel(InputIteratorT d_in, + OutputIteratorT d_out, + OffsetT num_items, + ReductionOpT reduction_op, + InitT init, + TransformOpT transform_op) +{ + // Thread block type for reducing input tiles + using AgentReduceT = + AgentReduce; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + // Check if empty problem + if (num_items == 0) + { + if (threadIdx.x == 0) + { + *d_out = init; + } + + return; + } + + // Consume input tiles + AccumT block_aggregate = + AgentReduceT(temp_storage, d_in, reduction_op, transform_op).ConsumeRange(OffsetT(0), num_items); + + // Output result + if (threadIdx.x == 0) + { + detail::reduce::finalize_and_store_aggregate(d_out, reduction_op, init, block_aggregate); + } +} + 
+CUB_NAMESPACE_END diff --git a/cub/test/catch2_test_nvrtc.cu b/cub/test/catch2_test_nvrtc.cu index 0e1b232ff6..466c3fa978 100644 --- a/cub/test/catch2_test_nvrtc.cu +++ b/cub/test/catch2_test_nvrtc.cu @@ -54,6 +54,7 @@ TEST_CASE("Test nvrtc", "[test][nvrtc]") "#include \n" "#include \n" "#include \n" + "#include \n" " \n" "extern \"C\" __global__ void kernel(int *ptr, int *errors) \n" "{ \n" From 6213a5e68a7158799834e70cf4865842107d4e5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Dominiak?= Date: Wed, 14 Aug 2024 09:45:14 +0200 Subject: [PATCH 21/33] Implement `` (#1496) --- .../standard_api/utility_library.rst | 4 + .../standard_api/utility_library/bitset.rst | 14 + .../include/cuda/std/__algorithm/copy.h | 52 +- .../include/cuda/std/__algorithm/copy_n.h | 2 +- .../__bit_reference => __bit/reference.h} | 863 ++++++------ libcudacxx/include/cuda/std/__cccl/compiler.h | 3 + .../include/cuda/std/__cccl/diagnostic.h | 20 +- libcudacxx/include/cuda/std/bitset | 1071 ++++++++++++++ .../cuda/std/detail/libcxx/include/__string | 1246 +++++++++++++++++ .../cuda/std/detail/libcxx/include/bitset | 1027 -------------- .../cuda/std/detail/libcxx/include/cstddef | 2 +- .../bitset.cons/char_ptr_ctor.pass.cpp | 159 +++ .../bitset.cons/default.pass.cpp | 64 + .../bitset.cons/string_ctor.pass.cpp | 196 +++ .../bitset.cons/string_view_ctor.pass.cpp | 201 +++ .../bitset.cons/ull_ctor.pass.cpp | 72 + .../bitset.members/all.pass.cpp | 55 + .../bitset.members/any.pass.cpp | 58 + .../bitset.members/count.pass.cpp | 69 + .../bitset.members/flip_all.pass.cpp | 65 + .../flip_one.out_of_range.pass.cpp | 52 + .../bitset.members/flip_one.pass.cpp | 65 + .../bitset.members/index.pass.cpp | 81 ++ .../bitset.members/index_const.pass.cpp | 72 + .../bitset.members/left_shift.pass.cpp | 71 + .../bitset.members/left_shift_eq.pass.cpp | 89 ++ .../bitset.members/none.pass.cpp | 58 + .../bitset.members/not_all.pass.cpp | 64 + .../bitset.members/op_and_eq.pass.cpp | 71 + .../bitset.members/op_eq_eq.pass.cpp | 62 + .../bitset.members/op_or_eq.pass.cpp | 78 ++ .../bitset.members/op_xor_eq.pass.cpp | 77 + .../bitset.members/reset_all.pass.cpp | 59 + .../reset_one.out_of_range.pass.cpp | 52 + .../bitset.members/reset_one.pass.cpp | 75 + .../bitset.members/right_shift.pass.cpp | 69 + .../bitset.members/right_shift_eq.pass.cpp | 91 ++ .../bitset.members/set_all.pass.cpp | 58 + .../set_one.out_of_range.pass.cpp | 52 + .../bitset.members/set_one.pass.cpp | 60 + .../bitset.members/size.pass.cpp | 46 + .../bitset.members/test.out_of_range.pass.cpp | 52 + .../bitset.members/test.pass.cpp | 60 + .../bitset.members/to_string.pass.cpp | 185 +++ .../bitset.members/to_ullong.pass.cpp | 75 + .../bitset.members/to_ulong.pass.cpp | 74 + .../bitset.operators/op_and.pass.cpp | 60 + .../bitset.operators/op_not.pass.cpp | 60 + .../bitset.operators/op_or.pass.cpp | 60 + .../bitset.operators/stream_in.pass.cpp | 100 ++ .../bitset.operators/stream_out.pass.cpp | 42 + .../template.bitset/bitset_test_cases.h | 163 +++ .../template.bitset/includes.pass.cpp | 35 + libcudacxx/test/support/test_macros.h | 1 + 54 files changed, 6112 insertions(+), 1500 deletions(-) create mode 100644 docs/libcudacxx/standard_api/utility_library/bitset.rst rename libcudacxx/include/cuda/std/{detail/libcxx/include/__bit_reference => __bit/reference.h} (56%) create mode 100644 libcudacxx/include/cuda/std/bitset create mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/__string delete mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/bitset create 
mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/char_ptr_ctor.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/default.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/string_ctor.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/string_view_ctor.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/ull_ctor.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/all.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/any.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/count.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/flip_all.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/flip_one.out_of_range.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/flip_one.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/index.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/index_const.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/left_shift.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/left_shift_eq.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/none.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/not_all.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_and_eq.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_eq_eq.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_or_eq.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_xor_eq.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/reset_all.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/reset_one.out_of_range.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/reset_one.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/right_shift.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/right_shift_eq.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/set_all.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/set_one.out_of_range.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/set_one.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/size.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/test.out_of_range.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/test.pass.cpp create mode 100644 
libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_string.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ullong.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ulong.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/op_and.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/op_not.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/op_or.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/stream_in.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/stream_out.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset_test_cases.h create mode 100644 libcudacxx/test/libcudacxx/std/utilities/template.bitset/includes.pass.cpp diff --git a/docs/libcudacxx/standard_api/utility_library.rst b/docs/libcudacxx/standard_api/utility_library.rst index 12582dc146..4df28701a3 100644 --- a/docs/libcudacxx/standard_api/utility_library.rst +++ b/docs/libcudacxx/standard_api/utility_library.rst @@ -7,6 +7,7 @@ Utility Library :hidden: :maxdepth: 1 + utility_library/bitset utility_library/expected utility_library/functional utility_library/optional @@ -26,6 +27,9 @@ the information about the individual features for details. * - Header - Content - Availability + * - :ref:`libcudacxx-standard-api-utility-bitset` + - Fixed-size sequence of bits + - CCCL 2.8.0 * - :ref:`libcudacxx-standard-api-utility-expected` - Optional value with error channel - CCCL 2.3.0 / CUDA 12.4 diff --git a/docs/libcudacxx/standard_api/utility_library/bitset.rst b/docs/libcudacxx/standard_api/utility_library/bitset.rst new file mode 100644 index 0000000000..a621cb01ab --- /dev/null +++ b/docs/libcudacxx/standard_api/utility_library/bitset.rst @@ -0,0 +1,14 @@ +.. _libcudacxx-standard-api-utility-bitset: + +```` +====================== + +Extensions +---------- + +- All features of ```` are made constexpr in C++14 onwards + +Restrictions +------------ + +- On device no exceptions are thrown in case of a bad access. diff --git a/libcudacxx/include/cuda/std/__algorithm/copy.h b/libcudacxx/include/cuda/std/__algorithm/copy.h index 03e10fe98c..883cbc4632 100644 --- a/libcudacxx/include/cuda/std/__algorithm/copy.h +++ b/libcudacxx/include/cuda/std/__algorithm/copy.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -48,6 +49,13 @@ template inline _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool __dispatch_memmove(_Up* __result, _Tp* __first, const size_t __n) { + // This is a pessimisation, but there's no way to do the code path detection correctly before GCC 9.0. + // __builtin_memmove is also illegal in constexpr there, so... just always assume we are constant evaluated, + // and let the optimizer *maybe* recover some of the perf. 
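+  // Returning false here simply routes GCC-pre-9 builds onto the element-wise copy loops in the
+  // caller, which are valid in constexpr; only the memmove fast path is lost.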
+#if defined(_CCCL_COMPILER_GCC) && _GNUC_VER < 900 + return false; +#endif + if (__libcpp_is_constant_evaluated()) { return false; @@ -66,6 +74,35 @@ __dispatch_memmove(_Up* __result, _Tp* __first, const size_t __n) } } +template +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool +__constexpr_tail_overlap_fallback(_Tp* __first, _Up* __needle, _Tp* __last) +{ + while (__first != __last) + { + if (__first == __needle) + { + return true; + } + ++__first; + } + return false; +} + +template +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool +__constexpr_tail_overlap(_Tp* __first, _Up* __needle, _Tp* __last) +{ + _LIBCUDACXX_UNUSED_VAR(__last); +#if __has_builtin(__builtin_constant_p) || defined(_CCCL_COMPILER_GCC) + NV_IF_ELSE_TARGET(NV_IS_HOST, + (return __builtin_constant_p(__first < __needle) && __first < __needle;), + (return __constexpr_tail_overlap_fallback(__first, __needle, __last);)) +#else + return __constexpr_tail_overlap_fallback(__first, __needle, __last); +#endif +} + template 0; --__i) + { + *(__result + __i - 1) = *(__first + __i - 1); + } + } + else { - *(__result + __i) = *(__first + __i); + for (ptrdiff_t __i = 0; __i < __n; ++__i) + { + *(__result + __i) = *(__first + __i); + } } } return {__last, __result + __n}; diff --git a/libcudacxx/include/cuda/std/__algorithm/copy_n.h b/libcudacxx/include/cuda/std/__algorithm/copy_n.h index a6c62e920d..eb9e28873d 100644 --- a/libcudacxx/include/cuda/std/__algorithm/copy_n.h +++ b/libcudacxx/include/cuda/std/__algorithm/copy_n.h @@ -55,7 +55,7 @@ template ::value, int> = 0> -inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX20 _OutputIterator +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 _OutputIterator copy_n(_InputIterator __first, _Size __orig_n, _OutputIterator __result) { using _IntegralSize = decltype(__convert_to_integral(__orig_n)); diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__bit_reference b/libcudacxx/include/cuda/std/__bit/reference.h similarity index 56% rename from libcudacxx/include/cuda/std/detail/libcxx/include/__bit_reference rename to libcudacxx/include/cuda/std/__bit/reference.h index 9c64111069..29482f7e25 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__bit_reference +++ b/libcudacxx/include/cuda/std/__bit/reference.h @@ -10,9 +10,7 @@ #ifndef _LIBCUDACXX___BIT_REFERENCE #define _LIBCUDACXX___BIT_REFERENCE -##include -#include -#include +#include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header @@ -22,27 +20,36 @@ # pragma system_header #endif // no system header - _LIBCUDACXX_PUSH_MACROS -#include <__undef_macros> +#include +#include +#include +// TODO: modularize bit a bit +#include +// #include +// #include +// #include +#include +#include +#include +#include +#include +#include + +_CCCL_PUSH_MACROS + +_LIBCUDACXX_BEGIN_NAMESPACE_STD - _LIBCUDACXX_BEGIN_NAMESPACE_STD - -template -class __bit_iterator; template class __bit_const_reference; -template -struct __has_storage_type -{ - static const bool value = false; -}; +template +class __bit_iterator; -template ::value> +template class __bit_reference { - typedef typename _Cp::__storage_type __storage_type; - typedef typename _Cp::__storage_pointer __storage_pointer; + using __storage_type = typename _Cp::__storage_type; + using __storage_pointer = typename _Cp::__storage_pointer; __storage_pointer __seg_; __storage_type __mask_; @@ -53,18 +60,21 @@ class 
__bit_reference friend class __bit_iterator<_Cp, false>; public: - _LIBCUDACXX_INLINE_VISIBILITY __bit_reference(const __bit_reference&) = default; + using __container = typename _Cp::__self; + + _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __bit_reference(const __bit_reference&) = default; - _LIBCUDACXX_INLINE_VISIBILITY operator bool() const noexcept + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 operator bool() const noexcept { return static_cast(*__seg_ & __mask_); } - _LIBCUDACXX_INLINE_VISIBILITY bool operator~() const noexcept + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool operator~() const noexcept { return !static_cast(*this); } - _LIBCUDACXX_INLINE_VISIBILITY __bit_reference& operator=(bool __x) noexcept + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_reference& + operator=(bool __x) noexcept { if (__x) { @@ -77,68 +87,84 @@ class __bit_reference return *this; } - _LIBCUDACXX_INLINE_VISIBILITY __bit_reference& operator=(const __bit_reference& __x) noexcept +#if _CCCL_STD_VER >= 2023 + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr const __bit_reference& + operator=(bool __x) const noexcept + { + if (__x) + { + *__seg_ |= __mask_; + } + else + { + *__seg_ &= ~__mask_; + } + return *this; + } +#endif // C++23+ + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_reference& + operator=(const __bit_reference& __x) noexcept { return operator=(static_cast(__x)); } - _LIBCUDACXX_INLINE_VISIBILITY void flip() noexcept + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void flip() noexcept { *__seg_ ^= __mask_; } - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> operator&() const noexcept + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_iterator<_Cp, false> + operator&() const noexcept + { + return __bit_iterator<_Cp, false>(__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__mask_))); + } + + friend inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void + swap(__bit_reference<_Cp> __x, __bit_reference<_Cp> __y) noexcept + { + bool __t = __x; + __x = __y; + __y = __t; + } + + template + friend inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void + swap(__bit_reference<_Cp> __x, __bit_reference<_Dp> __y) noexcept + { + bool __t = __x; + __x = __y; + __y = __t; + } + + friend inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void + swap(__bit_reference<_Cp> __x, bool& __y) noexcept + { + bool __t = __x; + __x = __y; + __y = __t; + } + + friend inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void + swap(bool& __x, __bit_reference<_Cp> __y) noexcept { - return __bit_iterator<_Cp, false>(__seg_, static_cast(__libcpp_ctz(__mask_))); + bool __t = __x; + __x = __y; + __y = __t; } private: - _LIBCUDACXX_INLINE_VISIBILITY __bit_reference(__storage_pointer __s, __storage_type __m) noexcept + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY + _CCCL_CONSTEXPR_CXX14 explicit __bit_reference(__storage_pointer __s, __storage_type __m) noexcept : __seg_(__s) , __mask_(__m) {} }; -template -class __bit_reference<_Cp, false> -{}; - -template -inline _LIBCUDACXX_INLINE_VISIBILITY void swap(__bit_reference<_Cp> __x, __bit_reference<_Cp> __y) noexcept -{ - bool __t = __x; - __x = __y; - __y = __t; -} - -template -inline 
_LIBCUDACXX_INLINE_VISIBILITY void swap(__bit_reference<_Cp> __x, __bit_reference<_Dp> __y) noexcept -{ - bool __t = __x; - __x = __y; - __y = __t; -} - -template -inline _LIBCUDACXX_INLINE_VISIBILITY void swap(__bit_reference<_Cp> __x, bool& __y) noexcept -{ - bool __t = __x; - __x = __y; - __y = __t; -} - -template -inline _LIBCUDACXX_INLINE_VISIBILITY void swap(bool& __x, __bit_reference<_Cp> __y) noexcept -{ - bool __t = __x; - __x = __y; - __y = __t; -} - template class __bit_const_reference { - typedef typename _Cp::__storage_type __storage_type; - typedef typename _Cp::__const_storage_pointer __storage_pointer; + using __storage_type = typename _Cp::__storage_type; + using __storage_pointer = typename _Cp::__const_storage_pointer; __storage_pointer __seg_; __storage_type __mask_; @@ -147,25 +173,30 @@ class __bit_const_reference friend class __bit_iterator<_Cp, true>; public: - _LIBCUDACXX_INLINE_VISIBILITY __bit_const_reference(const __bit_const_reference&) = default; + using __container = typename _Cp::__self; + + _LIBCUDACXX_HIDE_FROM_ABI __bit_const_reference(const __bit_const_reference&) = default; - _LIBCUDACXX_INLINE_VISIBILITY __bit_const_reference(const __bit_reference<_Cp>& __x) noexcept + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 + __bit_const_reference(const __bit_reference<_Cp>& __x) noexcept : __seg_(__x.__seg_) , __mask_(__x.__mask_) {} - _LIBCUDACXX_INLINE_VISIBILITY constexpr operator bool() const noexcept + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr operator bool() const noexcept { return static_cast(*__seg_ & __mask_); } - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, true> operator&() const noexcept + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_iterator<_Cp, true> + operator&() const noexcept { - return __bit_iterator<_Cp, true>(__seg_, static_cast(__libcpp_ctz(__mask_))); + return __bit_iterator<_Cp, true>(__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__mask_))); } private: - _LIBCUDACXX_INLINE_VISIBILITY constexpr __bit_const_reference(__storage_pointer __s, __storage_type __m) noexcept + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr explicit __bit_const_reference( + __storage_pointer __s, __storage_type __m) noexcept : __seg_(__s) , __mask_(__m) {} @@ -173,262 +204,66 @@ class __bit_const_reference __bit_const_reference& operator=(const __bit_const_reference&) = delete; }; -// find +// fill_n -template -__bit_iterator<_Cp, _IsConst> __find_bool_true(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n) +template +inline _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY void +__fill_n_impl(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) { - typedef __bit_iterator<_Cp, _IsConst> _It; - typedef typename _It::__storage_type __storage_type; - static const int __bits_per_word = _It::__bits_per_word; - // do first partial word - if (__first.__ctz_ != 0) - { - __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); - __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - __storage_type __b = *__first.__seg_ & __m; - if (__b) - { - return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); - } - if (__n == __dn) - { - return __first + __n; - } - __n -= __dn; - ++__first.__seg_; - } - // do middle whole words - for (; __n >= __bits_per_word; 
++__first.__seg_, __n -= __bits_per_word) - { - if (*__first.__seg_) - { - return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(*__first.__seg_))); - } - } - // do last partial word - if (__n > 0) - { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __storage_type __b = *__first.__seg_ & __m; - if (__b) - { - return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); - } - } - return _It(__first.__seg_, static_cast(__n)); -} + using _It = __bit_iterator<_Cp, false>; + using __storage_type = typename _It::__storage_type; -template -__bit_iterator<_Cp, _IsConst> __find_bool_false(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n) -{ - typedef __bit_iterator<_Cp, _IsConst> _It; - typedef typename _It::__storage_type __storage_type; const int __bits_per_word = _It::__bits_per_word; // do first partial word if (__first.__ctz_ != 0) { __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); - __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); + __storage_type __dn = (_CUDA_VSTD::min)(__clz_f, static_cast<__storage_type>(__n)); __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - __storage_type __b = ~*__first.__seg_ & __m; - if (__b) + if (_FillVal) { - return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); + *__first.__seg_ |= __m; } - if (__n == __dn) + else { - return __first + __n; + *__first.__seg_ &= ~__m; } - __n -= __dn; + __n -= __dn.__data; ++__first.__seg_; } // do middle whole words - for (; __n >= __bits_per_word; ++__first.__seg_, __n -= __bits_per_word) - { - __storage_type __b = ~*__first.__seg_; - if (__b) - { - return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); - } - } + __storage_type __nw = __n / __bits_per_word; + _CUDA_VSTD::fill_n(_CUDA_VSTD::__to_address(__first.__seg_), __nw, _FillVal ? 
~static_cast<__storage_type>(0) : 0); + __n -= (__nw * __bits_per_word).__data; // do last partial word if (__n > 0) { + __first.__seg_ += __nw.__data; __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __storage_type __b = ~*__first.__seg_ & __m; - if (__b) + if (_FillVal) { - return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); + *__first.__seg_ |= __m; + } + else + { + *__first.__seg_ &= ~__m; } - } - return _It(__first.__seg_, static_cast(__n)); -} - -template -inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, _IsConst> -find(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, const _Tp& __value_) -{ - if (static_cast(__value_)) - { - return __find_bool_true(__first, static_cast(__last - __first)); - } - return __find_bool_false(__first, static_cast(__last - __first)); -} - -// count - -template -typename __bit_iterator<_Cp, _IsConst>::difference_type -__count_bool_true(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n) -{ - typedef __bit_iterator<_Cp, _IsConst> _It; - typedef typename _It::__storage_type __storage_type; - typedef typename _It::difference_type difference_type; - const int __bits_per_word = _It::__bits_per_word; - difference_type __r = 0; - // do first partial word - if (__first.__ctz_ != 0) - { - __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); - __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - __r = _CUDA_VSTD::__libcpp_popcount(*__first.__seg_ & __m); - __n -= __dn; - ++__first.__seg_; - } - // do middle whole words - for (; __n >= __bits_per_word; ++__first.__seg_, __n -= __bits_per_word) - { - __r += _CUDA_VSTD::__libcpp_popcount(*__first.__seg_); - } - // do last partial word - if (__n > 0) - { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __r += _CUDA_VSTD::__libcpp_popcount(*__first.__seg_ & __m); - } - return __r; -} - -template -typename __bit_iterator<_Cp, _IsConst>::difference_type -__count_bool_false(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n) -{ - typedef __bit_iterator<_Cp, _IsConst> _It; - typedef typename _It::__storage_type __storage_type; - typedef typename _It::difference_type difference_type; - const int __bits_per_word = _It::__bits_per_word; - difference_type __r = 0; - // do first partial word - if (__first.__ctz_ != 0) - { - __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); - __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - __r = _CUDA_VSTD::__libcpp_popcount(~*__first.__seg_ & __m); - __n -= __dn; - ++__first.__seg_; - } - // do middle whole words - for (; __n >= __bits_per_word; ++__first.__seg_, __n -= __bits_per_word) - { - __r += _CUDA_VSTD::__libcpp_popcount(~*__first.__seg_); - } - // do last partial word - if (__n > 0) - { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __r += _CUDA_VSTD::__libcpp_popcount(~*__first.__seg_ & __m); - } - return __r; -} - -template -inline _LIBCUDACXX_INLINE_VISIBILITY typename __bit_iterator<_Cp, _IsConst>::difference_type -count(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, const _Tp& __value_) -{ - if (static_cast(__value_)) - { - return __count_bool_true(__first, static_cast(__last - __first)); - } - return 
__count_bool_false(__first, static_cast(__last - __first)); -} - -// fill_n - -template -void __fill_n_false(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) -{ - typedef __bit_iterator<_Cp, false> _It; - typedef typename _It::__storage_type __storage_type; - const int __bits_per_word = _It::__bits_per_word; - // do first partial word - if (__first.__ctz_ != 0) - { - __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); - __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - *__first.__seg_ &= ~__m; - __n -= __dn; - ++__first.__seg_; - } - // do middle whole words - __storage_type __nw = __n / __bits_per_word; - _CUDA_VSTD::memset(_CUDA_VSTD::__to_raw_pointer(__first.__seg_), 0, __nw * sizeof(__storage_type)); - __n -= __nw * __bits_per_word; - // do last partial word - if (__n > 0) - { - __first.__seg_ += __nw; - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - *__first.__seg_ &= ~__m; - } -} - -template -void __fill_n_true(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) -{ - typedef __bit_iterator<_Cp, false> _It; - typedef typename _It::__storage_type __storage_type; - const int __bits_per_word = _It::__bits_per_word; - // do first partial word - if (__first.__ctz_ != 0) - { - __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); - __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - *__first.__seg_ |= __m; - __n -= __dn; - ++__first.__seg_; - } - // do middle whole words - __storage_type __nw = __n / __bits_per_word; - _CUDA_VSTD::memset(_CUDA_VSTD::__to_raw_pointer(__first.__seg_), -1, __nw * sizeof(__storage_type)); - __n -= __nw * __bits_per_word; - // do last partial word - if (__n > 0) - { - __first.__seg_ += __nw; - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - *__first.__seg_ |= __m; } } template -inline _LIBCUDACXX_INLINE_VISIBILITY void -fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n, bool __value_) +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void +fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n, bool __value) { if (__n > 0) { - if (__value_) + if (__value) { - __fill_n_true(__first, __n); + _CUDA_VSTD::__fill_n_impl(__first, __n); } else { - __fill_n_false(__first, __n); + _CUDA_VSTD::__fill_n_impl(__first, __n); } } } @@ -436,21 +271,24 @@ fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n, bool __v // fill template -inline _LIBCUDACXX_INLINE_VISIBILITY void -fill(__bit_iterator<_Cp, false> __first, __bit_iterator<_Cp, false> __last, bool __value_) +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void +fill(__bit_iterator<_Cp, false> __first, __bit_iterator<_Cp, false> __last, bool __value) { - _CUDA_VSTD::fill_n(__first, static_cast(__last - __first), __value_); + _CUDA_VSTD::fill_n(__first, static_cast(__last - __first), __value); } // copy template -__bit_iterator<_Cp, false> __copy_aligned( - __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) +inline _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> +__copy_aligned(__bit_iterator<_Cp, _IsConst> __first, + 
__bit_iterator<_Cp, _IsConst> __last, + __bit_iterator<_Cp, false> __result) { - typedef __bit_iterator<_Cp, _IsConst> _In; - typedef typename _In::difference_type difference_type; - typedef typename _In::__storage_type __storage_type; + using _In = __bit_iterator<_Cp, _IsConst>; + using difference_type = typename _In::difference_type; + using __storage_type = typename _In::__storage_type; + const int __bits_per_word = _In::__bits_per_word; difference_type __n = __last - __first; if (__n > 0) @@ -473,15 +311,13 @@ __bit_iterator<_Cp, false> __copy_aligned( // __first.__ctz_ == 0; // do middle words __storage_type __nw = __n / __bits_per_word; - _CUDA_VSTD::memmove(_CUDA_VSTD::__to_raw_pointer(__result.__seg_), - _CUDA_VSTD::__to_raw_pointer(__first.__seg_), - __nw * sizeof(__storage_type)); - __n -= __nw * __bits_per_word; - __result.__seg_ += __nw; + _CUDA_VSTD::copy_n(_CUDA_VSTD::__to_address(__first.__seg_), __nw.__data, _CUDA_VSTD::__to_address(__result.__seg_)); + __result.__seg_ += __nw.__data; + __n -= (__nw * __bits_per_word).__data; // do last word if (__n > 0) { - __first.__seg_ += __nw; + __first.__seg_ += __nw.__data; __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); __storage_type __b = *__first.__seg_ & __m; *__result.__seg_ &= ~__m; @@ -493,14 +329,16 @@ __bit_iterator<_Cp, false> __copy_aligned( } template -__bit_iterator<_Cp, false> __copy_unaligned( +inline _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> +__copy_unaligned( __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - typedef __bit_iterator<_Cp, _IsConst> _In; - typedef typename _In::difference_type difference_type; - typedef typename _In::__storage_type __storage_type; - static const int __bits_per_word = _In::__bits_per_word; - difference_type __n = __last - __first; + using _In = __bit_iterator<_Cp, _IsConst>; + using difference_type = typename _In::difference_type; + using __storage_type = typename _In::__storage_type; + + const int __bits_per_word = _In::__bits_per_word; + difference_type __n = __last - __first; if (__n > 0) { // do first word @@ -523,9 +361,9 @@ __bit_iterator<_Cp, false> __copy_unaligned( { *__result.__seg_ |= __b >> (__first.__ctz_ - __result.__ctz_); } - __result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word; - __result.__ctz_ = static_cast((__ddn + __result.__ctz_) % __bits_per_word); - __dn -= __ddn; + __result.__seg_ += ((__ddn + __result.__ctz_) / __bits_per_word).__data; + __result.__ctz_ = static_cast(((__ddn + __result.__ctz_) % __bits_per_word).__data); + __dn -= __ddn.__data; if (__dn > 0) { __m = ~__storage_type(0) >> (__bits_per_word - __dn); @@ -558,9 +396,9 @@ __bit_iterator<_Cp, false> __copy_unaligned( __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn)); *__result.__seg_ &= ~__m; *__result.__seg_ |= __b << __result.__ctz_; - __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word; - __result.__ctz_ = static_cast((__dn + __result.__ctz_) % __bits_per_word); - __n -= __dn; + __result.__seg_ += ((__dn + __result.__ctz_) / __bits_per_word).__data; + __result.__ctz_ = static_cast(((__dn + __result.__ctz_) % __bits_per_word).__data); + __n -= __dn.__data; if (__n > 0) { __m = ~__storage_type(0) >> (__bits_per_word - __n); @@ -574,25 +412,27 @@ __bit_iterator<_Cp, false> __copy_unaligned( } template -inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> +inline 
_LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_iterator<_Cp, false> copy(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { if (__first.__ctz_ == __result.__ctz_) { - return __copy_aligned(__first, __last, __result); + return _CUDA_VSTD::__copy_aligned(__first, __last, __result); } - return __copy_unaligned(__first, __last, __result); + return _CUDA_VSTD::__copy_unaligned(__first, __last, __result); } // copy_backward template -__bit_iterator<_Cp, false> __copy_backward_aligned( +inline _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> +__copy_backward_aligned( __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - typedef __bit_iterator<_Cp, _IsConst> _In; - typedef typename _In::difference_type difference_type; - typedef typename _In::__storage_type __storage_type; + using _In = __bit_iterator<_Cp, _IsConst>; + using difference_type = typename _In::difference_type; + using __storage_type = typename _In::__storage_type; + const int __bits_per_word = _In::__bits_per_word; difference_type __n = __last - __first; if (__n > 0) @@ -614,18 +454,44 @@ __bit_iterator<_Cp, false> __copy_backward_aligned( // __result.__ctz_ == 0 || __n == 0 // do middle words __storage_type __nw = __n / __bits_per_word; - __result.__seg_ -= __nw; - __last.__seg_ -= __nw; - _CUDA_VSTD::memmove(_CUDA_VSTD::__to_raw_pointer(__result.__seg_), - _CUDA_VSTD::__to_raw_pointer(__last.__seg_), - __nw * sizeof(__storage_type)); - __n -= __nw * __bits_per_word; + __result.__seg_ -= __nw.__data; + __last.__seg_ -= __nw.__data; + _CUDA_VSTD::copy_n(_CUDA_VSTD::__to_address(__last.__seg_), __nw.__data, _CUDA_VSTD::__to_address(__result.__seg_)); + __n -= (__nw * __bits_per_word).__data; // do last word if (__n > 0) { __storage_type __m = ~__storage_type(0) << (__bits_per_word - __n); - __storage_type __b = *--__last.__seg_ & __m; - *--__result.__seg_ &= ~__m; +#if defined(_CCCL_COMPILER_GCC) && _GNUC_VER < 900 + // workaround for GCC pre-9 being really bad at tracking one-past-the-end pointers at constexpr + // can't check for is-constant-evaluated, because GCC pre-9 also lacks _that_. + if (__last.__seg_ == __first.__seg_ + 1) + { + __last.__seg_ = __first.__seg_; + } + else + { + --__last.__seg_; + } +#else // ^^ GCC < 9 ^^ | vv !GCC || GCC >= 9 vv + --__last.__seg_; +#endif // !GCC || GCC >= 9 + __storage_type __b = *__last.__seg_ & __m; +#if defined(_CCCL_COMPILER_GCC) && _GNUC_VER < 900 + // workaround for GCC pre-9 being really bad at tracking one-past-the-end pointers at constexpr + // can't check for is-constant-evaluated, because GCC pre-9 also lacks _that_. 
+ if (__result.__seg_ == __first.__seg_ + 1) + { + __result.__seg_ = __first.__seg_; + } + else + { + --__result.__seg_; + } +#else // ^^ GCC < 9 ^^ | vv !GCC || GCC >= 9 vv + --__result.__seg_; +#endif // !GCC || GCC >= 9 + *__result.__seg_ &= ~__m; *__result.__seg_ |= __b; __result.__ctz_ = static_cast(-__n & (__bits_per_word - 1)); } @@ -634,12 +500,14 @@ __bit_iterator<_Cp, false> __copy_backward_aligned( } template -__bit_iterator<_Cp, false> __copy_backward_unaligned( +inline _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> +__copy_backward_unaligned( __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - typedef __bit_iterator<_Cp, _IsConst> _In; - typedef typename _In::difference_type difference_type; - typedef typename _In::__storage_type __storage_type; + using _In = __bit_iterator<_Cp, _IsConst>; + using difference_type = typename _In::difference_type; + using __storage_type = typename _In::__storage_type; + const int __bits_per_word = _In::__bits_per_word; difference_type __n = __last - __first; if (__n > 0) @@ -666,17 +534,37 @@ __bit_iterator<_Cp, false> __copy_backward_unaligned( { *__result.__seg_ |= __b >> (__last.__ctz_ - __result.__ctz_); } - __result.__ctz_ = static_cast(((-__ddn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word); - __dn -= __ddn; + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_MSVC(4146) // unary minus applied to unsigned type + __result.__ctz_ = + static_cast((((-__ddn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word).__data); + _CCCL_DIAG_POP + __dn -= __ddn.__data; } if (__dn > 0) { // __result.__ctz_ == 0 +#if defined(_CCCL_COMPILER_GCC) && _GNUC_VER < 900 + // workaround for GCC pre-9 being really bad at tracking one-past-the-end pointers at constexpr + // can't check for is-constant-evaluated, because GCC pre-9 also lacks _that_. 
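+    // When __result.__seg_ is exactly one past __first.__seg_, assigning __first.__seg_ directly
+    // yields the same pointer as --__result.__seg_, while avoiding a decrement of what GCC 8's
+    // constexpr evaluator may be tracking as a one-past-the-end pointer.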
+ if (__result.__seg_ == __first.__seg_ + 1) + { + __result.__seg_ = __first.__seg_; + } + else + { + --__result.__seg_; + } +#else // ^^ GCC < 9 ^^ | vv !GCC || GCC >= 9 vv --__result.__seg_; +#endif // !GCC || GCC >= 9 + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_MSVC(4146) // unary minus applied to unsigned type __result.__ctz_ = static_cast(-__dn & (__bits_per_word - 1)); - __m = ~__storage_type(0) << __result.__ctz_; + _CCCL_DIAG_POP + __m = ~__storage_type(0) << __result.__ctz_; *__result.__seg_ &= ~__m; - __last.__ctz_ -= __dn + __ddn; + __last.__ctz_ -= (__dn + __ddn).__data; *__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_); } // __last.__ctz_ = 0 @@ -704,8 +592,12 @@ __bit_iterator<_Cp, false> __copy_backward_unaligned( __m = (~__storage_type(0) << (__result.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_r); *__result.__seg_ &= ~__m; *__result.__seg_ |= __b >> (__bits_per_word - __result.__ctz_); - __result.__ctz_ = static_cast(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word); - __n -= __dn; + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_MSVC(4146) // unary minus applied to unsigned type + __result.__ctz_ = + static_cast((((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word).__data); + _CCCL_DIAG_POP + __n -= __dn.__data; if (__n > 0) { // __result.__ctz_ == 0 @@ -721,20 +613,22 @@ __bit_iterator<_Cp, false> __copy_backward_unaligned( } template -inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> copy_backward( - __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_iterator<_Cp, false> +copy_backward(__bit_iterator<_Cp, _IsConst> __first, + __bit_iterator<_Cp, _IsConst> __last, + __bit_iterator<_Cp, false> __result) { if (__last.__ctz_ == __result.__ctz_) { - return __copy_backward_aligned(__first, __last, __result); + return _CUDA_VSTD::__copy_backward_aligned(__first, __last, __result); } - return __copy_backward_unaligned(__first, __last, __result); + return _CUDA_VSTD::__copy_backward_unaligned(__first, __last, __result); } // move template -inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> move(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { return _CUDA_VSTD::copy(__first, __last, __result); @@ -743,7 +637,7 @@ move(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last // move_backward template -inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> move_backward( +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> move_backward( __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { return _CUDA_VSTD::copy_backward(__first, __last, __result); @@ -751,13 +645,14 @@ inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> move_backward( // swap_ranges -template -__bit_iterator<__C2, false> __swap_ranges_aligned( - __bit_iterator<__C1, false> __first, __bit_iterator<__C1, false> __last, __bit_iterator<__C2, false> __result) +template +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cr, false> __swap_ranges_aligned( + __bit_iterator<_Cl, false> __first, __bit_iterator<_Cl, false> __last, __bit_iterator<_Cr, false> __result) { - typedef 
__bit_iterator<__C1, false> _I1; - typedef typename _I1::difference_type difference_type; - typedef typename _I1::__storage_type __storage_type; + using _I1 = __bit_iterator<_Cl, false>; + using difference_type = typename _I1::difference_type; + using __storage_type = typename _I1::__storage_type; + const int __bits_per_word = _I1::__bits_per_word; difference_type __n = __last - __first; if (__n > 0) @@ -802,13 +697,14 @@ __bit_iterator<__C2, false> __swap_ranges_aligned( return __result; } -template -__bit_iterator<__C2, false> __swap_ranges_unaligned( - __bit_iterator<__C1, false> __first, __bit_iterator<__C1, false> __last, __bit_iterator<__C2, false> __result) +template +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cr, false> __swap_ranges_unaligned( + __bit_iterator<_Cl, false> __first, __bit_iterator<_Cl, false> __last, __bit_iterator<_Cr, false> __result) { - typedef __bit_iterator<__C1, false> _I1; - typedef typename _I1::difference_type difference_type; - typedef typename _I1::__storage_type __storage_type; + using _I1 = __bit_iterator<_Cl, false>; + using difference_type = typename _I1::difference_type; + using __storage_type = typename _I1::__storage_type; + const int __bits_per_word = _I1::__bits_per_word; difference_type __n = __last - __first; if (__n > 0) @@ -901,15 +797,15 @@ __bit_iterator<__C2, false> __swap_ranges_unaligned( return __result; } -template -inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<__C2, false> swap_ranges( - __bit_iterator<__C1, false> __first1, __bit_iterator<__C1, false> __last1, __bit_iterator<__C2, false> __first2) +template +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cr, false> swap_ranges( + __bit_iterator<_Cl, false> __first1, __bit_iterator<_Cl, false> __last1, __bit_iterator<_Cr, false> __first2) { if (__first1.__ctz_ == __first2.__ctz_) { - return __swap_ranges_aligned(__first1, __last1, __first2); + return _CUDA_VSTD::__swap_ranges_aligned(__first1, __last1, __first2); } - return __swap_ranges_unaligned(__first1, __last1, __first2); + return _CUDA_VSTD::__swap_ranges_unaligned(__first1, __last1, __first2); } // rotate @@ -917,28 +813,38 @@ inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<__C2, false> swap_ranges( template struct __bit_array { - typedef typename _Cp::difference_type difference_type; - typedef typename _Cp::__storage_type __storage_type; - typedef typename _Cp::__storage_pointer __storage_pointer; - typedef typename _Cp::iterator iterator; + using difference_type = typename _Cp::difference_type; + using __storage_type = typename _Cp::__storage_type; + using __storage_pointer = typename _Cp::__storage_pointer; + using iterator = typename _Cp::iterator; + static const unsigned __bits_per_word = _Cp::__bits_per_word; static const unsigned _Np = 4; difference_type __size_; __storage_type __word_[_Np]; - _LIBCUDACXX_INLINE_VISIBILITY static difference_type capacity() + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 static difference_type capacity() { return static_cast(_Np * __bits_per_word); } - _LIBCUDACXX_INLINE_VISIBILITY explicit __bit_array(difference_type __s) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY + _CCCL_CONSTEXPR_CXX14 explicit __bit_array(difference_type __s) : __size_(__s) - {} - _LIBCUDACXX_INLINE_VISIBILITY iterator begin() + { + if (__libcpp_is_constant_evaluated()) + { + for (size_t __i = 0; __i != __bit_array<_Cp>::_Np; ++__i) + { + _CUDA_VSTD::__construct_at(__word_ + __i, 0); + } + } + 
} + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 iterator begin() { return iterator(pointer_traits<__storage_pointer>::pointer_to(__word_[0]), 0); } - _LIBCUDACXX_INLINE_VISIBILITY iterator end() + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 iterator end() { return iterator(pointer_traits<__storage_pointer>::pointer_to(__word_[0]) + __size_ / __bits_per_word, static_cast(__size_ % __bits_per_word)); @@ -946,11 +852,12 @@ struct __bit_array }; template -__bit_iterator<_Cp, false> +inline _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> rotate(__bit_iterator<_Cp, false> __first, __bit_iterator<_Cp, false> __middle, __bit_iterator<_Cp, false> __last) { - typedef __bit_iterator<_Cp, false> _I1; - typedef typename _I1::difference_type difference_type; + using _I1 = __bit_iterator<_Cp, false>; + using difference_type = typename _I1::difference_type; + difference_type __d1 = __middle - __first; difference_type __d2 = __last - __middle; _I1 __r = __first + __d2; @@ -997,14 +904,15 @@ rotate(__bit_iterator<_Cp, false> __first, __bit_iterator<_Cp, false> __middle, // equal template -bool __equal_unaligned( +inline _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY bool __equal_unaligned( __bit_iterator<_Cp, _IC1> __first1, __bit_iterator<_Cp, _IC1> __last1, __bit_iterator<_Cp, _IC2> __first2) { - typedef __bit_iterator<_Cp, _IC1> _It; - typedef typename _It::difference_type difference_type; - typedef typename _It::__storage_type __storage_type; - static const int __bits_per_word = _It::__bits_per_word; - difference_type __n = __last1 - __first1; + using _It = __bit_iterator<_Cp, _IC1>; + using difference_type = typename _It::difference_type; + using __storage_type = typename _It::__storage_type; + + const int __bits_per_word = _It::__bits_per_word; + difference_type __n = __last1 - __first1; if (__n > 0) { // do first word @@ -1032,9 +940,9 @@ bool __equal_unaligned( return false; } } - __first2.__seg_ += (__ddn + __first2.__ctz_) / __bits_per_word; - __first2.__ctz_ = static_cast((__ddn + __first2.__ctz_) % __bits_per_word); - __dn -= __ddn; + __first2.__seg_ += ((__ddn + __first2.__ctz_) / __bits_per_word).__data; + __first2.__ctz_ = static_cast(((__ddn + __first2.__ctz_) % __bits_per_word).__data); + __dn -= __ddn.__data; if (__dn > 0) { __m = ~__storage_type(0) >> (__bits_per_word - __dn); @@ -1075,9 +983,9 @@ bool __equal_unaligned( { return false; } - __first2.__seg_ += (__dn + __first2.__ctz_) / __bits_per_word; - __first2.__ctz_ = static_cast((__dn + __first2.__ctz_) % __bits_per_word); - __n -= __dn; + __first2.__seg_ += ((__dn + __first2.__ctz_) / __bits_per_word).__data; + __first2.__ctz_ = static_cast(((__dn + __first2.__ctz_) % __bits_per_word).__data); + __n -= __dn.__data; if (__n > 0) { __m = ~__storage_type(0) >> (__bits_per_word - __n); @@ -1092,14 +1000,15 @@ bool __equal_unaligned( } template -bool __equal_aligned( +inline _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY bool __equal_aligned( __bit_iterator<_Cp, _IC1> __first1, __bit_iterator<_Cp, _IC1> __last1, __bit_iterator<_Cp, _IC2> __first2) { - typedef __bit_iterator<_Cp, _IC1> _It; - typedef typename _It::difference_type difference_type; - typedef typename _It::__storage_type __storage_type; - static const int __bits_per_word = _It::__bits_per_word; - difference_type __n = __last1 - __first1; + using _It = __bit_iterator<_Cp, _IC1>; + using 
difference_type = typename _It::difference_type; + using __storage_type = typename _It::__storage_type; + + const int __bits_per_word = _It::__bits_per_word; + difference_type __n = __last1 - __first1; if (__n > 0) { // do first word @@ -1142,56 +1051,57 @@ bool __equal_aligned( } template -inline _LIBCUDACXX_INLINE_VISIBILITY bool +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool equal(__bit_iterator<_Cp, _IC1> __first1, __bit_iterator<_Cp, _IC1> __last1, __bit_iterator<_Cp, _IC2> __first2) { if (__first1.__ctz_ == __first2.__ctz_) { - return __equal_aligned(__first1, __last1, __first2); + return _CUDA_VSTD::__equal_aligned(__first1, __last1, __first2); } - return __equal_unaligned(__first1, __last1, __first2); + return _CUDA_VSTD::__equal_unaligned(__first1, __last1, __first2); } -template +template class __bit_iterator { public: - typedef typename _Cp::difference_type difference_type; - typedef bool value_type; - typedef __bit_iterator pointer; - typedef typename conditional<_IsConst, __bit_const_reference<_Cp>, __bit_reference<_Cp>>::type reference; - typedef random_access_iterator_tag iterator_category; + using difference_type = typename _Cp::difference_type; + using value_type = bool; + using pointer = __bit_iterator; + using reference = __conditional_t<_IsConst, __bit_const_reference<_Cp>, __bit_reference<_Cp>>; + using iterator_category = random_access_iterator_tag; private: - typedef typename _Cp::__storage_type __storage_type; - typedef typename conditional<_IsConst, typename _Cp::__const_storage_pointer, typename _Cp::__storage_pointer>::type - __storage_pointer; + using __storage_type = typename _Cp::__storage_type; + using __storage_pointer = + __conditional_t<_IsConst, typename _Cp::__const_storage_pointer, typename _Cp::__storage_pointer>; + static const unsigned __bits_per_word = _Cp::__bits_per_word; __storage_pointer __seg_; unsigned __ctz_; public: - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator() noexcept -#if _CCCL_STD_VER > 2011 + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_iterator() noexcept : __seg_(nullptr) , __ctz_(0) -#endif {} - // avoid re-declaring a copy constructor for the non-const version. 
- using __type_for_copy_to_const = _If<_IsConst, __bit_iterator<_Cp, false>, struct __private_nat>; - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator(const __type_for_copy_to_const& __it) noexcept + _CCCL_CONSTEXPR_CXX14 __bit_iterator(const __bit_iterator<_Cp, _IsConst>& __it) = default; + + template > + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 + __bit_iterator(const __bit_iterator<_Cp, _OtherIsConst>& __it) noexcept : __seg_(__it.__seg_) , __ctz_(__it.__ctz_) {} - _LIBCUDACXX_INLINE_VISIBILITY reference operator*() const noexcept + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 reference operator*() const noexcept { return reference(__seg_, __storage_type(1) << __ctz_); } - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator& operator++() + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_iterator& operator++() { if (__ctz_ != __bits_per_word - 1) { @@ -1205,14 +1115,14 @@ class __bit_iterator return *this; } - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator operator++(int) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_iterator operator++(int) { __bit_iterator __tmp = *this; ++(*this); return __tmp; } - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator& operator--() + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_iterator& operator--() { if (__ctz_ != 0) { @@ -1226,14 +1136,15 @@ class __bit_iterator return *this; } - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator operator--(int) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_iterator operator--(int) { __bit_iterator __tmp = *this; --(*this); return __tmp; } - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator& operator+=(difference_type __n) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_iterator& + operator+=(difference_type __n) { if (__n >= 0) { @@ -1249,72 +1160,91 @@ class __bit_iterator return *this; } - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator& operator-=(difference_type __n) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_iterator& + operator-=(difference_type __n) { return *this += -__n; } - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator operator+(difference_type __n) const + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_iterator + operator+(difference_type __n) const { __bit_iterator __t(*this); __t += __n; return __t; } - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator operator-(difference_type __n) const + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bit_iterator + operator-(difference_type __n) const { __bit_iterator __t(*this); __t -= __n; return __t; } - _LIBCUDACXX_INLINE_VISIBILITY friend __bit_iterator operator+(difference_type __n, const __bit_iterator& __it) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 friend __bit_iterator + operator+(difference_type __n, const __bit_iterator& __it) { return __it + __n; } - _LIBCUDACXX_INLINE_VISIBILITY friend difference_type operator-(const __bit_iterator& __x, const __bit_iterator& __y) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 friend difference_type + operator-(const __bit_iterator& __x, const __bit_iterator& __y) { +#if defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 800 && _GNUC_VER < 900 + if (__y.__seg_ && __y.__seg_ != __x.__seg_) + { + return (__x.__seg_ == __y.__seg_ + 
1 ? 1 : __x.__seg_ - __y.__seg_) * __bits_per_word + __x.__ctz_ - __y.__ctz_; + } +#endif // GCC [8, 9) return (__x.__seg_ - __y.__seg_) * __bits_per_word + __x.__ctz_ - __y.__ctz_; } - _LIBCUDACXX_INLINE_VISIBILITY reference operator[](difference_type __n) const + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 reference + operator[](difference_type __n) const { return *(*this + __n); } - _LIBCUDACXX_INLINE_VISIBILITY friend bool operator==(const __bit_iterator& __x, const __bit_iterator& __y) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 friend bool + operator==(const __bit_iterator& __x, const __bit_iterator& __y) { return __x.__seg_ == __y.__seg_ && __x.__ctz_ == __y.__ctz_; } - _LIBCUDACXX_INLINE_VISIBILITY friend bool operator!=(const __bit_iterator& __x, const __bit_iterator& __y) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 friend bool + operator!=(const __bit_iterator& __x, const __bit_iterator& __y) { return !(__x == __y); } - _LIBCUDACXX_INLINE_VISIBILITY friend bool operator<(const __bit_iterator& __x, const __bit_iterator& __y) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 friend bool + operator<(const __bit_iterator& __x, const __bit_iterator& __y) { return __x.__seg_ < __y.__seg_ || (__x.__seg_ == __y.__seg_ && __x.__ctz_ < __y.__ctz_); } - _LIBCUDACXX_INLINE_VISIBILITY friend bool operator>(const __bit_iterator& __x, const __bit_iterator& __y) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 friend bool + operator>(const __bit_iterator& __x, const __bit_iterator& __y) { return __y < __x; } - _LIBCUDACXX_INLINE_VISIBILITY friend bool operator<=(const __bit_iterator& __x, const __bit_iterator& __y) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 friend bool + operator<=(const __bit_iterator& __x, const __bit_iterator& __y) { return !(__y < __x); } - _LIBCUDACXX_INLINE_VISIBILITY friend bool operator>=(const __bit_iterator& __x, const __bit_iterator& __y) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 friend bool + operator>=(const __bit_iterator& __x, const __bit_iterator& __y) { return !(__x < __y); } private: - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator(__storage_pointer __s, unsigned __ctz) noexcept + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY + _CCCL_CONSTEXPR_CXX14 explicit __bit_iterator(__storage_pointer __s, unsigned __ctz) noexcept : __seg_(__s) , __ctz_(__ctz) {} @@ -1326,56 +1256,55 @@ class __bit_iterator friend class __bit_iterator<_Cp, true>; template friend struct __bit_array; - template - friend void __fill_n_false(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n); - template - friend void __fill_n_true(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n); + template + _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY friend void + __fill_n_impl(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n); + template - friend __bit_iterator<_Dp, false> __copy_aligned( + _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY friend __bit_iterator<_Dp, false> __copy_aligned( __bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); template - friend __bit_iterator<_Dp, false> __copy_unaligned( + _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY friend __bit_iterator<_Dp, false> __copy_unaligned( __bit_iterator<_Dp, _IC> __first, 
__bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); template - friend __bit_iterator<_Dp, false> + _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY friend __bit_iterator<_Dp, false> copy(__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); template - friend __bit_iterator<_Dp, false> __copy_backward_aligned( + _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY friend __bit_iterator<_Dp, false> __copy_backward_aligned( __bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); template - friend __bit_iterator<_Dp, false> __copy_backward_unaligned( + _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY friend __bit_iterator<_Dp, false> __copy_backward_unaligned( __bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); template - friend __bit_iterator<_Dp, false> + _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY friend __bit_iterator<_Dp, false> copy_backward(__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); - template - friend __bit_iterator<__C2, false> - __swap_ranges_aligned(__bit_iterator<__C1, false>, __bit_iterator<__C1, false>, __bit_iterator<__C2, false>); - template - friend __bit_iterator<__C2, false> - __swap_ranges_unaligned(__bit_iterator<__C1, false>, __bit_iterator<__C1, false>, __bit_iterator<__C2, false>); - template - friend __bit_iterator<__C2, false> - swap_ranges(__bit_iterator<__C1, false>, __bit_iterator<__C1, false>, __bit_iterator<__C2, false>); + template + _LIBCUDACXX_INLINE_VISIBILITY friend __bit_iterator<_Cr, false> + __swap_ranges_aligned(__bit_iterator<_Cl, false>, __bit_iterator<_Cl, false>, __bit_iterator<_Cr, false>); + template + _LIBCUDACXX_INLINE_VISIBILITY friend __bit_iterator<_Cr, false> + __swap_ranges_unaligned(__bit_iterator<_Cl, false>, __bit_iterator<_Cl, false>, __bit_iterator<_Cr, false>); + template + _LIBCUDACXX_INLINE_VISIBILITY friend __bit_iterator<_Cr, false> + swap_ranges(__bit_iterator<_Cl, false>, __bit_iterator<_Cl, false>, __bit_iterator<_Cr, false>); template - friend __bit_iterator<_Dp, false> + _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY friend __bit_iterator<_Dp, false> rotate(__bit_iterator<_Dp, false>, __bit_iterator<_Dp, false>, __bit_iterator<_Dp, false>); template - friend bool __equal_aligned(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>); + _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY friend bool + __equal_aligned(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>); template - friend bool __equal_unaligned(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>); + _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY friend bool + __equal_unaligned(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>); template - friend bool equal(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>); - template - friend __bit_iterator<_Dp, _IC> __find_bool_true(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); - template - friend __bit_iterator<_Dp, _IC> __find_bool_false(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); - template - friend typename __bit_iterator<_Dp, _IC>::difference_type - __count_bool_true(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); - template - friend typename __bit_iterator<_Dp, _IC>::difference_type - 
__count_bool_false(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); + _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY friend bool + equal(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>); + template + _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY friend __bit_iterator<_Dp, _IC> + __find_bool(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); + template + _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY friend + typename __bit_iterator<_Dp, _IC>::difference_type __count_bool(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); }; _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__cccl/compiler.h b/libcudacxx/include/cuda/std/__cccl/compiler.h index d4d68a86f2..c3b09f1872 100644 --- a/libcudacxx/include/cuda/std/__cccl/compiler.h +++ b/libcudacxx/include/cuda/std/__cccl/compiler.h @@ -78,6 +78,9 @@ #if !defined(_CCCL_CUDA_COMPILER) || (defined(_CCCL_CUDACC) && _CCCL_CUDACC_VER < 1103000) # define _CCCL_CUDACC_BELOW_11_3 #endif // defined(_CCCL_CUDACC) && _CCCL_CUDACC_VER < 1103000 +#if !defined(_CCCL_CUDA_COMPILER) || (defined(_CCCL_CUDACC) && _CCCL_CUDACC_VER < 1104000) +# define _CCCL_CUDACC_BELOW_11_4 +#endif // defined(_CCCL_CUDACC) && _CCCL_CUDACC_VER < 1104000 #if !defined(_CCCL_CUDA_COMPILER) || (defined(_CCCL_CUDACC) && _CCCL_CUDACC_VER < 1108000) # define _CCCL_CUDACC_BELOW_11_8 #endif // defined(_CCCL_CUDACC) && _CCCL_CUDACC_VER < 1108000 diff --git a/libcudacxx/include/cuda/std/__cccl/diagnostic.h b/libcudacxx/include/cuda/std/__cccl/diagnostic.h index 3ffdefd173..64f27049fe 100644 --- a/libcudacxx/include/cuda/std/__cccl/diagnostic.h +++ b/libcudacxx/include/cuda/std/__cccl/diagnostic.h @@ -32,13 +32,23 @@ # define _CCCL_DIAG_SUPPRESS_GCC(str) # define _CCCL_DIAG_SUPPRESS_NVHPC(str) # define _CCCL_DIAG_SUPPRESS_MSVC(str) -#elif defined(_CCCL_COMPILER_GCC) || defined(_CCCL_COMPILER_ICC) +# define _CCCL_DIAG_SUPPRESS_ICC(str) +#elif defined(_CCCL_COMPILER_GCC) +# define _CCCL_DIAG_PUSH _Pragma("GCC diagnostic push") +# define _CCCL_DIAG_POP _Pragma("GCC diagnostic pop") +# define _CCCL_DIAG_SUPPRESS_CLANG(str) +# define _CCCL_DIAG_SUPPRESS_GCC(str) _Pragma(_CCCL_TOSTRING(GCC diagnostic ignored str)) +# define _CCCL_DIAG_SUPPRESS_NVHPC(str) +# define _CCCL_DIAG_SUPPRESS_MSVC(str) +# define _CCCL_DIAG_SUPPRESS_ICC(str) +#elif defined(_CCCL_COMPILER_ICC) # define _CCCL_DIAG_PUSH _Pragma("GCC diagnostic push") # define _CCCL_DIAG_POP _Pragma("GCC diagnostic pop") # define _CCCL_DIAG_SUPPRESS_CLANG(str) # define _CCCL_DIAG_SUPPRESS_GCC(str) _Pragma(_CCCL_TOSTRING(GCC diagnostic ignored str)) # define _CCCL_DIAG_SUPPRESS_NVHPC(str) # define _CCCL_DIAG_SUPPRESS_MSVC(str) +# define _CCCL_DIAG_SUPPRESS_ICC(str) _Pragma(_CCCL_TOSTRING(warning disable str)) #elif defined(_CCCL_COMPILER_NVHPC) # define _CCCL_DIAG_PUSH _Pragma("diagnostic push") # define _CCCL_DIAG_POP _Pragma("diagnostic pop") @@ -46,6 +56,7 @@ # define _CCCL_DIAG_SUPPRESS_GCC(str) # define _CCCL_DIAG_SUPPRESS_NVHPC(str) _Pragma(_CCCL_TOSTRING(diag_suppress str)) # define _CCCL_DIAG_SUPPRESS_MSVC(str) +# define _CCCL_DIAG_SUPPRESS_ICC(str) #elif defined(_CCCL_COMPILER_MSVC) # define _CCCL_DIAG_PUSH __pragma(warning(push)) # define _CCCL_DIAG_POP __pragma(warning(pop)) @@ -53,6 +64,7 @@ # define _CCCL_DIAG_SUPPRESS_GCC(str) # define _CCCL_DIAG_SUPPRESS_NVHPC(str) # define _CCCL_DIAG_SUPPRESS_MSVC(str) __pragma(warning(disable : str)) +# define _CCCL_DIAG_SUPPRESS_ICC(str) #else # define _CCCL_DIAG_PUSH # define _CCCL_DIAG_POP @@ -60,6 +72,7 @@ # define 
_CCCL_DIAG_SUPPRESS_GCC(str)
# define _CCCL_DIAG_SUPPRESS_NVHPC(str)
# define _CCCL_DIAG_SUPPRESS_MSVC(str)
+# define _CCCL_DIAG_SUPPRESS_ICC(str)
#endif
// Convenient shortcuts to silence common warnings
@@ -94,7 +107,10 @@
# if defined(_CCCL_COMPILER_MSVC)
# define _CCCL_NV_DIAG_SUPPRESS(_WARNING) __pragma(_CCCL_TOSTRING(nv_diag_suppress _WARNING))
# define _CCCL_NV_DIAG_DEFAULT(_WARNING) __pragma(_CCCL_TOSTRING(nv_diag_default _WARNING))
-# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv
+# elif defined(_CCCL_COMPILER_ICC) // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv _CCCL_COMPILER_ICC vvv
+# define _CCCL_NV_DIAG_SUPPRESS(_WARNING) _Pragma(_CCCL_TOSTRING(nv_diag_suppress _WARNING))
+# define _CCCL_NV_DIAG_DEFAULT(_WARNING) _Pragma(_CCCL_TOSTRING(nv_diag_default _WARNING))
+# else // ^^^ _CCCL_COMPILER_ICC ^^^ / vvv !_CCCL_COMPILER_{MSVC,ICC} vvv
# define _CCCL_NV_DIAG_SUPPRESS(_WARNING) \
_Pragma(_CCCL_TOSTRING(nv_diagnostic push)) _Pragma(_CCCL_TOSTRING(nv_diag_suppress _WARNING))
# define _CCCL_NV_DIAG_DEFAULT(_WARNING) _Pragma(_CCCL_TOSTRING(nv_diagnostic pop))
diff --git a/libcudacxx/include/cuda/std/bitset b/libcudacxx/include/cuda/std/bitset
new file mode 100644
index 0000000000..78cd67857a
--- /dev/null
+++ b/libcudacxx/include/cuda/std/bitset
@@ -0,0 +1,1071 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD_BITSET
+#define _CUDA_STD_BITSET
+
+#include
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+# pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+# pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+# pragma system_header
+#endif // no system header
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include // all public C++ headers provide the assertion handler
+#include
+#include
+#if defined(_LIBCUDACXX_HAS_STRING_VIEW)
+# include
+#endif // _LIBCUDACXX_HAS_STRING_VIEW
+#include
+
+// standard-mandated includes
+
+// [bitset.syn]
+#include
+#if defined(_LIBCUDACXX_HAS_STRING)
+# include
+#endif // _LIBCUDACXX_HAS_STRING
+
+_CCCL_PUSH_MACROS
+
+_LIBCUDACXX_BEGIN_NAMESPACE_STD
+
+template
+struct __avoid_promotions
+{
+  using __base = __conditional_t<(sizeof(_Int) >= sizeof(int)),
+                                 _Int,
+                                 __conditional_t::value, unsigned int, signed int>>;
+
+  constexpr __avoid_promotions() = default;
+
+  template >
+  _CCCL_HOST_DEVICE constexpr __avoid_promotions(_Tp __i)
+      : __data(static_cast<_Int>(__i))
+  {}
+
+  _CCCL_HOST_DEVICE constexpr explicit operator bool() const
+  {
+    return static_cast(__data);
+  }
+
+  // helper for fill_n
+  _CCCL_HOST_DEVICE constexpr friend _Int __convert_to_integral(__avoid_promotions __self)
+  {
+    return __self.__data;
+  }
+
+#define _DEFINE_UNARY(__op) \
+  _CCCL_HOST_DEVICE constexpr friend __avoid_promotions operator __op(__avoid_promotions __operand) \
+  { \
+    return __avoid_promotions(static_cast<_Int>(__op static_cast<__base>(__operand.__data))); \
+  }
+
+  _DEFINE_UNARY(~)
+  _CCCL_DIAG_PUSH
+  _CCCL_DIAG_SUPPRESS_MSVC(4146) // unary minus applied to an unsigned type
+ _DEFINE_UNARY(-) + _CCCL_DIAG_POP +#undef _DEFINE_UNARY + +#define _DEFINE_SHIFT(__op) \ + template \ + _CCCL_HOST_DEVICE constexpr friend __avoid_promotions operator __op(__avoid_promotions __operand, _Tp __n) \ + { \ + return __avoid_promotions(static_cast<_Int>(static_cast<__base>(__operand.__data) __op static_cast<__base>(__n))); \ + } \ + template \ + _CCCL_HOST_DEVICE constexpr friend __avoid_promotions operator __op( \ + __avoid_promotions __operand, __avoid_promotions<_Tp> __n) \ + { \ + return __avoid_promotions( \ + static_cast<_Int>(static_cast<__base>(__operand.__data) __op static_cast<__base>(__n.__data))); \ + } + + _DEFINE_SHIFT(<<) + _DEFINE_SHIFT(>>) +#undef _DEFINE_SHIFT + +#define _DEFINE_SHIFT_ASSIGNMENT(__op) \ + template \ + _CCCL_HOST_DEVICE _CCCL_CONSTEXPR_CXX14 __avoid_promotions& operator __op##=(_Tp __n) \ + { \ + if (__n >= sizeof(_Int) * CHAR_BIT) \ + { \ + __data = 0; \ + } \ + else \ + { \ + __data = static_cast<_Int>(static_cast<__base>(__data) __op static_cast<__base>(__n)); \ + } \ + return *this; \ + } + + _DEFINE_SHIFT_ASSIGNMENT(<<) + _DEFINE_SHIFT_ASSIGNMENT(>>) +#undef _DEFINE_SHIFT_ASSIGNMENT + +#define _DEFINE_BINARY(__op) \ + _CCCL_HOST_DEVICE constexpr friend __avoid_promotions operator __op( \ + __avoid_promotions __lhs, __avoid_promotions __rhs) \ + { \ + return __avoid_promotions( \ + static_cast<_Int>(static_cast<__base>(__lhs.__data) __op static_cast<__base>(__rhs.__data))); \ + } + + _DEFINE_BINARY(+) + _DEFINE_BINARY(-) + _DEFINE_BINARY(*) + _DEFINE_BINARY(/) + _DEFINE_BINARY(%) + _DEFINE_BINARY(&) + _DEFINE_BINARY(|) + _DEFINE_BINARY(^) +#undef _DEFINE_BINARY + +#define _DEFINE_ASSIGNMENT(__op) \ + _CCCL_HOST_DEVICE _CCCL_CONSTEXPR_CXX14 __avoid_promotions& operator __op##=(__avoid_promotions __rhs) \ + { \ + __data = static_cast<_Int>(static_cast<__base>(__data) __op static_cast<__base>(__rhs.__data)); \ + return *this; \ + } + + _DEFINE_ASSIGNMENT(+) + _DEFINE_ASSIGNMENT(-) + _DEFINE_ASSIGNMENT(*) + _DEFINE_ASSIGNMENT(/) + _DEFINE_ASSIGNMENT(%) + _DEFINE_ASSIGNMENT(&) + _DEFINE_ASSIGNMENT(|) + _DEFINE_ASSIGNMENT(^) +#undef _DEFINE_ASSIGNMENT + +#define _DEFINE_COMPARISON(__op) \ + _CCCL_HOST_DEVICE constexpr friend bool operator __op(__avoid_promotions __lhs, __avoid_promotions __rhs) \ + { \ + return static_cast<_Int>(static_cast<__base>(__lhs.__data) __op static_cast<__base>(__rhs.__data)); \ + } + + _DEFINE_COMPARISON(<) + _DEFINE_COMPARISON(>) + _DEFINE_COMPARISON(==) +#if _CCCL_STD_VER <= 2017 + _DEFINE_COMPARISON(!=) +#endif +#undef _DEFINE_COMPARISON + + _Int __data; +}; + +static_assert(sizeof(__avoid_promotions) == sizeof(uint_least8_t), ""); +static_assert(sizeof(__avoid_promotions) == sizeof(uint_least16_t), ""); +static_assert(sizeof(__avoid_promotions) == sizeof(uint_least32_t), ""); + +template +class __bitset +{ +public: + typedef ptrdiff_t difference_type; + typedef size_t size_type; + typedef __avoid_promotions __storage_type; + +protected: + typedef __bitset __self; + typedef __storage_type* __storage_pointer; + typedef const __storage_type* __const_storage_pointer; + static const unsigned __bits_per_word = static_cast(sizeof(__storage_type) * CHAR_BIT); + + friend class __bit_reference<__bitset>; + friend class __bit_const_reference<__bitset>; + friend class __bit_iterator<__bitset, false>; + friend class __bit_iterator<__bitset, true>; + friend struct __bit_array<__bitset>; + + __storage_type __first_[_N_words]; + + typedef __bit_reference<__bitset> reference; + typedef __bit_const_reference<__bitset> 
const_reference; + typedef __bit_iterator<__bitset, false> iterator; + typedef __bit_iterator<__bitset, true> const_iterator; + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY static constexpr __storage_type + __clip_top_word_to_size(unsigned long long __v) + { + return _Size >= 2 * __bits_per_word + ? static_cast<__storage_type>(__v >> __bits_per_word) + : static_cast<__storage_type>( + (__v >> __bits_per_word) & ((__storage_type(1) << (_Size - __bits_per_word)) - 1)); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr __bitset() noexcept + : __first_{0} + {} + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY explicit constexpr __bitset(unsigned long long __v) noexcept + : __first_{static_cast<__storage_type>(__v), __clip_top_word_to_size(__v)} + {} + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 reference + __make_ref(size_t __pos) noexcept + { + return reference(__first_ + __pos / __bits_per_word, __storage_type(1) << __pos % __bits_per_word); + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference + __make_ref(size_t __pos) const noexcept + { + return const_reference(__first_ + __pos / __bits_per_word, __storage_type(1) << __pos % __bits_per_word); + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 iterator + __make_iter(size_t __pos) noexcept + { + return iterator(__first_ + __pos / __bits_per_word, __pos % __bits_per_word); + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 const_iterator + __make_iter(size_t __pos) const noexcept + { + return const_iterator(__first_ + __pos / __bits_per_word, __pos % __bits_per_word); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void + operator&=(const __bitset& __v) noexcept + { + for (size_type __i = 0; __i < _N_words; ++__i) + { + __first_[__i] &= __v.__first_[__i]; + } + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void + operator|=(const __bitset& __v) noexcept + { + for (size_type __i = 0; __i < _N_words; ++__i) + { + __first_[__i] |= __v.__first_[__i]; + } + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void + operator^=(const __bitset& __v) noexcept + { + for (size_type __i = 0; __i < _N_words; ++__i) + { + __first_[__i] ^= __v.__first_[__i]; + } + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bitset& + operator<<=(size_t __pos) noexcept + { + __pos = _CUDA_VSTD::min(__pos, _Size); + _CUDA_VSTD::copy_backward(__make_iter(0), __make_iter(_Size - __pos), __make_iter(_Size)); + _CUDA_VSTD::fill_n(__make_iter(0), __pos, false); + return *this; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 __bitset& + operator>>=(size_t __pos) noexcept + { + __pos = _CUDA_VSTD::min(__pos, _Size); + _CUDA_VSTD::copy(__make_iter(__pos), __make_iter(_Size), __make_iter(0)); + _CUDA_VSTD::fill_n(__make_iter(_Size - __pos), __pos, false); + return *this; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void flip() noexcept + { + // do middle whole words + size_type __n = _Size; + __storage_pointer __p = __first_; + for (; __n >= __bits_per_word; ++__p, __n -= __bits_per_word) + { + *__p = ~*__p; + } + // do last partial word + if (__n > 0) + { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + __storage_type __b = *__p & __m; + *__p &= ~__m; + 
*__p |= ~__b & __m; + } + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long to_ulong() const + { + return to_ulong(integral_constant()); + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long long to_ullong() const + { + return to_ullong(integral_constant()); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool all() const noexcept + { + // do middle whole words + size_type __n = _Size; + __const_storage_pointer __p = __first_; + for (; __n >= __bits_per_word; ++__p, __n -= __bits_per_word) + { + if (~*__p) + { + return false; + } + } + // do last partial word + if (__n > 0) + { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + if (~*__p & __m) + { + return false; + } + } + return true; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool any() const noexcept + { + // do middle whole words + size_type __n = _Size; + __const_storage_pointer __p = __first_; + for (; __n >= __bits_per_word; ++__p, __n -= __bits_per_word) + { + if (*__p) + { + return true; + } + } + // do last partial word + if (__n > 0) + { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + if (*__p & __m) + { + return true; + } + } + return false; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY size_t __hash_code() const noexcept + { + size_t __h = 0; + for (size_type __i = 0; __i < _N_words; ++__i) + { + __h ^= __first_[__i]; + } + return __h; + } + +private: + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long to_ulong(false_type) const + { + const_iterator __e = __make_iter(_Size); + const_iterator __i = _CUDA_VSTD::find(__make_iter(sizeof(unsigned long) * CHAR_BIT), __e, true); + if (__i != __e) + { + _CUDA_VSTD::__throw_overflow_error("bitset to_ulong overflow error"); + } + + return to_ulong(true_type()); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long to_ulong(true_type) const + { + return to_ulong(true_type(), integral_constant()); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long + to_ulong(true_type, false_type) const + { + return __first_[0].__data; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long + to_ulong(true_type, true_type) const + { + unsigned long __r = __first_[0].__data; + for (size_t __i = 1; __i < sizeof(unsigned long) / sizeof(__storage_type); ++__i) + { + __r |= static_cast(__first_[__i].__data) << (__i * sizeof(__storage_type) * CHAR_BIT); + } + return __r; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long long + to_ullong(false_type) const + { + const_iterator __e = __make_iter(_Size); + const_iterator __i = _CUDA_VSTD::find(__make_iter(sizeof(unsigned long long) * CHAR_BIT), __e, true); + if (__i != __e) + { + _CUDA_VSTD::__throw_overflow_error("bitset to_ullong overflow error"); + } + + return to_ullong(true_type()); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long long + to_ullong(true_type) const + { + return to_ullong(true_type(), integral_constant()); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long long + to_ullong(true_type, false_type) const + { + return __first_[0].__data; + } + + _LIBCUDACXX_HIDE_FROM_ABI 
_LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long long + to_ullong(true_type, true_type) const + { + unsigned long long __r = __first_[0].__data; + for (size_t __i = 1; __i < sizeof(unsigned long long) / sizeof(__storage_type); ++__i) + { + __r |= static_cast(__first_[__i].__data) << (__i * sizeof(__storage_type) * CHAR_BIT); + } + return __r; + } +}; + +template +class __bitset<1, _Size> +{ +public: + typedef ptrdiff_t difference_type; + typedef size_t size_type; + typedef __avoid_promotions<__conditional_t<_Size <= 8, uint8_t, __conditional_t<_Size <= 16, uint16_t, uint32_t>>> + __storage_type; + +protected: + typedef __bitset __self; + typedef __storage_type* __storage_pointer; + typedef const __storage_type* __const_storage_pointer; + static const unsigned __bits_per_word = static_cast(sizeof(__storage_type) * CHAR_BIT); + + friend class __bit_reference<__bitset>; + friend class __bit_const_reference<__bitset>; + friend class __bit_iterator<__bitset, false>; + friend class __bit_iterator<__bitset, true>; + friend struct __bit_array<__bitset>; + + __storage_type __first_; + + typedef __bit_reference<__bitset> reference; + typedef __bit_const_reference<__bitset> const_reference; + typedef __bit_iterator<__bitset, false> iterator; + typedef __bit_iterator<__bitset, true> const_iterator; + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr __bitset() noexcept + : __first_(0) + {} + + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_MSVC(4293) // shift count negative or too big + // MSVC is slightly overeager with diagnosing that here + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY explicit constexpr __bitset(unsigned long long __v) noexcept + : __first_(_Size == __bits_per_word + ? static_cast<__storage_type>(__v) + : static_cast<__storage_type>(__v) & ((__storage_type(1) << _Size) - __storage_type(1))) + {} + _CCCL_DIAG_POP + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 reference + __make_ref(size_t __pos) noexcept + { + return reference(&__first_, __storage_type(1) << __pos); + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference + __make_ref(size_t __pos) const noexcept + { + return const_reference(&__first_, __storage_type(1) << __pos); + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 iterator + __make_iter(size_t __pos) noexcept + { + return iterator(&__first_ + __pos / __bits_per_word, __pos % __bits_per_word); + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 const_iterator + __make_iter(size_t __pos) const noexcept + { + return const_iterator(&__first_ + __pos / __bits_per_word, __pos % __bits_per_word); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void + operator&=(const __bitset& __v) noexcept + { + __first_ &= __v.__first_; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void + operator|=(const __bitset& __v) noexcept + { + __first_ |= __v.__first_; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void + operator^=(const __bitset& __v) noexcept + { + __first_ ^= __v.__first_; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void operator<<=(size_t __rhs) noexcept + { + __first_ <<= __rhs; + __first_ &= ~__storage_type(0) >> (__bits_per_word - _Size); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void operator>>=(size_t 
__rhs) noexcept + { + __first_ >>= __rhs; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void flip() noexcept + { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); + __first_ = ~__first_; + __first_ &= __m; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long to_ulong() const + { +#ifdef _CCCL_COMPILER_MSVC + if (static_cast(__first_.__data) != __first_.__data) + { + _CUDA_VSTD::__throw_overflow_error("bitset to_ulong overflow error"); + } + return static_cast(__first_.__data); +#else // ^^ MSVC ^^ | vv !MSVC vv + return __first_.__data; +#endif // !MSVC + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long long to_ullong() const + { + return __first_.__data; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool all() const noexcept + { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); + return !static_cast(~__first_ & __m); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool any() const noexcept + { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); + return static_cast(__first_ & __m); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY size_t __hash_code() const noexcept + { + return __first_; + } +}; + +template <> +class __bitset<0, 0> +{ +public: + typedef ptrdiff_t difference_type; + typedef size_t size_type; + typedef __avoid_promotions __storage_type; + +protected: + typedef __bitset __self; + typedef __storage_type* __storage_pointer; + typedef const __storage_type* __const_storage_pointer; + static const unsigned __bits_per_word = static_cast(sizeof(__storage_type) * CHAR_BIT); + + friend class __bit_reference<__bitset>; + friend class __bit_const_reference<__bitset>; + friend class __bit_iterator<__bitset, false>; + friend class __bit_iterator<__bitset, true>; + friend struct __bit_array<__bitset>; + + typedef __bit_reference<__bitset> reference; + typedef __bit_const_reference<__bitset> const_reference; + typedef __bit_iterator<__bitset, false> iterator; + typedef __bit_iterator<__bitset, true> const_iterator; + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr __bitset() noexcept {} + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY explicit constexpr __bitset(unsigned long long) noexcept {} + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 reference __make_ref(size_t) noexcept + { + return reference(nullptr, 1); + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference __make_ref(size_t) const noexcept + { + return const_reference(nullptr, 1); + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 iterator __make_iter(size_t) noexcept + { + return iterator(nullptr, 0); + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 const_iterator + __make_iter(size_t) const noexcept + { + return const_iterator(nullptr, 0); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void operator&=(const __bitset&) noexcept + {} + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void operator|=(const __bitset&) noexcept + {} + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void operator^=(const __bitset&) noexcept + {} + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY 
_CCCL_CONSTEXPR_CXX14 void operator<<=(size_t) noexcept {} + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void operator>>=(size_t) noexcept {} + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void flip() noexcept {} + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long to_ulong() const + { + return 0; + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long long to_ullong() const + { + return 0; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool all() const noexcept + { + return true; + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool any() const noexcept + { + return false; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY size_t __hash_code() const noexcept + { + return 0; + } +}; + +template +class _LIBCUDACXX_TEMPLATE_VIS bitset; +template +struct hash>; + +template +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void +__throw_if_out_of_range(size_t __pos, const char* __msg) +{ + if (__pos >= _Size) + { + _CUDA_VSTD::__throw_out_of_range(__msg); + } +} + +template <> +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY void +__throw_if_out_of_range<0>(size_t __pos, const char* __msg) +{ + _CUDA_VSTD::__throw_out_of_range(__msg); +} + +template +class _LIBCUDACXX_TEMPLATE_VIS bitset : private __bitset<_Size == 0 ? 0 : (_Size - 1) / 32 + 1, _Size> +{ +public: + static const unsigned __n_words = _Size == 0 ? 0 : (_Size - 1) / 32 + 1; + typedef __bitset<__n_words, _Size> base; + +public: + typedef typename base::reference reference; + typedef typename base::const_reference const_reference; + + // 23.3.5.1 constructors: + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr bitset() noexcept {} + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr bitset(unsigned long long __v) noexcept + : base(__v) + {} + template ::value>> + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 explicit bitset( + const _CharT* __str, size_t __n = static_cast(-1), _CharT __zero = _CharT('0'), _CharT __one = _CharT('1')) + { + size_t __rlen = _CUDA_VSTD::min(__n, char_traits<_CharT>::length(__str)); +#if defined(_LIBCUDACXX_HAS_STRING_VIEW) + __init_from_string_view(basic_string_view<_CharT>(__str, __rlen), __zero, __one); +#else + __init_from_cstr(__str, __rlen, __zero, __one); +#endif + } +#if defined(_LIBCUDACXX_HAS_STRING_VIEW) + template + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr explicit bitset( + basic_string_view<_CharT, _Traits> __str, + typename basic_string_view<_CharT, _Traits>::size_type __pos = 0, + typename basic_string_view<_CharT, _Traits>::size_type __n = basic_string_view<_CharT, _Traits>::npos, + _CharT __zero = _CharT('0'), + _CharT __one = _CharT('1')) + { + if (__pos > __str.size()) + { + _CUDA_VSTD::__throw_out_of_range("bitset string pos out of range"); + } + + size_t __rlen = _CUDA_VSTD::min(__n, __str.size() - __pos); + __init_from_string_view(basic_string_view<_CharT, _Traits>(__str.data() + __pos, __rlen), __zero, __one); + } +#endif // defined(_LIBCUDACXX_HAS_STRING_VIEW) +#if defined(_LIBCUDACXX_HAS_STRING) + template + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 explicit bitset( + const basic_string<_CharT, _Traits, _Allocator>& __str, + typename basic_string<_CharT, _Traits, 
_Allocator>::size_type __pos = 0, + typename basic_string<_CharT, _Traits, _Allocator>::size_type __n = basic_string<_CharT, _Traits, _Allocator>::npos, + _CharT __zero = _CharT('0'), + _CharT __one = _CharT('1')) + { + if (__pos > __str.size()) + { + _CUDA_VSTD::__throw_out_of_range("bitset string pos out of range"); + } + + size_t __rlen = _CUDA_VSTD::min(__n, __str.size() - __pos); + __init_from_string_view(basic_string_view<_CharT, _Traits>(__str.data() + __pos, __rlen), __zero, __one); + } +#endif // defined(_LIBCUDACXX_HAS_STRING) + + // 23.3.5.2 bitset operations: + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset& + operator&=(const bitset& __rhs) noexcept + { + base::operator&=(__rhs); + return *this; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset& + operator|=(const bitset& __rhs) noexcept + { + base::operator|=(__rhs); + return *this; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset& + operator^=(const bitset& __rhs) noexcept + { + base::operator^=(__rhs); + return *this; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset& + operator<<=(size_t __rhs) noexcept + { + base::operator<<=(__rhs); + return *this; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset& + operator>>=(size_t __rhs) noexcept + { + base::operator>>=(__rhs); + return *this; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset& set() noexcept + { + _CUDA_VSTD::fill_n(base::__make_iter(0), _Size, true); + return *this; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset& + set(size_t __pos, bool __val = true) + { + _CUDA_VSTD::__throw_if_out_of_range<_Size>(__pos, "bitset set argument out of range"); + + (*this)[__pos] = __val; + return *this; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset& reset() noexcept + { + _CUDA_VSTD::fill_n(base::__make_iter(0), _Size, false); + return *this; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset& reset(size_t __pos) + { + _CUDA_VSTD::__throw_if_out_of_range<_Size>(__pos, "bitset reset argument out of range"); + + (*this)[__pos] = false; + return *this; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset operator~() const noexcept + { + bitset __x(*this); + __x.flip(); + return __x; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset& flip() noexcept + { + base::flip(); + return *this; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset& flip(size_t __pos) + { + _CUDA_VSTD::__throw_if_out_of_range<_Size>(__pos, "bitset flip argument out of range"); + + reference __r = base::__make_ref(__pos); + __r = ~__r; + return *this; + } + + // element access: +#ifdef _LIBCUDACXX_ABI_BITSET_VECTOR_BOOL_CONST_SUBSCRIPT_RETURN_BOOL + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr bool operator[](size_t __p) const + { + return base::__make_ref(__p); + } +#else + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference operator[](size_t __p) const + { + return base::__make_ref(__p); + } +#endif + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 reference operator[](size_t __p) + { + return base::__make_ref(__p); + } + 
_LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long to_ulong() const + { + return base::to_ulong(); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 unsigned long long to_ullong() const + { + return base::to_ullong(); + } + +#if defined(_LIBCUDACXX_HAS_STRING) + template + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY + _CCCL_CONSTEXPR_CXX14 basic_string<_CharT, _Traits, _Allocator> + to_string(_CharT __zero = _CharT('0'), _CharT __one = _CharT('1')) const + { + basic_string<_CharT, _Traits, _Allocator> __r(_Size, __zero); + for (size_t __i = 0; __i != _Size; ++__i) + { + if ((*this)[__i]) + { + __r[_Size - 1 - __i] = __one; + } + } + return __r; + } + + template + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY + _CCCL_CONSTEXPR_CXX14 basic_string<_CharT, _Traits, allocator<_CharT>> + to_string(_CharT __zero = _CharT('0'), _CharT __one = _CharT('1')) const + { + return to_string<_CharT, _Traits, allocator<_CharT>>(__zero, __one); + } + + template + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY + _CCCL_CONSTEXPR_CXX14 basic_string<_CharT, char_traits<_CharT>, allocator<_CharT>> + to_string(_CharT __zero = _CharT('0'), _CharT __one = _CharT('1')) const + { + return to_string<_CharT, char_traits<_CharT>, allocator<_CharT>>(__zero, __one); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY + _CCCL_CONSTEXPR_CXX14 basic_string, allocator> + to_string(char __zero = '0', char __one = '1') const + { + return to_string, allocator>(__zero, __one); + } +#endif // defined(_LIBCUDACXX_HAS_STRING) + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 size_t count() const noexcept + { + return static_cast(_CUDA_VSTD::count(base::__make_iter(0), base::__make_iter(_Size), true)); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr size_t size() const noexcept + { + return _Size; + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool + operator==(const bitset& __rhs) const noexcept + { + return _CUDA_VSTD::equal(base::__make_iter(0), base::__make_iter(_Size), __rhs.__make_iter(0)); + } + +#if _CCCL_STD_VER <= 2017 + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool + operator!=(const bitset& __rhs) const noexcept + { + return !(*this == __rhs); + } +#endif // C++ <= 17 + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool test(size_t __pos) const + { + _CUDA_VSTD::__throw_if_out_of_range<_Size>(__pos, "bitset test argument out of range"); + + return (*this)[__pos]; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool all() const noexcept + { + return base::all(); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool any() const noexcept + { + return base::any(); + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool none() const noexcept + { + return !any(); + } + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset + operator<<(size_t __pos) const noexcept + { + bitset __r = *this; + __r <<= __pos; + return __r; + } + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset + operator>>(size_t __pos) const noexcept + { + bitset __r = *this; + __r >>= __pos; + return __r; + } + +private: +#if defined(_LIBCUDACXX_HAS_STRING_VIEW) + template + _LIBCUDACXX_HIDE_FROM_ABI 
_LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void + __init_from_string_view(basic_string_view<_CharT, _Traits> __str, _CharT __zero, _CharT __one) + { + for (size_t __i = 0; __i < __str.size(); ++__i) + { + if (!_Traits::eq(__str[__i], __zero) && !_Traits::eq(__str[__i], __one)) + { + _CUDA_VSTD::__throw_invalid_argument("bitset string ctor has invalid argument"); + } + } + + size_t __mp = _CUDA_VSTD::min(__str.size(), _Size); + size_t __i = 0; + for (; __i < __mp; ++__i) + { + _CharT __c = __str[__mp - 1 - __i]; + (*this)[__i] = _Traits::eq(__c, __one); + } + _CUDA_VSTD::fill(base::__make_iter(__i), base::__make_iter(_Size), false); + } +#else // ^^ _LIBCUDACXX_HAS_STRING_VIEW ^^ | vv !_LIBCUDACXX_HAS_STRING_VIEW vv + template > + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 void + __init_from_cstr(const _CharT* __str, size_t __size, _CharT __zero, _CharT __one) + { + for (size_t __i = 0; __i < __size; ++__i) + { + if (!_Traits::eq(__str[__i], __zero) && !_Traits::eq(__str[__i], __one)) + { + _CUDA_VSTD::__throw_invalid_argument("bitset string ctor has invalid argument"); + } + } + + size_t __mp = _CUDA_VSTD::min(__size, _Size); + size_t __i = 0; + for (; __i < __mp; ++__i) + { + _CharT __c = __str[__mp - 1 - __i]; + (*this)[__i] = _Traits::eq(__c, __one); + } + _CUDA_VSTD::fill(base::__make_iter(__i), base::__make_iter(_Size), false); + } +#endif // !_LIBCUDACXX_HAS_STRING_VIEW + + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY size_t __hash_code() const noexcept + { + return base::__hash_code(); + } + + friend struct hash; +}; + +template +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset<_Size> +operator&(const bitset<_Size>& __x, const bitset<_Size>& __y) noexcept +{ + bitset<_Size> __r = __x; + __r &= __y; + return __r; +} + +template +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset<_Size> +operator|(const bitset<_Size>& __x, const bitset<_Size>& __y) noexcept +{ + bitset<_Size> __r = __x; + __r |= __y; + return __r; +} + +template +inline _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bitset<_Size> +operator^(const bitset<_Size>& __x, const bitset<_Size>& __y) noexcept +{ + bitset<_Size> __r = __x; + __r ^= __y; + return __r; +} + +template +struct _LIBCUDACXX_TEMPLATE_VIS hash> : public __unary_function, size_t> +{ + _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY size_t operator()(const bitset<_Size>& __bs) const noexcept + { + return __bs.__hash_code(); + } +}; + +template +_LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY basic_istream<_CharT, _Traits>& +operator>>(basic_istream<_CharT, _Traits>& __is, bitset<_Size>& __x); + +template +_LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY basic_ostream<_CharT, _Traits>& +operator<<(basic_ostream<_CharT, _Traits>& __os, const bitset<_Size>& __x); + +_LIBCUDACXX_END_NAMESPACE_STD + +_CCCL_POP_MACROS + +#endif // _CUDA_STD_BITSET diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__string b/libcudacxx/include/cuda/std/detail/libcxx/include/__string new file mode 100644 index 0000000000..1f0517b99d --- /dev/null +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__string @@ -0,0 +1,1246 @@ +// -*- C++ -*- +//===-------------------------- __string ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___STRING +#define _LIBCUDACXX___STRING + +/* + string synopsis + +namespace std +{ + +template +struct char_traits +{ + typedef charT char_type; + typedef ... int_type; + typedef streamoff off_type; + typedef streampos pos_type; + typedef mbstate_t state_type; + + static constexpr void assign(char_type& c1, const char_type& c2) noexcept; + static constexpr bool eq(char_type c1, char_type c2) noexcept; + static constexpr bool lt(char_type c1, char_type c2) noexcept; + + static constexpr int compare(const char_type* s1, const char_type* s2, size_t n); + static constexpr size_t length(const char_type* s); + static constexpr const char_type* + find(const char_type* s, size_t n, const char_type& a); + static char_type* move(char_type* s1, const char_type* s2, size_t n); + static char_type* copy(char_type* s1, const char_type* s2, size_t n); + static char_type* assign(char_type* s, size_t n, char_type a); + + static constexpr int_type not_eof(int_type c) noexcept; + static constexpr char_type to_char_type(int_type c) noexcept; + static constexpr int_type to_int_type(char_type c) noexcept; + static constexpr bool eq_int_type(int_type c1, int_type c2) noexcept; + static constexpr int_type eof() noexcept; +}; + +template <> struct char_traits; +template <> struct char_traits; +template <> struct char_traits; // c++20 + +} // std + +*/ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include // all public C++ headers provide the assertion handler +#include +#include + +_CCCL_PUSH_MACROS + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// char_traits + +template +struct _LIBCUDACXX_TEMPLATE_VIS char_traits +{ + typedef _CharT char_type; + typedef int int_type; + typedef streamoff off_type; +#ifndef _LIBCUDACXX_HAS_NO_WCHAR_H + typedef streampos pos_type; + typedef mbstate_t state_type; +#endif // !_LIBCUDACXX_HAS_NO_WCHAR_H + + _LIBCUDACXX_INLINE_VISIBILITY static inline void _CCCL_CONSTEXPR_CXX14 + assign(char_type& __c1, const char_type& __c2) noexcept + { + __c1 = __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool eq(char_type __c1, char_type __c2) noexcept + { + return __c1 == __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool lt(char_type __c1, char_type __c2) noexcept + { + return __c1 < __c2; + } + + _LIBCUDACXX_INLINE_VISIBILITY static _CCCL_CONSTEXPR_CXX14 int + compare(const char_type* __s1, const char_type* __s2, size_t __n); + _LIBCUDACXX_INLINE_VISIBILITY static _CCCL_CONSTEXPR_CXX14 size_t length(const char_type* __s); + _LIBCUDACXX_INLINE_VISIBILITY static _CCCL_CONSTEXPR_CXX14 const char_type* + find(const char_type* __s, size_t __n, const char_type& __a); + _LIBCUDACXX_INLINE_VISIBILITY static char_type* move(char_type* __s1, const char_type* __s2, size_t __n); + _LIBCUDACXX_INLINE_VISIBILITY static char_type* copy(char_type* __s1, const char_type* __s2, size_t __n); + _LIBCUDACXX_INLINE_VISIBILITY static char_type* assign(char_type* __s, size_t __n, char_type __a); + +#ifndef __cuda_std__ + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type not_eof(int_type __c) noexcept + { + return 
eq_int_type(__c, eof()) ? ~eof() : __c; + } +#endif // !__cuda_std__ + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr char_type to_char_type(int_type __c) noexcept + { + return char_type(__c); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type to_int_type(char_type __c) noexcept + { + return int_type(__c); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool eq_int_type(int_type __c1, int_type __c2) noexcept + { + return __c1 == __c2; + } +#ifndef __cuda_std__ + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type eof() noexcept + { + return int_type(EOF); + } +#endif // !__cuda_std__ +}; + +template +_CCCL_CONSTEXPR_CXX14 int char_traits<_CharT>::compare(const char_type* __s1, const char_type* __s2, size_t __n) +{ + for (; __n; --__n, ++__s1, ++__s2) + { + if (lt(*__s1, *__s2)) + { + return -1; + } + if (lt(*__s2, *__s1)) + { + return 1; + } + } + return 0; +} + +template +inline _CCCL_CONSTEXPR_CXX14 size_t char_traits<_CharT>::length(const char_type* __s) +{ + size_t __len = 0; + for (; !eq(*__s, char_type(0)); ++__s) + { + ++__len; + } + return __len; +} + +template +inline _CCCL_CONSTEXPR_CXX14 const _CharT* +char_traits<_CharT>::find(const char_type* __s, size_t __n, const char_type& __a) +{ + for (; __n; --__n) + { + if (eq(*__s, __a)) + { + return __s; + } + ++__s; + } + return 0; +} + +template +inline _LIBCUDACXX_INLINE_VISIBILITY _CharT* +char_traits<_CharT>::move(char_type* __s1, const char_type* __s2, size_t __n) +{ + char_type* __r = __s1; + if (__s1 < __s2) + { + for (; __n; --__n, ++__s1, ++__s2) + { + assign(*__s1, *__s2); + } + } + else if (__s2 < __s1) + { + __s1 += __n; + __s2 += __n; + for (; __n; --__n) + { + assign(*--__s1, *--__s2); + } + } + return __r; +} + +template +inline _LIBCUDACXX_INLINE_VISIBILITY _CharT* +char_traits<_CharT>::copy(char_type* __s1, const char_type* __s2, size_t __n) +{ + _LIBCUDACXX_ASSERT(__s2 < __s1 || __s2 >= __s1 + __n, "char_traits::copy overlapped range"); + char_type* __r = __s1; + for (; __n; --__n, ++__s1, ++__s2) + { + assign(*__s1, *__s2); + } + return __r; +} + +template +inline _LIBCUDACXX_INLINE_VISIBILITY _CharT* char_traits<_CharT>::assign(char_type* __s, size_t __n, char_type __a) +{ + char_type* __r = __s; + for (; __n; --__n, ++__s) + { + assign(*__s, __a); + } + return __r; +} + +// char_traits + +// GCC's builtin_strlen isn't reliable at constexpr time +// MSVC does not expose builtin_strlen before C++17 +#if defined(_CCCL_COMPILER_GCC) || (defined(_CCCL_COMPILER_MSVC) && _CCCL_STD_VER < 2017) +# define _CCCL_HAS_NO_BUILTIN_STRLEN +#endif + +template <> +struct _LIBCUDACXX_TEMPLATE_VIS char_traits +{ + typedef char char_type; + typedef int int_type; + typedef streamoff off_type; +#ifndef _LIBCUDACXX_HAS_NO_WCHAR_H + typedef streampos pos_type; + typedef mbstate_t state_type; +#endif // !_LIBCUDACXX_HAS_NO_WCHAR_H + + _LIBCUDACXX_INLINE_VISIBILITY static inline _CCCL_CONSTEXPR_CXX14 void + assign(char_type& __c1, const char_type& __c2) noexcept + { + __c1 = __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool eq(char_type __c1, char_type __c2) noexcept + { + return __c1 == __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool lt(char_type __c1, char_type __c2) noexcept + { + return (unsigned char) __c1 < (unsigned char) __c2; + } + + _LIBCUDACXX_INLINE_VISIBILITY static _CCCL_CONSTEXPR_CXX14 int + compare(const char_type* __s1, const char_type* __s2, size_t __n) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY static inline size_t 
_CCCL_CONSTEXPR_CXX14 length(const char_type* __s) noexcept + { +#ifdef _CCCL_HAS_NO_BUILTIN_STRLEN +# ifdef _LIBCUDACXX_IS_CONSTANT_EVALUATED // is_constant_evaluated only exists since GCC 9 + if (__libcpp_is_constant_evaluated()) +# endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) + { + size_t __len = 0; + for (; !eq(*__s, char(0)); ++__s) + { + ++__len; + } + return __len; + } +#endif // defined(_CCCL_HAS_NO_BUILTIN_STRLEN) +#if !defined(_CCCL_HAS_NO_BUILTIN_STRLEN) || defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) + NV_IF_ELSE_TARGET(NV_IS_DEVICE, + (size_t __len = 0; for (; !eq(*__s, char(0)); ++__s)++ __len; return __len;), + (return __builtin_strlen(__s);)) +#endif // !defined(_CCCL_HAS_NO_BUILTIN_STRLEN) || defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) + } + _LIBCUDACXX_INLINE_VISIBILITY static _CCCL_CONSTEXPR_CXX14 const char_type* + find(const char_type* __s, size_t __n, const char_type& __a) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY static inline char_type* + move(char_type* __s1, const char_type* __s2, size_t __n) noexcept + { + return __n == 0 ? __s1 : (char_type*) __copy<_ClassicAlgPolicy>(__s2, __s2 + __n, __s1).first - __n; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline char_type* + copy(char_type* __s1, const char_type* __s2, size_t __n) noexcept + { + _LIBCUDACXX_ASSERT(__s2 < __s1 || __s2 >= __s1 + __n, "char_traits::copy overlapped range"); + return __n == 0 ? __s1 : (char_type*) memcpy(__s1, __s2, __n); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline char_type* assign(char_type* __s, size_t __n, char_type __a) noexcept + { + return __n == 0 ? __s : (char_type*) memset(__s, to_int_type(__a), __n); + } + +#ifndef __cuda_std__ + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type not_eof(int_type __c) noexcept + { + return eq_int_type(__c, eof()) ? 
~eof() : __c; + } +#endif // !__cuda_std__ + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr char_type to_char_type(int_type __c) noexcept + { + return char_type(__c); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type to_int_type(char_type __c) noexcept + { + return int_type((unsigned char) __c); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool eq_int_type(int_type __c1, int_type __c2) noexcept + { + return __c1 == __c2; + } +#ifndef __cuda_std__ + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type eof() noexcept + { + return int_type(EOF); + } +#endif // !__cuda_std__ +}; + +inline _CCCL_CONSTEXPR_CXX14 int +char_traits::compare(const char_type* __s1, const char_type* __s2, size_t __n) noexcept +{ + if (__n == 0) + { + return 0; + } +#if __has_feature(cxx_constexpr_string_builtins) + return __builtin_memcmp(__s1, __s2, __n); +#else + for (; __n; --__n, ++__s1, ++__s2) + { + if (lt(*__s1, *__s2)) + { + return -1; + } + if (lt(*__s2, *__s1)) + { + return 1; + } + } + return 0; +#endif // !has_feature(constexpr_string_builtins) +} + +inline _CCCL_CONSTEXPR_CXX14 const char* +char_traits::find(const char_type* __s, size_t __n, const char_type& __a) noexcept +{ + if (__n == 0) + { + return nullptr; + } +#if __has_feature(cxx_constexpr_string_builtins) + return __builtin_char_memchr(__s, to_int_type(__a), __n); +#else + for (; __n; --__n) + { + if (eq(*__s, __a)) + { + return __s; + } + ++__s; + } + return nullptr; +#endif // !has_feature(constexpr_string_builtins) +} + +// char_traits + +#ifndef _LIBCUDACXX_HAS_NO_WCHAR_H +template <> +struct _LIBCUDACXX_TEMPLATE_VIS char_traits +{ + typedef wchar_t char_type; + typedef wint_t int_type; + typedef streamoff off_type; + typedef streampos pos_type; + typedef mbstate_t state_type; + + _LIBCUDACXX_INLINE_VISIBILITY static inline _CCCL_CONSTEXPR_CXX14 void + assign(char_type& __c1, const char_type& __c2) noexcept + { + __c1 = __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool eq(char_type __c1, char_type __c2) noexcept + { + return __c1 == __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool lt(char_type __c1, char_type __c2) noexcept + { + return __c1 < __c2; + } + + static _CCCL_CONSTEXPR_CXX14 int compare(const char_type* __s1, const char_type* __s2, size_t __n) noexcept; + static _CCCL_CONSTEXPR_CXX14 size_t length(const char_type* __s) noexcept; + static _CCCL_CONSTEXPR_CXX14 const char_type* find(const char_type* __s, size_t __n, const char_type& __a) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY static inline char_type* + move(char_type* __s1, const char_type* __s2, size_t __n) noexcept + { + return __n == 0 ? __s1 : (char_type*) wmemmove(__s1, __s2, __n); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline char_type* + copy(char_type* __s1, const char_type* __s2, size_t __n) noexcept + { + _LIBCUDACXX_ASSERT(__s2 < __s1 || __s2 >= __s1 + __n, "char_traits::copy overlapped range"); + return __n == 0 ? __s1 : (char_type*) wmemcpy(__s1, __s2, __n); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline char_type* assign(char_type* __s, size_t __n, char_type __a) noexcept + { + return __n == 0 ? __s : (char_type*) wmemset(__s, __a, __n); + } + +# ifndef __cuda_std__ + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type not_eof(int_type __c) noexcept + { + return eq_int_type(__c, eof()) ? 
~eof() : __c; + } +# endif // !__cuda_std__ + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr char_type to_char_type(int_type __c) noexcept + { + return char_type(__c); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type to_int_type(char_type __c) noexcept + { + return int_type(__c); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool eq_int_type(int_type __c1, int_type __c2) noexcept + { + return __c1 == __c2; + } +# ifndef __cuda_std__ + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type eof() noexcept + { + return int_type(WEOF); + } +# endif // !__cuda_std__ +}; + +inline _CCCL_CONSTEXPR_CXX14 int +char_traits::compare(const char_type* __s1, const char_type* __s2, size_t __n) noexcept +{ + if (__n == 0) + { + return 0; + } +# if __has_feature(cxx_constexpr_string_builtins) + return __builtin_wmemcmp(__s1, __s2, __n); +# elif _CCCL_STD_VER <= 2014 + return wmemcmp(__s1, __s2, __n); +# else + for (; __n; --__n, ++__s1, ++__s2) + { + if (lt(*__s1, *__s2)) + { + return -1; + } + if (lt(*__s2, *__s1)) + { + return 1; + } + } + return 0; +# endif +} +#endif // !_LIBCUDACXX_HAS_NO_WCHAR_H + +template +_LIBCUDACXX_INLINE_VISIBILITY inline constexpr size_t +__char_traits_length_checked(const typename _Traits::char_type* __s) noexcept +{ +#if _LIBCUDACXX_DEBUG_LEVEL >= 1 + return __s + ? _Traits::length(__s) + : (_CUDA_VSTD::__libcpp_debug_function(_CUDA_VSTD::__libcpp_debug_info( + __FILE__, __LINE__, "p == nullptr", "null pointer pass to non-null argument of char_traits<...>::length")), + 0); +#else + return _Traits::length(__s); +#endif +} + +#ifndef _LIBCUDACXX_HAS_NO_WCHAR_H +inline _CCCL_CONSTEXPR_CXX14 size_t char_traits::length(const char_type* __s) noexcept +{ +# if __has_feature(cxx_constexpr_string_builtins) + return __builtin_wcslen(__s); +# elif _CCCL_STD_VER <= 2014 + return wcslen(__s); +# else + size_t __len = 0; + for (; !eq(*__s, char_type(0)); ++__s) + { + ++__len; + } + return __len; +# endif +} + +inline _CCCL_CONSTEXPR_CXX14 const wchar_t* +char_traits::find(const char_type* __s, size_t __n, const char_type& __a) noexcept +{ + if (__n == 0) + { + return nullptr; + } +# if __has_feature(cxx_constexpr_string_builtins) + return __builtin_wmemchr(__s, __a, __n); +# else + for (; __n; --__n) + { + if (eq(*__s, __a)) + { + return __s; + } + ++__s; + } + return nullptr; +# endif +} +#endif // !_LIBCUDACXX_HAS_NO_WCHAR_H + +#ifndef _LIBCUDACXX_NO_HAS_CHAR8_T + +template <> +struct _LIBCUDACXX_TEMPLATE_VIS char_traits +{ + typedef char8_t char_type; + typedef unsigned int int_type; + typedef streamoff off_type; +# ifndef _LIBCUDACXX_HAS_NO_WCHAR_H + typedef u8streampos pos_type; + typedef mbstate_t state_type; +# endif // !_LIBCUDACXX_HAS_NO_WCHAR_H + + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr void assign(char_type& __c1, const char_type& __c2) noexcept + { + __c1 = __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool eq(char_type __c1, char_type __c2) noexcept + { + return __c1 == __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool lt(char_type __c1, char_type __c2) noexcept + { + return __c1 < __c2; + } + + _LIBCUDACXX_INLINE_VISIBILITY static constexpr int + compare(const char_type* __s1, const char_type* __s2, size_t __n) noexcept; + + _LIBCUDACXX_INLINE_VISIBILITY static constexpr size_t length(const char_type* __s) noexcept; + + _LIBCUDACXX_INLINE_VISIBILITY static constexpr const char_type* + find(const char_type* __s, size_t __n, const char_type& __a) noexcept; + + 
_LIBCUDACXX_INLINE_VISIBILITY static char_type* move(char_type* __s1, const char_type* __s2, size_t __n) noexcept + { + return __n == 0 ? __s1 : (char_type*) __copy<_ClassicAlgPolicy>(__s2, __s2 + __n, __s1).first - __n; + } + + _LIBCUDACXX_INLINE_VISIBILITY static char_type* copy(char_type* __s1, const char_type* __s2, size_t __n) noexcept + { + _LIBCUDACXX_ASSERT(__s2 < __s1 || __s2 >= __s1 + __n, "char_traits::copy overlapped range"); + return __n == 0 ? __s1 : (char_type*) memcpy(__s1, __s2, __n); + } + + _LIBCUDACXX_INLINE_VISIBILITY static char_type* assign(char_type* __s, size_t __n, char_type __a) noexcept + { + return __n == 0 ? __s : (char_type*) memset(__s, to_int_type(__a), __n); + } + +# ifndef __cuda_std__ + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type not_eof(int_type __c) noexcept + { + return eq_int_type(__c, eof()) ? ~eof() : __c; + } +# endif // !__cuda_std__ + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr char_type to_char_type(int_type __c) noexcept + { + return char_type(__c); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type to_int_type(char_type __c) noexcept + { + return int_type(__c); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool eq_int_type(int_type __c1, int_type __c2) noexcept + { + return __c1 == __c2; + } +# ifndef __cuda_std__ + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type eof() noexcept + { + return int_type(EOF); + } +# endif // !__cuda_std__ +}; + +// TODO use '__builtin_strlen' if it ever supports char8_t ?? +inline constexpr size_t char_traits::length(const char_type* __s) noexcept +{ + size_t __len = 0; + for (; !eq(*__s, char_type(0)); ++__s) + { + ++__len; + } + return __len; +} + +inline constexpr int char_traits::compare(const char_type* __s1, const char_type* __s2, size_t __n) noexcept +{ +# if __has_feature(cxx_constexpr_string_builtins) + return __builtin_memcmp(__s1, __s2, __n); +# else + for (; __n; --__n, ++__s1, ++__s2) + { + if (lt(*__s1, *__s2)) + { + return -1; + } + if (lt(*__s2, *__s1)) + { + return 1; + } + } + return 0; +# endif +} + +// TODO use '__builtin_char_memchr' if it ever supports char8_t ?? 
+inline constexpr const char8_t* +char_traits::find(const char_type* __s, size_t __n, const char_type& __a) noexcept +{ + for (; __n; --__n) + { + if (eq(*__s, __a)) + { + return __s; + } + ++__s; + } + return 0; +} + +#endif // #_LIBCUDACXX_NO_HAS_CHAR8_T + +#ifndef _LIBCUDACXX_HAS_NO_UNICODE_CHARS + +template <> +struct _LIBCUDACXX_TEMPLATE_VIS char_traits +{ + typedef char16_t char_type; + typedef uint_least16_t int_type; + typedef streamoff off_type; +# ifndef _LIBCUDACXX_HAS_NO_WCHAR_H + typedef u16streampos pos_type; + typedef mbstate_t state_type; +# endif // !_LIBCUDACXX_HAS_NO_WCHAR_H + + _LIBCUDACXX_INLINE_VISIBILITY static inline _CCCL_CONSTEXPR_CXX14 void + assign(char_type& __c1, const char_type& __c2) noexcept + { + __c1 = __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool eq(char_type __c1, char_type __c2) noexcept + { + return __c1 == __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool lt(char_type __c1, char_type __c2) noexcept + { + return __c1 < __c2; + } + + _LIBCUDACXX_INLINE_VISIBILITY static _CCCL_CONSTEXPR_CXX14 int + compare(const char_type* __s1, const char_type* __s2, size_t __n) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY static _CCCL_CONSTEXPR_CXX14 size_t length(const char_type* __s) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY static _CCCL_CONSTEXPR_CXX14 const char_type* + find(const char_type* __s, size_t __n, const char_type& __a) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY static char_type* move(char_type* __s1, const char_type* __s2, size_t __n) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY static char_type* copy(char_type* __s1, const char_type* __s2, size_t __n) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY static char_type* assign(char_type* __s, size_t __n, char_type __a) noexcept; + + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type not_eof(int_type __c) noexcept + { + return eq_int_type(__c, eof()) ? 
~eof() : __c; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr char_type to_char_type(int_type __c) noexcept + { + return char_type(__c); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type to_int_type(char_type __c) noexcept + { + return int_type(__c); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool eq_int_type(int_type __c1, int_type __c2) noexcept + { + return __c1 == __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type eof() noexcept + { + return int_type(0xFFFF); + } +}; + +inline _CCCL_CONSTEXPR_CXX14 int +char_traits::compare(const char_type* __s1, const char_type* __s2, size_t __n) noexcept +{ + for (; __n; --__n, ++__s1, ++__s2) + { + if (lt(*__s1, *__s2)) + { + return -1; + } + if (lt(*__s2, *__s1)) + { + return 1; + } + } + return 0; +} + +inline _CCCL_CONSTEXPR_CXX14 size_t char_traits::length(const char_type* __s) noexcept +{ + size_t __len = 0; + for (; !eq(*__s, char_type(0)); ++__s) + { + ++__len; + } + return __len; +} + +inline _CCCL_CONSTEXPR_CXX14 const char16_t* +char_traits::find(const char_type* __s, size_t __n, const char_type& __a) noexcept +{ + for (; __n; --__n) + { + if (eq(*__s, __a)) + { + return __s; + } + ++__s; + } + return 0; +} + +inline _LIBCUDACXX_INLINE_VISIBILITY char16_t* +char_traits::move(char_type* __s1, const char_type* __s2, size_t __n) noexcept +{ + char_type* __r = __s1; + if (__s1 < __s2) + { + for (; __n; --__n, ++__s1, ++__s2) + { + assign(*__s1, *__s2); + } + } + else if (__s2 < __s1) + { + __s1 += __n; + __s2 += __n; + for (; __n; --__n) + { + assign(*--__s1, *--__s2); + } + } + return __r; +} + +inline _LIBCUDACXX_INLINE_VISIBILITY char16_t* +char_traits::copy(char_type* __s1, const char_type* __s2, size_t __n) noexcept +{ + _LIBCUDACXX_ASSERT(__s2 < __s1 || __s2 >= __s1 + __n, "char_traits::copy overlapped range"); + char_type* __r = __s1; + for (; __n; --__n, ++__s1, ++__s2) + { + assign(*__s1, *__s2); + } + return __r; +} + +inline _LIBCUDACXX_INLINE_VISIBILITY char16_t* +char_traits::assign(char_type* __s, size_t __n, char_type __a) noexcept +{ + char_type* __r = __s; + for (; __n; --__n, ++__s) + { + assign(*__s, __a); + } + return __r; +} + +template <> +struct _LIBCUDACXX_TEMPLATE_VIS char_traits +{ + typedef char32_t char_type; + typedef uint_least32_t int_type; + typedef streamoff off_type; +# ifndef _LIBCUDACXX_HAS_NO_WCHAR_H + typedef u32streampos pos_type; + typedef mbstate_t state_type; +# endif // !_LIBCUDACXX_HAS_NO_WCHAR_H + + _LIBCUDACXX_INLINE_VISIBILITY static inline _CCCL_CONSTEXPR_CXX14 void + assign(char_type& __c1, const char_type& __c2) noexcept + { + __c1 = __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool eq(char_type __c1, char_type __c2) noexcept + { + return __c1 == __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool lt(char_type __c1, char_type __c2) noexcept + { + return __c1 < __c2; + } + + _LIBCUDACXX_INLINE_VISIBILITY static _CCCL_CONSTEXPR_CXX14 int + compare(const char_type* __s1, const char_type* __s2, size_t __n) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY static _CCCL_CONSTEXPR_CXX14 size_t length(const char_type* __s) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY static _CCCL_CONSTEXPR_CXX14 const char_type* + find(const char_type* __s, size_t __n, const char_type& __a) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY static char_type* move(char_type* __s1, const char_type* __s2, size_t __n) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY static char_type* copy(char_type* __s1, const char_type* __s2, size_t __n) 
noexcept; + _LIBCUDACXX_INLINE_VISIBILITY static char_type* assign(char_type* __s, size_t __n, char_type __a) noexcept; + + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type not_eof(int_type __c) noexcept + { + return eq_int_type(__c, eof()) ? ~eof() : __c; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr char_type to_char_type(int_type __c) noexcept + { + return char_type(__c); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type to_int_type(char_type __c) noexcept + { + return int_type(__c); + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr bool eq_int_type(int_type __c1, int_type __c2) noexcept + { + return __c1 == __c2; + } + _LIBCUDACXX_INLINE_VISIBILITY static inline constexpr int_type eof() noexcept + { + return int_type(0xFFFFFFFF); + } +}; + +inline _CCCL_CONSTEXPR_CXX14 int +char_traits::compare(const char_type* __s1, const char_type* __s2, size_t __n) noexcept +{ + for (; __n; --__n, ++__s1, ++__s2) + { + if (lt(*__s1, *__s2)) + { + return -1; + } + if (lt(*__s2, *__s1)) + { + return 1; + } + } + return 0; +} + +inline _CCCL_CONSTEXPR_CXX14 size_t char_traits::length(const char_type* __s) noexcept +{ + size_t __len = 0; + for (; !eq(*__s, char_type(0)); ++__s) + { + ++__len; + } + return __len; +} + +inline _CCCL_CONSTEXPR_CXX14 const char32_t* +char_traits::find(const char_type* __s, size_t __n, const char_type& __a) noexcept +{ + for (; __n; --__n) + { + if (eq(*__s, __a)) + { + return __s; + } + ++__s; + } + return 0; +} + +inline _LIBCUDACXX_INLINE_VISIBILITY char32_t* +char_traits::move(char_type* __s1, const char_type* __s2, size_t __n) noexcept +{ + char_type* __r = __s1; + if (__s1 < __s2) + { + for (; __n; --__n, ++__s1, ++__s2) + { + assign(*__s1, *__s2); + } + } + else if (__s2 < __s1) + { + __s1 += __n; + __s2 += __n; + for (; __n; --__n) + { + assign(*--__s1, *--__s2); + } + } + return __r; +} + +inline _LIBCUDACXX_INLINE_VISIBILITY char32_t* +char_traits::copy(char_type* __s1, const char_type* __s2, size_t __n) noexcept +{ + _LIBCUDACXX_ASSERT(__s2 < __s1 || __s2 >= __s1 + __n, "char_traits::copy overlapped range"); + char_type* __r = __s1; + for (; __n; --__n, ++__s1, ++__s2) + { + assign(*__s1, *__s2); + } + return __r; +} + +inline _LIBCUDACXX_INLINE_VISIBILITY char32_t* +char_traits::assign(char_type* __s, size_t __n, char_type __a) noexcept +{ + char_type* __r = __s; + for (; __n; --__n, ++__s) + { + assign(*__s, __a); + } + return __r; +} + +#endif // _LIBCUDACXX_HAS_NO_UNICODE_CHARS + +// helper fns for basic_string and string_view + +// __str_find +template +inline _SizeT _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY +__str_find(const _CharT* __p, _SizeT __sz, _CharT __c, _SizeT __pos) noexcept +{ + if (__pos >= __sz) + { + return __npos; + } + const _CharT* __r = _Traits::find(__p + __pos, __sz - __pos, __c); + if (__r == 0) + { + return __npos; + } + return static_cast<_SizeT>(__r - __p); +} + +template +inline _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY const _CharT* +__search_substring(const _CharT* __first1, const _CharT* __last1, const _CharT* __first2, const _CharT* __last2) +{ + // Take advantage of knowing source and pattern lengths. + // Stop short when source is smaller than pattern. + const ptrdiff_t __len2 = __last2 - __first2; + if (__len2 == 0) + { + return __first1; + } + + ptrdiff_t __len1 = __last1 - __first1; + if (__len1 < __len2) + { + return __last1; + } + + // First element of __first2 is loop invariant. 
+ _CharT __f2 = *__first2; + while (true) + { + __len1 = __last1 - __first1; + // Check whether __first1 still has at least __len2 bytes. + if (__len1 < __len2) + { + return __last1; + } + + // Find __f2 the first byte matching in __first1. + __first1 = _Traits::find(__first1, __len1 - __len2 + 1, __f2); + if (__first1 == 0) + { + return __last1; + } + + // It is faster to compare from the first byte of __first1 even if we + // already know that it matches the first byte of __first2: this is because + // __first2 is most likely aligned, as it is user's "pattern" string, and + // __first1 + 1 is most likely not aligned, as the match is in the middle of + // the string. + if (_Traits::compare(__first1, __first2, __len2) == 0) + { + return __first1; + } + + ++__first1; + } +} + +template +inline _SizeT _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY +__str_find(const _CharT* __p, _SizeT __sz, const _CharT* __s, _SizeT __pos, _SizeT __n) noexcept +{ + if (__pos > __sz) + { + return __npos; + } + + if (__n == 0) // There is nothing to search, just return __pos. + { + return __pos; + } + + const _CharT* __r = __search_substring<_CharT, _Traits>(__p + __pos, __p + __sz, __s, __s + __n); + + if (__r == __p + __sz) + { + return __npos; + } + return static_cast<_SizeT>(__r - __p); +} + +// __str_rfind + +template +inline _SizeT _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY +__str_rfind(const _CharT* __p, _SizeT __sz, _CharT __c, _SizeT __pos) noexcept +{ + if (__sz < 1) + { + return __npos; + } + if (__pos < __sz) + { + ++__pos; + } + else + { + __pos = __sz; + } + for (const _CharT* __ps = __p + __pos; __ps != __p;) + { + if (_Traits::eq(*--__ps, __c)) + { + return static_cast<_SizeT>(__ps - __p); + } + } + return __npos; +} + +template +inline _SizeT _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY +__str_rfind(const _CharT* __p, _SizeT __sz, const _CharT* __s, _SizeT __pos, _SizeT __n) noexcept +{ + __pos = _CUDA_VSTD::min(__pos, __sz); + if (__n < __sz - __pos) + { + __pos += __n; + } + else + { + __pos = __sz; + } + const _CharT* __r = _CUDA_VSTD::__find_end( + __p, __p + __pos, __s, __s + __n, _Traits::eq, random_access_iterator_tag(), random_access_iterator_tag()); + if (__n > 0 && __r == __p + __pos) + { + return __npos; + } + return static_cast<_SizeT>(__r - __p); +} + +// __str_find_first_of +template +inline _SizeT _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY +__str_find_first_of(const _CharT* __p, _SizeT __sz, const _CharT* __s, _SizeT __pos, _SizeT __n) noexcept +{ + if (__pos >= __sz || __n == 0) + { + return __npos; + } + const _CharT* __r = _CUDA_VSTD::__find_first_of_ce(__p + __pos, __p + __sz, __s, __s + __n, _Traits::eq); + if (__r == __p + __sz) + { + return __npos; + } + return static_cast<_SizeT>(__r - __p); +} + +// __str_find_last_of +template +inline _SizeT _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY +__str_find_last_of(const _CharT* __p, _SizeT __sz, const _CharT* __s, _SizeT __pos, _SizeT __n) noexcept +{ + if (__n != 0) + { + if (__pos < __sz) + { + ++__pos; + } + else + { + __pos = __sz; + } + for (const _CharT* __ps = __p + __pos; __ps != __p;) + { + const _CharT* __r = _Traits::find(__s, __n, *--__ps); + if (__r) + { + return static_cast<_SizeT>(__ps - __p); + } + } + } + return __npos; +} + +// __str_find_first_not_of +template +inline _SizeT _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY +__str_find_first_not_of(const _CharT* __p, _SizeT __sz, const _CharT* __s, _SizeT __pos, _SizeT __n) noexcept +{ + if (__pos < __sz) + { + const 
_CharT* __pe = __p + __sz; + for (const _CharT* __ps = __p + __pos; __ps != __pe; ++__ps) + { + if (_Traits::find(__s, __n, *__ps) == 0) + { + return static_cast<_SizeT>(__ps - __p); + } + } + } + return __npos; +} + +template +inline _SizeT _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY +__str_find_first_not_of(const _CharT* __p, _SizeT __sz, _CharT __c, _SizeT __pos) noexcept +{ + if (__pos < __sz) + { + const _CharT* __pe = __p + __sz; + for (const _CharT* __ps = __p + __pos; __ps != __pe; ++__ps) + { + if (!_Traits::eq(*__ps, __c)) + { + return static_cast<_SizeT>(__ps - __p); + } + } + } + return __npos; +} + +// __str_find_last_not_of +template +inline _SizeT _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY +__str_find_last_not_of(const _CharT* __p, _SizeT __sz, const _CharT* __s, _SizeT __pos, _SizeT __n) noexcept +{ + if (__pos < __sz) + { + ++__pos; + } + else + { + __pos = __sz; + } + for (const _CharT* __ps = __p + __pos; __ps != __p;) + { + if (_Traits::find(__s, __n, *--__ps) == 0) + { + return static_cast<_SizeT>(__ps - __p); + } + } + return __npos; +} + +template +inline _SizeT _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_INLINE_VISIBILITY +__str_find_last_not_of(const _CharT* __p, _SizeT __sz, _CharT __c, _SizeT __pos) noexcept +{ + if (__pos < __sz) + { + ++__pos; + } + else + { + __pos = __sz; + } + for (const _CharT* __ps = __p + __pos; __ps != __p;) + { + if (!_Traits::eq(*--__ps, __c)) + { + return static_cast<_SizeT>(__ps - __p); + } + } + return __npos; +} + +#ifndef __cuda_std__ +template +inline _LIBCUDACXX_INLINE_VISIBILITY size_t __do_string_hash(_Ptr __p, _Ptr __e) +{ + typedef typename iterator_traits<_Ptr>::value_type value_type; + return __murmur2_or_cityhash()(__p, (__e - __p) * sizeof(value_type)); +} +#endif // !__cuda_std__ + +template > +struct __quoted_output_proxy +{ + _Iter __first; + _Iter __last; + _CharT __delim; + _CharT __escape; + + _LIBCUDACXX_INLINE_VISIBILITY __quoted_output_proxy(_Iter __f, _Iter __l, _CharT __d, _CharT __e) + : __first(__f) + , __last(__l) + , __delim(__d) + , __escape(__e) + {} + // This would be a nice place for a string_ref +}; + +_LIBCUDACXX_END_NAMESPACE_STD + +_CCCL_POP_MACROS + +#endif // _LIBCUDACXX___STRING diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/bitset b/libcudacxx/include/cuda/std/detail/libcxx/include/bitset deleted file mode 100644 index d61be09703..0000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/bitset +++ /dev/null @@ -1,1027 +0,0 @@ -// -*- C++ -*- -//===---------------------------- bitset ----------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCUDACXX_BITSET -#define _LIBCUDACXX_BITSET - -/* - bitset synopsis - -namespace std -{ - -namespace std { - -template -class bitset -{ -public: - // bit reference: - class reference - { - friend class bitset; - reference() noexcept; - public: - ~reference() noexcept; - reference& operator=(bool x) noexcept; // for b[i] = x; - reference& operator=(const reference&) noexcept; // for b[i] = b[j]; - bool operator~() const noexcept; // flips the bit - operator bool() const noexcept; // for x = b[i]; - reference& flip() noexcept; // for b[i].flip(); - }; - - // 23.3.5.1 constructors: - constexpr bitset() noexcept; - constexpr bitset(unsigned long long val) noexcept; - template - explicit bitset(const charT* str, - typename basic_string::size_type n = basic_string::npos, - charT zero = charT('0'), charT one = charT('1')); - template - explicit bitset(const basic_string& str, - typename basic_string::size_type pos = 0, - typename basic_string::size_type n = - basic_string::npos, - charT zero = charT('0'), charT one = charT('1')); - - // 23.3.5.2 bitset operations: - bitset& operator&=(const bitset& rhs) noexcept; - bitset& operator|=(const bitset& rhs) noexcept; - bitset& operator^=(const bitset& rhs) noexcept; - bitset& operator<<=(size_t pos) noexcept; - bitset& operator>>=(size_t pos) noexcept; - bitset& set() noexcept; - bitset& set(size_t pos, bool val = true); - bitset& reset() noexcept; - bitset& reset(size_t pos); - bitset operator~() const noexcept; - bitset& flip() noexcept; - bitset& flip(size_t pos); - - // element access: - constexpr bool operator[](size_t pos) const; // for b[i]; - reference operator[](size_t pos); // for b[i]; - unsigned long to_ulong() const; - unsigned long long to_ullong() const; - template - basic_string to_string(charT zero = charT('0'), charT one = charT('1')) const; - template - basic_string > to_string(charT zero = charT('0'), charT one = charT('1')) const; - template - basic_string, allocator > to_string(charT zero = charT('0'), charT one = -charT('1')) const; basic_string, allocator > to_string(char zero = '0', char one = '1') -const; size_t count() const noexcept; constexpr size_t size() const noexcept; bool operator==(const bitset& rhs) const -noexcept; bool operator!=(const bitset& rhs) const noexcept; bool test(size_t pos) const; bool all() const noexcept; - bool any() const noexcept; - bool none() const noexcept; - bitset operator<<(size_t pos) const noexcept; - bitset operator>>(size_t pos) const noexcept; -}; - -// 23.3.5.3 bitset operators: -template -bitset operator&(const bitset&, const bitset&) noexcept; - -template -bitset operator|(const bitset&, const bitset&) noexcept; - -template -bitset operator^(const bitset&, const bitset&) noexcept; - -template -basic_istream& -operator>>(basic_istream& is, bitset& x); - -template -basic_ostream& -operator<<(basic_ostream& os, const bitset& x); - -template struct hash>; - -} // std - -*/ - -#include <__bit_reference> -#include <__config> -#include <__functional_base> -#include -#include -#include -#include -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -_CCCL_POP_MACROS - -_LIBCUDACXX_BEGIN_NAMESPACE_STD - -template -class 
__bitset; - -template -struct __has_storage_type<__bitset<_N_words, _Size>> -{ - static const bool value = true; -}; - -template -class __bitset -{ -public: - typedef ptrdiff_t difference_type; - typedef size_t size_type; - typedef size_type __storage_type; - -protected: - typedef __bitset __self; - typedef __storage_type* __storage_pointer; - typedef const __storage_type* __const_storage_pointer; - static const unsigned __bits_per_word = static_cast(sizeof(__storage_type) * CHAR_BIT); - - friend class __bit_reference<__bitset>; - friend class __bit_const_reference<__bitset>; - friend class __bit_iterator<__bitset, false>; - friend class __bit_iterator<__bitset, true>; - friend struct __bit_array<__bitset>; - - __storage_type __first_[_N_words]; - - typedef __bit_reference<__bitset> reference; - typedef __bit_const_reference<__bitset> const_reference; - typedef __bit_iterator<__bitset, false> iterator; - typedef __bit_iterator<__bitset, true> const_iterator; - - _LIBCUDACXX_INLINE_VISIBILITY constexpr __bitset() noexcept; - _LIBCUDACXX_INLINE_VISIBILITY explicit constexpr __bitset(unsigned long long __v) noexcept; - - _LIBCUDACXX_INLINE_VISIBILITY reference __make_ref(size_t __pos) noexcept - { - return reference(__first_ + __pos / __bits_per_word, __storage_type(1) << __pos % __bits_per_word); - } - _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference __make_ref(size_t __pos) const noexcept - { - return const_reference(__first_ + __pos / __bits_per_word, __storage_type(1) << __pos % __bits_per_word); - } - _LIBCUDACXX_INLINE_VISIBILITY iterator __make_iter(size_t __pos) noexcept - { - return iterator(__first_ + __pos / __bits_per_word, __pos % __bits_per_word); - } - _LIBCUDACXX_INLINE_VISIBILITY const_iterator __make_iter(size_t __pos) const noexcept - { - return const_iterator(__first_ + __pos / __bits_per_word, __pos % __bits_per_word); - } - - _LIBCUDACXX_INLINE_VISIBILITY void operator&=(const __bitset& __v) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY void operator|=(const __bitset& __v) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY void operator^=(const __bitset& __v) noexcept; - - void flip() noexcept; - _LIBCUDACXX_INLINE_VISIBILITY unsigned long to_ulong() const - { - return to_ulong(integral_constant < bool, _Size()); - } - _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong() const - { - return to_ullong(integral_constant < bool, _Size()); - } - - bool all() const noexcept; - bool any() const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY size_t __hash_code() const noexcept; - -private: - unsigned long to_ulong(false_type) const; - _LIBCUDACXX_INLINE_VISIBILITY unsigned long to_ulong(true_type) const; - unsigned long long to_ullong(false_type) const; - _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong(true_type) const; - _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong(true_type, false_type) const; - unsigned long long to_ullong(true_type, true_type) const; -}; - -template -inline constexpr __bitset<_N_words, _Size>::__bitset() noexcept - : __first_{0} -{} - -template -inline constexpr __bitset<_N_words, _Size>::__bitset(unsigned long long __v) noexcept -#if __SIZEOF_SIZE_T__ == 8 - : __first_{__v} -#elif __SIZEOF_SIZE_T__ == 4 - : __first_{static_cast<__storage_type>(__v), - _Size >= 2 * __bits_per_word - ? 
static_cast<__storage_type>(__v >> __bits_per_word) - : static_cast<__storage_type>( - (__v >> __bits_per_word) & (__storage_type(1) << (_Size - __bits_per_word)) - 1)} -#else -# error This constructor has not been ported to this platform -#endif -{} - -template -inline void __bitset<_N_words, _Size>::operator&=(const __bitset& __v) noexcept -{ - for (size_type __i = 0; __i < _N_words; ++__i) - { - __first_[__i] &= __v.__first_[__i]; - } -} - -template -inline void __bitset<_N_words, _Size>::operator|=(const __bitset& __v) noexcept -{ - for (size_type __i = 0; __i < _N_words; ++__i) - { - __first_[__i] |= __v.__first_[__i]; - } -} - -template -inline void __bitset<_N_words, _Size>::operator^=(const __bitset& __v) noexcept -{ - for (size_type __i = 0; __i < _N_words; ++__i) - { - __first_[__i] ^= __v.__first_[__i]; - } -} - -template -void __bitset<_N_words, _Size>::flip() noexcept -{ - // do middle whole words - size_type __n = _Size; - __storage_pointer __p = __first_; - for (; __n >= __bits_per_word; ++__p, __n -= __bits_per_word) - { - *__p = ~*__p; - } - // do last partial word - if (__n > 0) - { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __storage_type __b = *__p & __m; - *__p &= ~__m; - *__p |= ~__b & __m; - } -} - -template -unsigned long __bitset<_N_words, _Size>::to_ulong(false_type) const -{ - const_iterator __e = __make_iter(_Size); - const_iterator __i = _CUDA_VSTD::find(__make_iter(sizeof(unsigned long) * CHAR_BIT), __e, true); - if (__i != __e) - { - __throw_overflow_error("bitset to_ulong overflow error"); - } - - return __first_[0]; -} - -template -inline unsigned long __bitset<_N_words, _Size>::to_ulong(true_type) const -{ - return __first_[0]; -} - -template -unsigned long long __bitset<_N_words, _Size>::to_ullong(false_type) const -{ - const_iterator __e = __make_iter(_Size); - const_iterator __i = _CUDA_VSTD::find(__make_iter(sizeof(unsigned long long) * CHAR_BIT), __e, true); - if (__i != __e) - { - __throw_overflow_error("bitset to_ullong overflow error"); - } - - return to_ullong(true_type()); -} - -template -inline unsigned long long __bitset<_N_words, _Size>::to_ullong(true_type) const -{ - return to_ullong(true_type(), integral_constant()); -} - -template -inline unsigned long long __bitset<_N_words, _Size>::to_ullong(true_type, false_type) const -{ - return __first_[0]; -} - -template -unsigned long long __bitset<_N_words, _Size>::to_ullong(true_type, true_type) const -{ - unsigned long long __r = __first_[0]; - for (std::size_t __i = 1; __i < sizeof(unsigned long long) / sizeof(__storage_type); ++__i) - { - __r |= static_cast(__first_[__i]) << (sizeof(__storage_type) * CHAR_BIT); - } - return __r; -} - -template -bool __bitset<_N_words, _Size>::all() const noexcept -{ - // do middle whole words - size_type __n = _Size; - __const_storage_pointer __p = __first_; - for (; __n >= __bits_per_word; ++__p, __n -= __bits_per_word) - { - if (~*__p) - { - return false; - } - } - // do last partial word - if (__n > 0) - { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - if (~*__p & __m) - { - return false; - } - } - return true; -} - -template -bool __bitset<_N_words, _Size>::any() const noexcept -{ - // do middle whole words - size_type __n = _Size; - __const_storage_pointer __p = __first_; - for (; __n >= __bits_per_word; ++__p, __n -= __bits_per_word) - { - if (*__p) - { - return true; - } - } - // do last partial word - if (__n > 0) - { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - if (*__p & 
__m) - { - return true; - } - } - return false; -} - -template -inline size_t __bitset<_N_words, _Size>::__hash_code() const noexcept -{ - size_t __h = 0; - for (size_type __i = 0; __i < _N_words; ++__i) - { - __h ^= __first_[__i]; - } - return __h; -} - -template -class __bitset<1, _Size> -{ -public: - typedef ptrdiff_t difference_type; - typedef size_t size_type; - typedef size_type __storage_type; - -protected: - typedef __bitset __self; - typedef __storage_type* __storage_pointer; - typedef const __storage_type* __const_storage_pointer; - static const unsigned __bits_per_word = static_cast(sizeof(__storage_type) * CHAR_BIT); - - friend class __bit_reference<__bitset>; - friend class __bit_const_reference<__bitset>; - friend class __bit_iterator<__bitset, false>; - friend class __bit_iterator<__bitset, true>; - friend struct __bit_array<__bitset>; - - __storage_type __first_; - - typedef __bit_reference<__bitset> reference; - typedef __bit_const_reference<__bitset> const_reference; - typedef __bit_iterator<__bitset, false> iterator; - typedef __bit_iterator<__bitset, true> const_iterator; - - _LIBCUDACXX_INLINE_VISIBILITY constexpr __bitset() noexcept; - _LIBCUDACXX_INLINE_VISIBILITY explicit constexpr __bitset(unsigned long long __v) noexcept; - - _LIBCUDACXX_INLINE_VISIBILITY reference __make_ref(size_t __pos) noexcept - { - return reference(&__first_, __storage_type(1) << __pos); - } - _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference __make_ref(size_t __pos) const noexcept - { - return const_reference(&__first_, __storage_type(1) << __pos); - } - _LIBCUDACXX_INLINE_VISIBILITY iterator __make_iter(size_t __pos) noexcept - { - return iterator(&__first_ + __pos / __bits_per_word, __pos % __bits_per_word); - } - _LIBCUDACXX_INLINE_VISIBILITY const_iterator __make_iter(size_t __pos) const noexcept - { - return const_iterator(&__first_ + __pos / __bits_per_word, __pos % __bits_per_word); - } - - _LIBCUDACXX_INLINE_VISIBILITY void operator&=(const __bitset& __v) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY void operator|=(const __bitset& __v) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY void operator^=(const __bitset& __v) noexcept; - - _LIBCUDACXX_INLINE_VISIBILITY void flip() noexcept; - - _LIBCUDACXX_INLINE_VISIBILITY unsigned long to_ulong() const; - _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong() const; - - _LIBCUDACXX_INLINE_VISIBILITY bool all() const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY bool any() const noexcept; - - _LIBCUDACXX_INLINE_VISIBILITY size_t __hash_code() const noexcept; -}; - -template -inline constexpr __bitset<1, _Size>::__bitset() noexcept - : __first_(0) -{} - -template -inline constexpr __bitset<1, _Size>::__bitset(unsigned long long __v) noexcept - : __first_(_Size == __bits_per_word ? 
static_cast<__storage_type>(__v) - : static_cast<__storage_type>(__v) & ((__storage_type(1) << _Size) - 1)) -{} - -template -inline void __bitset<1, _Size>::operator&=(const __bitset& __v) noexcept -{ - __first_ &= __v.__first_; -} - -template -inline void __bitset<1, _Size>::operator|=(const __bitset& __v) noexcept -{ - __first_ |= __v.__first_; -} - -template -inline void __bitset<1, _Size>::operator^=(const __bitset& __v) noexcept -{ - __first_ ^= __v.__first_; -} - -template -inline void __bitset<1, _Size>::flip() noexcept -{ - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); - __first_ = ~__first_; - __first_ &= __m; -} - -template -inline unsigned long __bitset<1, _Size>::to_ulong() const -{ - return __first_; -} - -template -inline unsigned long long __bitset<1, _Size>::to_ullong() const -{ - return __first_; -} - -template -inline bool __bitset<1, _Size>::all() const noexcept -{ - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); - return !(~__first_ & __m); -} - -template -inline bool __bitset<1, _Size>::any() const noexcept -{ - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); - return __first_ & __m; -} - -template -inline size_t __bitset<1, _Size>::__hash_code() const noexcept -{ - return __first_; -} - -template <> -class __bitset<0, 0> -{ -public: - typedef ptrdiff_t difference_type; - typedef size_t size_type; - typedef size_type __storage_type; - -protected: - typedef __bitset __self; - typedef __storage_type* __storage_pointer; - typedef const __storage_type* __const_storage_pointer; - static const unsigned __bits_per_word = static_cast(sizeof(__storage_type) * CHAR_BIT); - - friend class __bit_reference<__bitset>; - friend class __bit_const_reference<__bitset>; - friend class __bit_iterator<__bitset, false>; - friend class __bit_iterator<__bitset, true>; - friend struct __bit_array<__bitset>; - - typedef __bit_reference<__bitset> reference; - typedef __bit_const_reference<__bitset> const_reference; - typedef __bit_iterator<__bitset, false> iterator; - typedef __bit_iterator<__bitset, true> const_iterator; - - _LIBCUDACXX_INLINE_VISIBILITY constexpr __bitset() noexcept; - _LIBCUDACXX_INLINE_VISIBILITY explicit constexpr __bitset(unsigned long long) noexcept; - - _LIBCUDACXX_INLINE_VISIBILITY reference __make_ref(size_t) noexcept - { - return reference(0, 1); - } - _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference __make_ref(size_t) const noexcept - { - return const_reference(0, 1); - } - _LIBCUDACXX_INLINE_VISIBILITY iterator __make_iter(size_t) noexcept - { - return iterator(0, 0); - } - _LIBCUDACXX_INLINE_VISIBILITY const_iterator __make_iter(size_t) const noexcept - { - return const_iterator(0, 0); - } - - _LIBCUDACXX_INLINE_VISIBILITY void operator&=(const __bitset&) noexcept {} - _LIBCUDACXX_INLINE_VISIBILITY void operator|=(const __bitset&) noexcept {} - _LIBCUDACXX_INLINE_VISIBILITY void operator^=(const __bitset&) noexcept {} - - _LIBCUDACXX_INLINE_VISIBILITY void flip() noexcept {} - - _LIBCUDACXX_INLINE_VISIBILITY unsigned long to_ulong() const - { - return 0; - } - _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong() const - { - return 0; - } - - _LIBCUDACXX_INLINE_VISIBILITY bool all() const noexcept - { - return true; - } - _LIBCUDACXX_INLINE_VISIBILITY bool any() const noexcept - { - return false; - } - - _LIBCUDACXX_INLINE_VISIBILITY size_t __hash_code() const noexcept - { - return 0; - } -}; - -inline constexpr __bitset<0, 0>::__bitset() noexcept {} - -inline constexpr __bitset<0, 
0>::__bitset(unsigned long long) noexcept {} - -template -class _LIBCUDACXX_TEMPLATE_VIS bitset; -template -struct hash>; - -template -class _LIBCUDACXX_TEMPLATE_VIS bitset - : private __bitset<_Size == 0 ? 0 : (_Size - 1) / (sizeof(size_t) * CHAR_BIT) + 1, _Size> -{ -public: - static const unsigned __n_words = _Size == 0 ? 0 : (_Size - 1) / (sizeof(size_t) * CHAR_BIT) + 1; - typedef __bitset<__n_words, _Size> base; - -public: - typedef typename base::reference reference; - typedef typename base::const_reference const_reference; - - // 23.3.5.1 constructors: - _LIBCUDACXX_INLINE_VISIBILITY constexpr bitset() noexcept {} - _LIBCUDACXX_INLINE_VISIBILITY constexpr bitset(unsigned long long __v) noexcept - : base(__v) - {} - template ::value>> - explicit bitset(const _CharT* __str, - typename basic_string<_CharT>::size_type __n = basic_string<_CharT>::npos, - _CharT __zero = _CharT('0'), - _CharT __one = _CharT('1')); - template - explicit bitset(const basic_string<_CharT, _Traits, _Allocator>& __str, - typename basic_string<_CharT, _Traits, _Allocator>::size_type __pos = 0, - typename basic_string<_CharT, _Traits, _Allocator>::size_type __n = - (basic_string<_CharT, _Traits, _Allocator>::npos), - _CharT __zero = _CharT('0'), - _CharT __one = _CharT('1')); - - // 23.3.5.2 bitset operations: - _LIBCUDACXX_INLINE_VISIBILITY bitset& operator&=(const bitset& __rhs) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY bitset& operator|=(const bitset& __rhs) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY bitset& operator^=(const bitset& __rhs) noexcept; - bitset& operator<<=(size_t __pos) noexcept; - bitset& operator>>=(size_t __pos) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY bitset& set() noexcept; - bitset& set(size_t __pos, bool __val = true); - _LIBCUDACXX_INLINE_VISIBILITY bitset& reset() noexcept; - bitset& reset(size_t __pos); - _LIBCUDACXX_INLINE_VISIBILITY bitset operator~() const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY bitset& flip() noexcept; - bitset& flip(size_t __pos); - - // element access: - _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference operator[](size_t __p) const - { - return base::__make_ref(__p); - } - _LIBCUDACXX_INLINE_VISIBILITY reference operator[](size_t __p) - { - return base::__make_ref(__p); - } - _LIBCUDACXX_INLINE_VISIBILITY unsigned long to_ulong() const; - _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong() const; - template - basic_string<_CharT, _Traits, _Allocator> to_string(_CharT __zero = _CharT('0'), _CharT __one = _CharT('1')) const; - template - _LIBCUDACXX_INLINE_VISIBILITY basic_string<_CharT, _Traits, allocator<_CharT>> - to_string(_CharT __zero = _CharT('0'), _CharT __one = _CharT('1')) const; - template - _LIBCUDACXX_INLINE_VISIBILITY basic_string<_CharT, char_traits<_CharT>, allocator<_CharT>> - to_string(_CharT __zero = _CharT('0'), _CharT __one = _CharT('1')) const; - _LIBCUDACXX_INLINE_VISIBILITY basic_string, allocator> - to_string(char __zero = '0', char __one = '1') const; - _LIBCUDACXX_INLINE_VISIBILITY size_t count() const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY constexpr size_t size() const noexcept - { - return _Size; - } - _LIBCUDACXX_INLINE_VISIBILITY bool operator==(const bitset& __rhs) const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY bool operator!=(const bitset& __rhs) const noexcept; - bool test(size_t __pos) const; - _LIBCUDACXX_INLINE_VISIBILITY bool all() const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY bool any() const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY bool none() const noexcept - { - return !any(); - } - 
_LIBCUDACXX_INLINE_VISIBILITY bitset operator<<(size_t __pos) const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY bitset operator>>(size_t __pos) const noexcept; - -private: - _LIBCUDACXX_INLINE_VISIBILITY size_t __hash_code() const noexcept - { - return base::__hash_code(); - } - - friend struct hash; -}; - -template -template -bitset<_Size>::bitset(const _CharT* __str, typename basic_string<_CharT>::size_type __n, _CharT __zero, _CharT __one) -{ - size_t __rlen = _CUDA_VSTD::min(__n, char_traits<_CharT>::length(__str)); - for (size_t __i = 0; __i < __rlen; ++__i) - { - if (__str[__i] != __zero && __str[__i] != __one) - { - __throw_invalid_argument("bitset string ctor has invalid argument"); - } - } - - size_t _Mp = _CUDA_VSTD::min(__rlen, _Size); - size_t __i = 0; - for (; __i < _Mp; ++__i) - { - _CharT __c = __str[_Mp - 1 - __i]; - if (__c == __zero) - { - (*this)[__i] = false; - } - else - { - (*this)[__i] = true; - } - } - _CUDA_VSTD::fill(base::__make_iter(__i), base::__make_iter(_Size), false); -} - -template -template -bitset<_Size>::bitset( - const basic_string<_CharT, _Traits, _Allocator>& __str, - typename basic_string<_CharT, _Traits, _Allocator>::size_type __pos, - typename basic_string<_CharT, _Traits, _Allocator>::size_type __n, - _CharT __zero, - _CharT __one) -{ - if (__pos > __str.size()) - { - _CUDA_VSTD::__throw_out_of_range("bitset string pos out of range"); - } - - size_t __rlen = _CUDA_VSTD::min(__n, __str.size() - __pos); - for (size_t __i = __pos; __i < __pos + __rlen; ++__i) - { - if (!_Traits::eq(__str[__i], __zero) && !_Traits::eq(__str[__i], __one)) - { - __throw_invalid_argument("bitset string ctor has invalid argument"); - } - } - - size_t _Mp = _CUDA_VSTD::min(__rlen, _Size); - size_t __i = 0; - for (; __i < _Mp; ++__i) - { - _CharT __c = __str[__pos + _Mp - 1 - __i]; - if (_Traits::eq(__c, __zero)) - { - (*this)[__i] = false; - } - else - { - (*this)[__i] = true; - } - } - _CUDA_VSTD::fill(base::__make_iter(__i), base::__make_iter(_Size), false); -} - -template -inline bitset<_Size>& bitset<_Size>::operator&=(const bitset& __rhs) noexcept -{ - base::operator&=(__rhs); - return *this; -} - -template -inline bitset<_Size>& bitset<_Size>::operator|=(const bitset& __rhs) noexcept -{ - base::operator|=(__rhs); - return *this; -} - -template -inline bitset<_Size>& bitset<_Size>::operator^=(const bitset& __rhs) noexcept -{ - base::operator^=(__rhs); - return *this; -} - -template -bitset<_Size>& bitset<_Size>::operator<<=(size_t __pos) noexcept -{ - __pos = _CUDA_VSTD::min(__pos, _Size); - _CUDA_VSTD::copy_backward(base::__make_iter(0), base::__make_iter(_Size - __pos), base::__make_iter(_Size)); - _CUDA_VSTD::fill_n(base::__make_iter(0), __pos, false); - return *this; -} - -template -bitset<_Size>& bitset<_Size>::operator>>=(size_t __pos) noexcept -{ - __pos = _CUDA_VSTD::min(__pos, _Size); - _CUDA_VSTD::copy(base::__make_iter(__pos), base::__make_iter(_Size), base::__make_iter(0)); - _CUDA_VSTD::fill_n(base::__make_iter(_Size - __pos), __pos, false); - return *this; -} - -template -inline bitset<_Size>& bitset<_Size>::set() noexcept -{ - _CUDA_VSTD::fill_n(base::__make_iter(0), _Size, true); - return *this; -} - -template -bitset<_Size>& bitset<_Size>::set(size_t __pos, bool __val) -{ - if (__pos >= _Size) - { - _CUDA_VSTD::__throw_out_of_range("bitset set argument out of range"); - } - - (*this)[__pos] = __val; - return *this; -} - -template -inline bitset<_Size>& bitset<_Size>::reset() noexcept -{ - _CUDA_VSTD::fill_n(base::__make_iter(0), _Size, false); - return 
*this; -} - -template -bitset<_Size>& bitset<_Size>::reset(size_t __pos) -{ - if (__pos >= _Size) - { - _CUDA_VSTD::__throw_out_of_range("bitset reset argument out of range"); - } - - (*this)[__pos] = false; - return *this; -} - -template -inline bitset<_Size> bitset<_Size>::operator~() const noexcept -{ - bitset __x(*this); - __x.flip(); - return __x; -} - -template -inline bitset<_Size>& bitset<_Size>::flip() noexcept -{ - base::flip(); - return *this; -} - -template -bitset<_Size>& bitset<_Size>::flip(size_t __pos) -{ - if (__pos >= _Size) - { - _CUDA_VSTD::__throw_out_of_range("bitset flip argument out of range"); - } - - reference r = base::__make_ref(__pos); - r = ~r; - return *this; -} - -template -inline unsigned long bitset<_Size>::to_ulong() const -{ - return base::to_ulong(); -} - -template -inline unsigned long long bitset<_Size>::to_ullong() const -{ - return base::to_ullong(); -} - -template -template -basic_string<_CharT, _Traits, _Allocator> bitset<_Size>::to_string(_CharT __zero, _CharT __one) const -{ - basic_string<_CharT, _Traits, _Allocator> __r(_Size, __zero); - for (size_t __i = 0; __i < _Size; ++__i) - { - if ((*this)[__i]) - { - __r[_Size - 1 - __i] = __one; - } - } - return __r; -} - -template -template -inline basic_string<_CharT, _Traits, allocator<_CharT>> bitset<_Size>::to_string(_CharT __zero, _CharT __one) const -{ - return to_string<_CharT, _Traits, allocator<_CharT>>(__zero, __one); -} - -template -template -inline basic_string<_CharT, char_traits<_CharT>, allocator<_CharT>> -bitset<_Size>::to_string(_CharT __zero, _CharT __one) const -{ - return to_string<_CharT, char_traits<_CharT>, allocator<_CharT>>(__zero, __one); -} - -template -inline basic_string, allocator> bitset<_Size>::to_string(char __zero, char __one) const -{ - return to_string, allocator>(__zero, __one); -} - -template -inline size_t bitset<_Size>::count() const noexcept -{ - return static_cast(__count_bool_true(base::__make_iter(0), _Size)); -} - -template -inline bool bitset<_Size>::operator==(const bitset& __rhs) const noexcept -{ - return _CUDA_VSTD::equal(base::__make_iter(0), base::__make_iter(_Size), __rhs.__make_iter(0)); -} - -template -inline bool bitset<_Size>::operator!=(const bitset& __rhs) const noexcept -{ - return !(*this == __rhs); -} - -template -bool bitset<_Size>::test(size_t __pos) const -{ - if (__pos >= _Size) - { - _CUDA_VSTD::__throw_out_of_range("bitset test argument out of range"); - } - - return (*this)[__pos]; -} - -template -inline bool bitset<_Size>::all() const noexcept -{ - return base::all(); -} - -template -inline bool bitset<_Size>::any() const noexcept -{ - return base::any(); -} - -template -inline bitset<_Size> bitset<_Size>::operator<<(size_t __pos) const noexcept -{ - bitset __r = *this; - __r <<= __pos; - return __r; -} - -template -inline bitset<_Size> bitset<_Size>::operator>>(size_t __pos) const noexcept -{ - bitset __r = *this; - __r >>= __pos; - return __r; -} - -template -inline _LIBCUDACXX_INLINE_VISIBILITY bitset<_Size> operator&(const bitset<_Size>& __x, const bitset<_Size>& __y) noexcept -{ - bitset<_Size> __r = __x; - __r &= __y; - return __r; -} - -template -inline _LIBCUDACXX_INLINE_VISIBILITY bitset<_Size> operator|(const bitset<_Size>& __x, const bitset<_Size>& __y) noexcept -{ - bitset<_Size> __r = __x; - __r |= __y; - return __r; -} - -template -inline _LIBCUDACXX_INLINE_VISIBILITY bitset<_Size> operator^(const bitset<_Size>& __x, const bitset<_Size>& __y) noexcept -{ - bitset<_Size> __r = __x; - __r ^= __y; - return __r; -} - 
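An editorial aside on the three operators just above (not part of the patch): the non-member operator&, operator| and operator^ simply copy the left-hand bitset and forward to the member compound assignments, so the word-wise loops live in one place only. A minimal sketch of the same idiom on a hypothetical single-word mask type, assuming C++14:

#include <cassert>
#include <cstdint>

// Hypothetical 64-bit mask type, used only to illustrate the
// copy-then-compound-assign idiom from the bitset operators above.
struct Mask64
{
  std::uint64_t word;

  Mask64& operator&=(const Mask64& rhs) noexcept { word &= rhs.word; return *this; }
  Mask64& operator|=(const Mask64& rhs) noexcept { word |= rhs.word; return *this; }
  Mask64& operator^=(const Mask64& rhs) noexcept { word ^= rhs.word; return *this; }
};

// Non-member operators: copy the left operand, then reuse the member operator.
inline Mask64 operator&(const Mask64& x, const Mask64& y) noexcept { Mask64 r = x; r &= y; return r; }
inline Mask64 operator|(const Mask64& x, const Mask64& y) noexcept { Mask64 r = x; r |= y; return r; }
inline Mask64 operator^(const Mask64& x, const Mask64& y) noexcept { Mask64 r = x; r ^= y; return r; }

int main()
{
  Mask64 a{0b1100};
  Mask64 b{0b1010};
  assert((a & b).word == 0b1000);
  assert((a | b).word == 0b1110);
  assert((a ^ b).word == 0b0110);
  return 0;
}
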
-template -struct _LIBCUDACXX_TEMPLATE_VIS hash> : public __unary_function, size_t> -{ - _LIBCUDACXX_INLINE_VISIBILITY size_t operator()(const bitset<_Size>& __bs) const noexcept - { - return __bs.__hash_code(); - } -}; - -template -basic_istream<_CharT, _Traits>& operator>>(basic_istream<_CharT, _Traits>& __is, bitset<_Size>& __x); - -template -basic_ostream<_CharT, _Traits>& operator<<(basic_ostream<_CharT, _Traits>& __os, const bitset<_Size>& __x); - -_LIBCUDACXX_END_NAMESPACE_STD - -_CCCL_POP_MACROS - -#endif // _LIBCUDACXX_BITSET diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/cstddef b/libcudacxx/include/cuda/std/detail/libcxx/include/cstddef index 73d0f12b90..749931900f 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/cstddef +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/cstddef @@ -57,7 +57,7 @@ using ::ptrdiff_t; using ::size_t; #if defined(__CLANG_MAX_ALIGN_T_DEFINED) || defined(_GCC_MAX_ALIGN_T) || defined(__DEFINED_max_align_t) \ - || defined(__NetBSD__) + || defined(__NetBS) // Re-use the compiler's max_align_t where possible. using ::max_align_t; #else diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/char_ptr_ctor.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/char_ptr_ctor.pass.cpp new file mode 100644 index 0000000000..2e5364ab2c --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/char_ptr_ctor.pass.cpp @@ -0,0 +1,159 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// template +// explicit bitset(const charT* str, +// typename basic_string_view::size_type n = basic_string_view::npos, // +// s/string/string_view since C++26 charT zero = charT('0'), charT one = charT('1')); // constexpr +// since C++23 + +#include +#include +// #include // for 'min' and 'max' +// #include // for 'invalid_argument' + +#include "test_macros.h" + +// TEST_MSVC_DIAGNOSTIC_IGNORED(6294) // Ill-defined for-loop: initial condition does not satisfy test. Loop body not +// executed. 
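An editorial usage sketch of the constructor this new test exercises (not part of the patch; it assumes the <cuda/std/bitset> header introduced by this series): the string is read most-significant-bit first, so the rightmost character becomes bit 0, and any bits beyond the string length are left zero.

#include <cuda/std/bitset>

#include <cassert>

int main()
{
  // "1010": the last character maps to bit 0.
  cuda::std::bitset<8> v("1010");
  assert(v[0] == false && v[1] == true && v[2] == false && v[3] == true);
  assert(v[4] == false && v[7] == false); // bits past the string stay zero

  // Custom zero/one characters, mirroring the 'a'/'b' cases tested below.
  cuda::std::bitset<4> w("baba", 4, 'a', 'b');
  assert(w.to_ulong() == 10); // 0b1010
  return 0;
}
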
+_CCCL_NV_DIAG_SUPPRESS(186) + +#ifndef TEST_HAS_NO_EXCEPTIONS +template +void test_char_pointer_ctor_throw() +{ + try + { + cuda::std::bitset v("xxx1010101010xxxx"); + assert(false); + } + catch (std::invalid_argument&) + {} +} + +void test_exceptions() +{ + test_char_pointer_ctor_throw<0>(); + test_char_pointer_ctor_throw<1>(); + test_char_pointer_ctor_throw<31>(); + test_char_pointer_ctor_throw<32>(); + test_char_pointer_ctor_throw<33>(); + test_char_pointer_ctor_throw<63>(); + test_char_pointer_ctor_throw<64>(); + test_char_pointer_ctor_throw<65>(); + test_char_pointer_ctor_throw<1000>(); +} +#endif + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_char_pointer_ctor() +{ + static_assert(!cuda::std::is_convertible>::value, ""); + static_assert(cuda::std::is_constructible, const char*>::value, ""); + { + const char s[] = "1010101010"; + cuda::std::bitset v(s); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[M - 1 - i] == '1')); + } + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_ICC(186) + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + _CCCL_DIAG_POP + { + assert(v[i] == false); + } + } + } + { + const char s[] = "1010101010"; + cuda::std::bitset v(s, 10); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[M - 1 - i] == '1')); + } + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_ICC(186) + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + _CCCL_DIAG_POP + { + assert(v[i] == false); + } + } + } + { + const char s[] = "1a1a1a1a1a"; + cuda::std::bitset v(s, 10, 'a'); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[M - 1 - i] == '1')); + } + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_ICC(186) + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + _CCCL_DIAG_POP + { + assert(v[i] == false); + } + } + } + { + const char s[] = "bababababa"; + cuda::std::bitset v(s, 10, 'a', 'b'); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[M - 1 - i] == 'b')); + } + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_ICC(186) + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + _CCCL_DIAG_POP + { + assert(v[i] == false); + } + } + } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_char_pointer_ctor<0>(); + test_char_pointer_ctor<1>(); + test_char_pointer_ctor<31>(); + test_char_pointer_ctor<32>(); + test_char_pointer_ctor<33>(); + test_char_pointer_ctor<63>(); + test_char_pointer_ctor<64>(); + test_char_pointer_ctor<65>(); + test_char_pointer_ctor<1000>(); + + return true; +} + +int main(int, char**) +{ +#ifndef TEST_HAS_NO_EXCEPTIONS + NV_IF_TARGET(NV_IS_HOST, (test_exceptions();)) +#endif + + test(); +#if TEST_STD_VER >= 2014 + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/default.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/default.pass.cpp new file mode 100644 index 0000000000..8988d271c0 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/default.pass.cpp @@ -0,0 +1,64 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// default ctor + +#include +#include + +#include "test_macros.h" + +TEST_NV_DIAG_SUPPRESS(186) // pointless comparison of unsigned integer with zero +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_default_ctor() +{ + { + TEST_CONSTEXPR cuda::std::bitset v1; + assert(v1.size() == N); + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_ICC(186) + for (cuda::std::size_t i = 0; i < v1.size(); ++i) + { + _CCCL_DIAG_POP + { + assert(v1[i] == false); + } + } + } +#if TEST_STD_VER >= 11 + { + constexpr cuda::std::bitset v1; + static_assert(v1.size() == N, ""); + } +#endif +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_default_ctor<0>(); + test_default_ctor<1>(); + test_default_ctor<31>(); + test_default_ctor<32>(); + test_default_ctor<33>(); + test_default_ctor<63>(); + test_default_ctor<64>(); + test_default_ctor<65>(); + test_default_ctor<1000>(); + + return true; +} + +int main(int, char**) +{ + test(); +#if TEST_STD_VER >= 2014 + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/string_ctor.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/string_ctor.pass.cpp new file mode 100644 index 0000000000..4501345c57 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/string_ctor.pass.cpp @@ -0,0 +1,196 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset(string, pos, n, zero, one); // constexpr since C++23 + +#include + +#ifndef _LIBCUDACXX_HAS_STRING +int main(int, char**) +{ + return 0; +} +#else + +# include // for 'min' and 'max' +# include +# include +# include // for 'invalid_argument' +# include +# include + +# include "test_macros.h" + +template +TEST_CONSTEXPR_CXX14 void test_string_ctor() +{ +# ifndef TEST_HAS_NO_EXCEPTIONS + if (!TEST_IS_CONSTANT_EVALUATED) + { + try + { + cuda::std::string s("xxx1010101010xxxx"); + cuda::std::bitset v(s, s.size() + 1); + assert(false); + } + catch (cuda::std::out_of_range&) + {} + try + { + cuda::std::string s("xxx1010101010xxxx"); + cuda::std::bitset v(s, s.size() + 1, 10); + assert(false); + } + catch (cuda::std::out_of_range&) + {} + try + { + cuda::std::string s("xxx1010101010xxxx"); + cuda::std::bitset v(s); + assert(false); + } + catch (cuda::std::invalid_argument&) + {} + try + { + cuda::std::string s("xxx1010101010xxxx"); + cuda::std::bitset v(s, 2); + assert(false); + } + catch (cuda::std::invalid_argument&) + {} + try + { + cuda::std::string s("xxx1010101010xxxx"); + cuda::std::bitset v(s, 2, 10); + assert(false); + } + catch (cuda::std::invalid_argument&) + {} + try + { + cuda::std::string s("xxxbababababaxxxx"); + cuda::std::bitset v(s, 2, 10, 'a', 'b'); + assert(false); + } + catch (cuda::std::invalid_argument&) + {} + } +# endif // TEST_HAS_NO_EXCEPTIONS + + static_assert(!cuda::std::is_convertible>::value, ""); + static_assert(cuda::std::is_constructible, cuda::std::string>::value, ""); + { + cuda::std::string s("1010101010"); + cuda::std::bitset v(s); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[M - 1 - i] == '1')); + } + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + assert(v[i] == false); + } + } + { + cuda::std::string s("xxx1010101010"); + cuda::std::bitset v(s, 3); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[3 + M - 1 - i] == '1')); + } + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + assert(v[i] == false); + } + } + { + cuda::std::string s("xxx1010101010xxxx"); + cuda::std::bitset v(s, 3, 10); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[3 + M - 1 - i] == '1')); + } + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + assert(v[i] == false); + } + } + { + cuda::std::string s("xxx1a1a1a1a1axxxx"); + cuda::std::bitset v(s, 3, 10, 'a'); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[3 + M - 1 - i] == '1')); + } + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + assert(v[i] == false); + } + } + { + cuda::std::string s("xxxbababababaxxxx"); + cuda::std::bitset v(s, 3, 10, 'a', 'b'); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[3 + M - 1 - i] == 'b')); + } + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + assert(v[i] == false); + } + } +} + +struct Nonsense +{ + virtual ~Nonsense() {} +}; + +TEST_CONSTEXPR_CXX14 void test_for_non_eager_instantiation() +{ + // Ensure we don't accidentally instantiate `cuda::std::basic_string` + // since it may not be well formed and can cause an error in the + // non-immediate 
context. + static_assert(!cuda::std::is_constructible, Nonsense*>::value, ""); + static_assert( + !cuda::std::is_constructible, Nonsense*, cuda::std::size_t, Nonsense&, Nonsense&>::value, ""); +} + +TEST_CONSTEXPR_CXX14 bool test() +{ + test_string_ctor<0>(); + test_string_ctor<1>(); + test_string_ctor<31>(); + test_string_ctor<32>(); + test_string_ctor<33>(); + test_string_ctor<63>(); + test_string_ctor<64>(); + test_string_ctor<65>(); + test_string_ctor<1000>(); + test_for_non_eager_instantiation(); + + return true; +} + +int main(int, char**) +{ + test(); +# if TEST_STD_VER >= 2023 + static_assert(test(), ""); +# endif + + return 0; +} + +#endif diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/string_view_ctor.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/string_view_ctor.pass.cpp new file mode 100644 index 0000000000..060e1ead7a --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/string_view_ctor.pass.cpp @@ -0,0 +1,201 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 + +// template +// explicit bitset( +// const basic_string_view& str, +// typename basic_string_view::size_type pos = 0, +// typename basic_string_view::size_type n = basic_string_view::npos, +// charT zero = charT('0'), charT one = charT('1')); + +#include + +#ifndef _LIBCUDACXX_HAS_STRING_VIEW +int main(int, char**) +{ + return 0; +} +#else + +# include // for 'min' and 'max' +# include +# include +# include // for 'invalid_argument' +# include +# include + +# include "test_macros.h" + +template +constexpr void test_string_ctor() +{ +# ifndef TEST_HAS_NO_EXCEPTIONS + if (!TEST_IS_CONSTANT_EVALUATED) + { + try + { + cuda::std::string_view s("xxx1010101010xxxx"); + cuda::std::bitset v(s, s.size() + 1); + assert(false); + } + catch (cuda::std::out_of_range&) + {} + try + { + cuda::std::string_view s("xxx1010101010xxxx"); + cuda::std::bitset v(s, s.size() + 1, 10); + assert(false); + } + catch (cuda::std::out_of_range&) + {} + try + { + cuda::std::string_view s("xxx1010101010xxxx"); + cuda::std::bitset v(s); + assert(false); + } + catch (cuda::std::invalid_argument&) + {} + try + { + cuda::std::string_view s("xxx1010101010xxxx"); + cuda::std::bitset v(s, 2); + assert(false); + } + catch (cuda::std::invalid_argument&) + {} + try + { + cuda::std::string_view s("xxx1010101010xxxx"); + cuda::std::bitset v(s, 2, 10); + assert(false); + } + catch (cuda::std::invalid_argument&) + {} + try + { + cuda::std::string_view s("xxxbababababaxxxx"); + cuda::std::bitset v(s, 2, 10, 'a', 'b'); + assert(false); + } + catch (cuda::std::invalid_argument&) + {} + } +# endif // TEST_HAS_NO_EXCEPTIONS + + static_assert(!cuda::std::is_convertible_v>, ""); + static_assert(cuda::std::is_constructible_v, cuda::std::string_view>, ""); + { + cuda::std::string_view s("1010101010"); + cuda::std::bitset v(s); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[M - 1 - i] == '1')); + } + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + assert(v[i] == false); + } + } + { + cuda::std::string_view 
s("xxx1010101010"); + cuda::std::bitset v(s, 3); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[3 + M - 1 - i] == '1')); + } + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + assert(v[i] == false); + } + } + { + cuda::std::string_view s("xxx1010101010xxxx"); + cuda::std::bitset v(s, 3, 10); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[3 + M - 1 - i] == '1')); + } + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + assert(v[i] == false); + } + } + { + cuda::std::string_view s("xxx1a1a1a1a1axxxx"); + cuda::std::bitset v(s, 3, 10, 'a'); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[3 + M - 1 - i] == '1')); + } + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + assert(v[i] == false); + } + } + { + cuda::std::string_view s("xxxbababababaxxxx"); + cuda::std::bitset v(s, 3, 10, 'a', 'b'); + cuda::std::size_t M = cuda::std::min(v.size(), 10); + for (cuda::std::size_t i = 0; i < M; ++i) + { + assert(v[i] == (s[3 + M - 1 - i] == 'b')); + } + for (cuda::std::size_t i = 10; i < v.size(); ++i) + { + assert(v[i] == false); + } + } +} + +struct Nonsense +{ + virtual ~Nonsense() {} +}; + +constexpr void test_for_non_eager_instantiation() +{ + // Ensure we don't accidentally instantiate `cuda::std::basic_string_view` + // since it may not be well formed and can cause an error in the + // non-immediate context. + static_assert(!cuda::std::is_constructible, Nonsense*>::value, ""); + static_assert( + !cuda::std::is_constructible, Nonsense*, cuda::std::size_t, Nonsense&, Nonsense&>::value, ""); +} + +constexpr bool test() +{ + test_string_ctor<0>(); + test_string_ctor<1>(); + test_string_ctor<31>(); + test_string_ctor<32>(); + test_string_ctor<33>(); + test_string_ctor<63>(); + test_string_ctor<64>(); + test_string_ctor<65>(); + test_string_ctor<1000>(); + test_for_non_eager_instantiation(); + + return true; +} + +int main(int, char**) +{ + test(); + static_assert(test(), ""); + + return 0; +} + +#endif diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/ull_ctor.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/ull_ctor.pass.cpp new file mode 100644 index 0000000000..cbe955b61d --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.cons/ull_ctor.pass.cpp @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset(unsigned long long val); // constexpr since C++23 + +#include +#include +// #include // for 'min' and 'max' +#include + +#include "test_macros.h" + +// TEST_MSVC_DIAGNOSTIC_IGNORED(6294) // Ill-defined for-loop: initial condition does not satisfy test. Loop body not +// executed. 
+_CCCL_NV_DIAG_SUPPRESS(186)
+
+template <cuda::std::size_t N>
+__host__ __device__ TEST_CONSTEXPR_CXX14 void test_val_ctor()
+{
+  {
+    TEST_CONSTEXPR cuda::std::bitset<N> v(0xAAAAAAAAAAAAAAAAULL);
+    assert(v.size() == N);
+    cuda::std::size_t M = cuda::std::min<cuda::std::size_t>(v.size(), 64);
+    for (cuda::std::size_t i = 0; i < M; ++i)
+    {
+      assert(v[i] == ((i & 1) != 0));
+    }
+    _CCCL_DIAG_PUSH
+    _CCCL_DIAG_SUPPRESS_ICC(186)
+    for (cuda::std::size_t i = M; i < v.size(); ++i)
+    {
+      _CCCL_DIAG_POP
+      {
+        assert(v[i] == false);
+      }
+    }
+  }
+  {
+    constexpr cuda::std::bitset<N> v(0xAAAAAAAAAAAAAAAAULL);
+    static_assert(v.size() == N, "");
+  }
+}
+
+__host__ __device__ TEST_CONSTEXPR_CXX14 bool test()
+{
+  test_val_ctor<0>();
+  test_val_ctor<1>();
+  test_val_ctor<31>();
+  test_val_ctor<32>();
+  test_val_ctor<33>();
+  test_val_ctor<63>();
+  test_val_ctor<64>();
+  test_val_ctor<65>();
+  test_val_ctor<1000>();
+
+  return true;
+}
+
+int main(int, char**)
+{
+  test();
+#if TEST_STD_VER >= 2014
+  static_assert(test(), "");
+#endif // TEST_STD_VER >= 2014
+
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/all.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/all.pass.cpp
new file mode 100644
index 0000000000..76bc4de94d
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/all.pass.cpp
@@ -0,0 +1,55 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// bool all() const; // constexpr since C++23
+
+#include <cuda/std/bitset>
+#include <cuda/std/cassert>
+#include <cuda/std/cstddef>
+
+#include "test_macros.h"
+
+template <cuda::std::size_t N>
+__host__ __device__ TEST_CONSTEXPR_CXX14 void test_all()
+{
+  cuda::std::bitset<N> v;
+  v.reset();
+  assert(v.all() == (N == 0));
+  v.set();
+  assert(v.all() == true);
+  if (v.size() > 1)
+  {
+    v[N / 2] = false;
+    assert(v.all() == false);
+  }
+}
+
+__host__ __device__ TEST_CONSTEXPR_CXX14 bool test()
+{
+  test_all<0>();
+  test_all<1>();
+  test_all<31>();
+  test_all<32>();
+  test_all<33>();
+  test_all<63>();
+  test_all<64>();
+  test_all<65>();
+  test_all<1000>();
+
+  return true;
+}
+
+int main(int, char**)
+{
+  test();
+#if TEST_STD_VER >= 2014
+  static_assert(test(), "");
+#endif // TEST_STD_VER >= 2014
+
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/any.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/any.pass.cpp
new file mode 100644
index 0000000000..f4549ae6ac
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/any.pass.cpp
@@ -0,0 +1,58 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bool any() const; // constexpr since C++23 + +#include +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_any() +{ + cuda::std::bitset v; + v.reset(); + assert(v.any() == false); + v.set(); + assert(v.any() == (N != 0)); + if (v.size() > 1) + { + v[N / 2] = false; + assert(v.any() == true); + v.reset(); + v[N / 2] = true; + assert(v.any() == true); + } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_any<0>(); + test_any<1>(); + test_any<31>(); + test_any<32>(); + test_any<33>(); + test_any<63>(); + test_any<64>(); + test_any<65>(); + test_any<1000>(); + + return true; +} + +int main(int, char**) +{ + test(); +#if TEST_STD_VER >= 2014 + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/count.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/count.pass.cpp new file mode 100644 index 0000000000..233b40fa98 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/count.pass.cpp @@ -0,0 +1,69 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// size_t count() const; // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +_CCCL_NV_DIAG_SUPPRESS(186) + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_count() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c = 0; c != cases.size(); ++c) + { + const cuda::std::bitset v(cases[c]); + cuda::std::size_t c1 = v.count(); + cuda::std::size_t c2 = 0; + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_ICC(186) + for (cuda::std::size_t i = 0; i < v.size(); ++i) + { + _CCCL_DIAG_POP + { + if (v[i]) + { + ++c2; + } + } + } + assert(c1 == c2); + } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_count<0>(); + test_count<1>(); + test_count<31>(); + test_count<32>(); + test_count<33>(); + test_count<63>(); + test_count<64>(); + test_count<65>(); + + return true; +} + +int main(int, char**) +{ + test(); + test_count<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/flip_all.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/flip_all.pass.cpp new file mode 100644 index 0000000000..b9f9f2b897 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/flip_all.pass.cpp @@ -0,0 +1,65 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset& flip(); // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +_CCCL_NV_DIAG_SUPPRESS(186) + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_flip_all() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c = 0; c != cases.size(); ++c) + { + cuda::std::bitset v1(cases[c]); + cuda::std::bitset v2 = v1; + v2.flip(); + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_ICC(186) + for (cuda::std::size_t i = 0; i < v1.size(); ++i) + { + _CCCL_DIAG_POP + { + assert(v2[i] == ~v1[i]); + } + } + } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_flip_all<0>(); + test_flip_all<1>(); + test_flip_all<31>(); + test_flip_all<32>(); + test_flip_all<33>(); + test_flip_all<63>(); + test_flip_all<64>(); + test_flip_all<65>(); + + return true; +} + +int main(int, char**) +{ + test(); + test_flip_all<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/flip_one.out_of_range.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/flip_one.out_of_range.pass.cpp new file mode 100644 index 0000000000..31b2a5995b --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/flip_one.out_of_range.pass.cpp @@ -0,0 +1,52 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: libcpp-no-exceptions + +// bitset& flip(size_t pos); // constexpr since C++23 + +// Make sure we throw ::std::out_of_range when calling flip() on an OOB index. + +#include +#include + +int main(int, char**) +{ + NV_IF_TARGET( + NV_IS_HOST, + { + cuda::std::bitset<0> v; + try + { + v.flip(0); + assert(false); + } + catch (::std::out_of_range const&) + {} + } { + cuda::std::bitset<1> v("0"); + try + { + v.flip(2); + assert(false); + } + catch (::std::out_of_range const&) + {} + } { + cuda::std::bitset<10> v("0000000000"); + try + { + v.flip(10); + assert(false); + } + catch (::std::out_of_range const&) + {} + }) + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/flip_one.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/flip_one.pass.cpp new file mode 100644 index 0000000000..2d6ebd3eea --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/flip_one.pass.cpp @@ -0,0 +1,65 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset& flip(size_t pos); // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +_CCCL_NV_DIAG_SUPPRESS(186) + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_flip_one() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c = 0; c != cases.size(); ++c) + { + cuda::std::bitset v(cases[c]); + if (v.size() > 0) + { + cuda::std::size_t middle = v.size() / 2; + v.flip(middle); + bool b = v[middle]; + assert(v[middle] == b); + v.flip(middle); + assert(v[middle] != b); + v.flip(middle); + assert(v[middle] == b); + } + } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_flip_one<0>(); + test_flip_one<1>(); + test_flip_one<31>(); + test_flip_one<32>(); + test_flip_one<33>(); + test_flip_one<63>(); + test_flip_one<64>(); + test_flip_one<65>(); + + return true; +} + +int main(int, char**) +{ + test(); + test_flip_one<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/index.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/index.pass.cpp new file mode 100644 index 0000000000..a34fbe8e11 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/index.pass.cpp @@ -0,0 +1,81 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset::reference operator[](size_t pos); // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_index() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c = 0; c != cases.size(); ++c) + { + cuda::std::bitset v1(cases[c]); + if (v1.size() > 0) + { + assert(v1[N / 2] == v1.test(N / 2)); + typename cuda::std::bitset::reference r = v1[N / 2]; + assert(r == v1.test(N / 2)); + typename cuda::std::bitset::reference r2 = v1[N / 2]; + r = r2; + assert(r == v1.test(N / 2)); + r = false; + assert(r == false); + assert(v1.test(N / 2) == false); + r = true; + assert(r == true); + assert(v1.test(N / 2) == true); + bool b = ~r; + assert(r == true); + assert(v1.test(N / 2) == true); + assert(b == false); + r.flip(); + assert(r == false); + assert(v1.test(N / 2) == false); + } + ASSERT_SAME_TYPE(decltype(v1[0]), typename cuda::std::bitset::reference); + } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_index<0>(); + test_index<1>(); + test_index<31>(); + test_index<32>(); + test_index<33>(); + test_index<63>(); + test_index<64>(); + test_index<65>(); + + cuda::std::bitset<1> set; + set[0] = false; + auto b = set[0]; + set[0] = true; + assert(b); + + return true; +} + +int main(int, char**) +{ + test(); + test_index<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/index_const.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/index_const.pass.cpp new file mode 100644 index 0000000000..eacfa4e54b --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/index_const.pass.cpp @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// constexpr bool operator[](size_t pos) const; // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_index_const() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c = 0; c != cases.size(); ++c) + { + cuda::std::bitset const v(cases[c]); + if (v.size() > 0) + { + assert(v[N / 2] == v.test(N / 2)); + } +#if !defined(_LIBCUDACXX_VERSION) || defined(_LIBCUDACXX_ABI_BITSET_span_BOOL_CONST_SUBSCRIPT_RETURN_BOOL) + ASSERT_SAME_TYPE(decltype(v[0]), bool); +#else + ASSERT_SAME_TYPE(decltype(v[0]), typename cuda::std::bitset::const_reference); +#endif + } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_index_const<0>(); + test_index_const<1>(); + test_index_const<31>(); + test_index_const<32>(); + test_index_const<33>(); + test_index_const<63>(); + test_index_const<64>(); + test_index_const<65>(); + + cuda::std::bitset<1> set_; + set_[0] = false; + const auto& set = set_; + auto b = set[0]; + set_[0] = true; +#if !defined(_LIBCUDACXX_VERSION) || defined(_LIBCUDACXX_ABI_BITSET_span_BOOL_CONST_SUBSCRIPT_RETURN_BOOL) + assert(!b); +#else + assert(b); +#endif + + return true; +} + +int main(int, char**) +{ + test(); + test_index_const<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/left_shift.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/left_shift.pass.cpp new file mode 100644 index 0000000000..ceecfab0a1 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/left_shift.pass.cpp @@ -0,0 +1,71 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// The CI "Apple back-deployment with assertions enabled" needs a higher value +// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=12712420 + +// bitset operator<<(size_t pos) const; // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +template (-1)> +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test_left_shift() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + if (Start == 9) + { + assert(End >= cases.size()); + } + for (cuda::std::size_t c = Start; c != cases.size() && c != End; ++c) + { + for (cuda::std::size_t s = 0; s <= N + 1; ++s) + { + cuda::std::bitset v1(cases[c]); + cuda::std::bitset v2 = v1; + assert((v1 <<= s) == (v2 << s)); + } + } + + return true; +} + +int main(int, char**) +{ + test_left_shift<0>(); + test_left_shift<1>(); + test_left_shift<31>(); + test_left_shift<32>(); + test_left_shift<33>(); + test_left_shift<63>(); + test_left_shift<64>(); + test_left_shift<65>(); + test_left_shift<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test_left_shift<0>(), ""); + static_assert(test_left_shift<1>(), ""); + static_assert(test_left_shift<31>(), ""); + static_assert(test_left_shift<32>(), ""); + static_assert(test_left_shift<33>(), ""); + static_assert(test_left_shift<63, 0, 6>(), ""); + static_assert(test_left_shift<63, 6>(), ""); + static_assert(test_left_shift<64, 0, 6>(), ""); + static_assert(test_left_shift<64, 6>(), ""); + static_assert(test_left_shift<65, 0, 3>(), ""); + static_assert(test_left_shift<65, 3, 6>(), ""); + static_assert(test_left_shift<65, 6, 9>(), ""); + static_assert(test_left_shift<65, 9>(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/left_shift_eq.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/left_shift_eq.pass.cpp new file mode 100644 index 0000000000..8572c1376c --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/left_shift_eq.pass.cpp @@ -0,0 +1,89 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset& operator<<=(size_t pos); // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +_CCCL_NV_DIAG_SUPPRESS(186) + +template (-1)> +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test_left_shift() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + if (Start >= 9) + { + assert(End >= cases.size()); + } + for (cuda::std::size_t c = Start; c != cases.size() && c != End; ++c) + { + for (cuda::std::size_t s = 0; s <= N + 1; ++s) + { + cuda::std::bitset v1(cases[c]); + cuda::std::bitset v2 = v1; + v1 <<= s; + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_ICC(186) + for (cuda::std::size_t i = 0; i < v1.size(); ++i) + { + _CCCL_DIAG_POP + { + if (i < s) + { + assert(v1[i] == 0); + } + else + { + assert(v1[i] == v2[i - s]); + } + } + } + } + } + return true; +} + +int main(int, char**) +{ + test_left_shift<0>(); + test_left_shift<1>(); + test_left_shift<31>(); + test_left_shift<32>(); + test_left_shift<33>(); + test_left_shift<63>(); + test_left_shift<64>(); + test_left_shift<65>(); + test_left_shift<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test_left_shift<0>(), ""); + static_assert(test_left_shift<1>(), ""); + static_assert(test_left_shift<31>(), ""); + static_assert(test_left_shift<32>(), ""); + static_assert(test_left_shift<33>(), ""); + static_assert(test_left_shift<63, 0, 3>(), ""); + static_assert(test_left_shift<63, 3, 6>(), ""); + static_assert(test_left_shift<63, 6, 9>(), ""); + static_assert(test_left_shift<63, 9>(), ""); + static_assert(test_left_shift<64, 0, 3>(), ""); + static_assert(test_left_shift<64, 3, 6>(), ""); + static_assert(test_left_shift<64, 6, 9>(), ""); + static_assert(test_left_shift<64, 9>(), ""); + static_assert(test_left_shift<65, 0, 3>(), ""); + static_assert(test_left_shift<65, 3, 6>(), ""); + static_assert(test_left_shift<65, 6, 9>(), ""); + static_assert(test_left_shift<65, 9>(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/none.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/none.pass.cpp new file mode 100644 index 0000000000..4de9bf340e --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/none.pass.cpp @@ -0,0 +1,58 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bool none() const; // constexpr since C++23 + +#include +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_none() +{ + cuda::std::bitset v; + v.reset(); + assert(v.none() == true); + v.set(); + assert(v.none() == (N == 0)); + if (v.size() > 1) + { + v[N / 2] = false; + assert(v.none() == false); + v.reset(); + v[N / 2] = true; + assert(v.none() == false); + } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_none<0>(); + test_none<1>(); + test_none<31>(); + test_none<32>(); + test_none<33>(); + test_none<63>(); + test_none<64>(); + test_none<65>(); + test_none<1000>(); + + return true; +} + +int main(int, char**) +{ + test(); +#if TEST_STD_VER >= 2014 + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/not_all.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/not_all.pass.cpp new file mode 100644 index 0000000000..5a268391a2 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/not_all.pass.cpp @@ -0,0 +1,64 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset operator~() const; // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +_CCCL_NV_DIAG_SUPPRESS(186) + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_not_all() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c = 0; c != cases.size(); ++c) + { + cuda::std::bitset v1(cases[c]); + cuda::std::bitset v2 = ~v1; + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_ICC(186) + for (cuda::std::size_t i = 0; i < v1.size(); ++i) + { + _CCCL_DIAG_POP + { + assert(v2[i] == ~v1[i]); + } + } + } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_not_all<0>(); + test_not_all<1>(); + test_not_all<31>(); + test_not_all<32>(); + test_not_all<33>(); + test_not_all<63>(); + test_not_all<64>(); + test_not_all<65>(); + + return true; +} + +int main(int, char**) +{ + test(); + test_not_all<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_and_eq.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_and_eq.pass.cpp new file mode 100644 index 0000000000..edb4f4512a --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_and_eq.pass.cpp @@ -0,0 +1,71 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset& operator&=(const bitset& rhs); // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +_CCCL_NV_DIAG_SUPPRESS(186) + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test_op_and_eq() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c1 = 0; c1 != cases.size(); ++c1) + { + for (cuda::std::size_t c2 = 0; c2 != cases.size(); ++c2) + { + cuda::std::bitset v1(cases[c1]); + cuda::std::bitset v2(cases[c2]); + cuda::std::bitset v3 = v1; + v1 &= v2; + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_ICC(186) + for (cuda::std::size_t i = 0; i < v1.size(); ++i) + { + _CCCL_DIAG_POP + { + assert(v1[i] == (v3[i] && v2[i])); + } + } + } + } + + return true; +} + +int main(int, char**) +{ + test_op_and_eq<0>(); + test_op_and_eq<1>(); + test_op_and_eq<31>(); + test_op_and_eq<32>(); + test_op_and_eq<33>(); + test_op_and_eq<63>(); + test_op_and_eq<64>(); + test_op_and_eq<65>(); + test_op_and_eq<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test_op_and_eq<0>(), ""); + static_assert(test_op_and_eq<1>(), ""); + static_assert(test_op_and_eq<31>(), ""); + static_assert(test_op_and_eq<32>(), ""); + static_assert(test_op_and_eq<33>(), ""); + static_assert(test_op_and_eq<63>(), ""); + static_assert(test_op_and_eq<64>(), ""); + static_assert(test_op_and_eq<65>(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_eq_eq.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_eq_eq.pass.cpp new file mode 100644 index 0000000000..4eb828b040 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_eq_eq.pass.cpp @@ -0,0 +1,62 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// test: + +// bool operator==(const bitset& rhs) const; // constexpr since C++23 +// bool operator!=(const bitset& rhs) const; // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_equality() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c = 0; c != cases.size(); ++c) + { + cuda::std::bitset const v1(cases[c]); + cuda::std::bitset v2 = v1; + assert(v1 == v2); + if (v1.size() > 0) + { + v2[N / 2].flip(); + assert(v1 != v2); + } + } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_equality<0>(); + test_equality<1>(); + test_equality<31>(); + test_equality<32>(); + test_equality<33>(); + test_equality<63>(); + test_equality<64>(); + test_equality<65>(); + + return true; +} + +int main(int, char**) +{ + test(); + test_equality<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_or_eq.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_or_eq.pass.cpp new file mode 100644 index 0000000000..995aed4c7a --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_or_eq.pass.cpp @@ -0,0 +1,78 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=15000000 + +// bitset& operator|=(const bitset& rhs); // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +_CCCL_NV_DIAG_SUPPRESS(186) + +template (-1)> +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test_op_or_eq() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + if (Start != 0) + { + assert(End >= cases.size()); + } + for (cuda::std::size_t c1 = Start; c1 != cases.size() && c1 != End; ++c1) + { + for (cuda::std::size_t c2 = 0; c2 != cases.size(); ++c2) + { + cuda::std::bitset v1(cases[c1]); + cuda::std::bitset v2(cases[c2]); + cuda::std::bitset v3 = v1; + v1 |= v2; + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_ICC(186) + for (cuda::std::size_t i = 0; i < v1.size(); ++i) + { + _CCCL_DIAG_POP + { + assert(v1[i] == (v3[i] || v2[i])); + } + } + } + } + + return true; +} + +int main(int, char**) +{ + test_op_or_eq<0>(); + test_op_or_eq<1>(); + test_op_or_eq<31>(); + test_op_or_eq<32>(); + test_op_or_eq<33>(); + test_op_or_eq<63>(); + test_op_or_eq<64>(); + test_op_or_eq<65>(); + test_op_or_eq<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test_op_or_eq<0>(), ""); + static_assert(test_op_or_eq<1>(), ""); + static_assert(test_op_or_eq<31>(), ""); + static_assert(test_op_or_eq<32>(), ""); + static_assert(test_op_or_eq<33>(), ""); + static_assert(test_op_or_eq<63>(), ""); + static_assert(test_op_or_eq<64>(), ""); + static_assert(test_op_or_eq<65, 0, 6>(), ""); + static_assert(test_op_or_eq<65, 6>(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_xor_eq.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_xor_eq.pass.cpp new file mode 100644 index 0000000000..5db92124e8 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/op_xor_eq.pass.cpp @@ -0,0 +1,77 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset& operator^=(const bitset& rhs); // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +_CCCL_NV_DIAG_SUPPRESS(186) + +template (-1)> +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test_op_xor_eq() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + if (Start >= 9) + { + assert(End >= cases.size()); + } + for (cuda::std::size_t c1 = Start; c1 != cases.size() && c1 != End; ++c1) + { + for (cuda::std::size_t c2 = 0; c2 != cases.size(); ++c2) + { + cuda::std::bitset v1(cases[c1]); + cuda::std::bitset v2(cases[c2]); + cuda::std::bitset v3 = v1; + v1 ^= v2; + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_ICC(186) + for (cuda::std::size_t i = 0; i < v1.size(); ++i) + { + _CCCL_DIAG_POP + { + assert(v1[i] == (v3[i] != v2[i])); + } + } + } + } + return true; +} + +int main(int, char**) +{ + test_op_xor_eq<0>(); + test_op_xor_eq<1>(); + test_op_xor_eq<31>(); + test_op_xor_eq<32>(); + test_op_xor_eq<33>(); + test_op_xor_eq<63>(); + test_op_xor_eq<64>(); + test_op_xor_eq<65>(); + test_op_xor_eq<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test_op_xor_eq<0>(), ""); + static_assert(test_op_xor_eq<1>(), ""); + static_assert(test_op_xor_eq<31>(), ""); + static_assert(test_op_xor_eq<32>(), ""); + static_assert(test_op_xor_eq<33>(), ""); + static_assert(test_op_xor_eq<63, 0, 6>(), ""); + static_assert(test_op_xor_eq<63, 6>(), ""); + static_assert(test_op_xor_eq<64, 0, 6>(), ""); + static_assert(test_op_xor_eq<64, 6>(), ""); + static_assert(test_op_xor_eq<65, 0, 6>(), ""); + static_assert(test_op_xor_eq<65, 6>(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/reset_all.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/reset_all.pass.cpp new file mode 100644 index 0000000000..e7f3ba1fed --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/reset_all.pass.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// bitset& reset(); // constexpr since C++23
+
+#include <cuda/std/bitset>
+#include <cuda/std/cassert>
+#include <cuda/std/cstddef>
+
+#include "test_macros.h"
+
+_CCCL_NV_DIAG_SUPPRESS(186)
+
+template <cuda::std::size_t N>
+__host__ __device__ TEST_CONSTEXPR_CXX14 void test_reset_all()
+{
+  cuda::std::bitset<N> v;
+  v.set();
+  v.reset();
+  _CCCL_DIAG_PUSH
+  _CCCL_DIAG_SUPPRESS_ICC(186)
+  for (cuda::std::size_t i = 0; i < v.size(); ++i)
+  {
+    _CCCL_DIAG_POP
+    {
+      assert(!v[i]);
+    }
+  }
+}
+
+__host__ __device__ TEST_CONSTEXPR_CXX14 bool test()
+{
+  test_reset_all<0>();
+  test_reset_all<1>();
+  test_reset_all<31>();
+  test_reset_all<32>();
+  test_reset_all<33>();
+  test_reset_all<63>();
+  test_reset_all<64>();
+  test_reset_all<65>();
+  test_reset_all<1000>();
+
+  return true;
+}
+
+int main(int, char**)
+{
+  test();
+#if TEST_STD_VER >= 2014
+  static_assert(test(), "");
+#endif // TEST_STD_VER >= 2014
+
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/reset_one.out_of_range.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/reset_one.out_of_range.pass.cpp
new file mode 100644
index 0000000000..787dedc2d4
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/reset_one.out_of_range.pass.cpp
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: libcpp-no-exceptions
+
+// bitset& reset(size_t pos); // constexpr since C++23
+
+// Make sure we throw ::std::out_of_range when calling reset() on an OOB index.
+
+#include <cuda/std/bitset>
+#include <cuda/std/cassert>
+
+int main(int, char**)
+{
+  NV_IF_TARGET(
+    NV_IS_HOST,
+    {
+      cuda::std::bitset<0> v;
+      try
+      {
+        v.reset(0);
+        assert(false);
+      }
+      catch (::std::out_of_range const&)
+      {}
+    } {
+      cuda::std::bitset<1> v("0");
+      try
+      {
+        v.reset(2);
+        assert(false);
+      }
+      catch (::std::out_of_range const&)
+      {}
+    } {
+      cuda::std::bitset<10> v("0000000000");
+      try
+      {
+        v.reset(10);
+        assert(false);
+      }
+      catch (::std::out_of_range const&)
+      {}
+    })
+
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/reset_one.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/reset_one.pass.cpp
new file mode 100644
index 0000000000..f5f8ff8838
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/reset_one.pass.cpp
@@ -0,0 +1,75 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// bitset& reset(size_t pos); // constexpr since C++23
+
+#include <cuda/std/bitset>
+#include <cuda/std/cassert>
+#include <cuda/std/cstddef>
+
+#include "../bitset_test_cases.h"
+#include "test_macros.h"
+
+// TEST_MSVC_DIAGNOSTIC_IGNORED(6294) // Ill-defined for-loop: initial condition does not satisfy test. Loop body not
+// executed.
+ +_CCCL_NV_DIAG_SUPPRESS(186) + +template (-1)> +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test_reset_one() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + if (Start >= 9) + { + assert(End >= cases.size()); + } + for (cuda::std::size_t c = Start; c != cases.size() && c != End; ++c) + { + for (cuda::std::size_t i = 0; i != N; ++i) + { + cuda::std::bitset v(cases[c]); + v.reset(i); + assert(v[i] == false); + } + } + + return true; +} + +int main(int, char**) +{ + test_reset_one<0>(); + test_reset_one<1>(); + test_reset_one<31>(); + test_reset_one<32>(); + test_reset_one<33>(); + test_reset_one<63>(); + test_reset_one<64>(); + test_reset_one<65>(); + test_reset_one<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test_reset_one<0>(), ""); + static_assert(test_reset_one<1>(), ""); + static_assert(test_reset_one<31>(), ""); + static_assert(test_reset_one<32>(), ""); + static_assert(test_reset_one<33>(), ""); + static_assert(test_reset_one<63, 0, 6>(), ""); + static_assert(test_reset_one<63, 6>(), ""); + static_assert(test_reset_one<64, 0, 3>(), ""); + static_assert(test_reset_one<64, 3, 6>(), ""); + static_assert(test_reset_one<64, 6, 9>(), ""); + static_assert(test_reset_one<64, 9>(), ""); + static_assert(test_reset_one<65, 0, 3>(), ""); + static_assert(test_reset_one<65, 3, 6>(), ""); + static_assert(test_reset_one<65, 6, 9>(), ""); + static_assert(test_reset_one<65, 9>(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/right_shift.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/right_shift.pass.cpp new file mode 100644 index 0000000000..5a4f351c7b --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/right_shift.pass.cpp @@ -0,0 +1,69 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset operator>>(size_t pos) const; // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +_CCCL_NV_DIAG_SUPPRESS(186) + +template (-1)> +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test_right_shift() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + if (Start >= 9) + { + assert(End >= cases.size()); + } + for (cuda::std::size_t c = Start; c != cases.size() && c != End; ++c) + { + for (cuda::std::size_t s = 0; s <= N + 1; ++s) + { + cuda::std::bitset v1(cases[c]); + cuda::std::bitset v2 = v1; + assert((v1 >>= s) == (v2 >> s)); + } + } + return true; +} + +__host__ __device__ int main(int, char**) +{ + test_right_shift<0>(); + test_right_shift<1>(); + test_right_shift<31>(); + test_right_shift<32>(); + test_right_shift<33>(); + test_right_shift<63>(); + test_right_shift<64>(); + test_right_shift<65>(); + test_right_shift<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test_right_shift<0>(), ""); + static_assert(test_right_shift<1>(), ""); + static_assert(test_right_shift<31>(), ""); + static_assert(test_right_shift<32>(), ""); + static_assert(test_right_shift<33>(), ""); + static_assert(test_right_shift<63, 0, 6>(), ""); + static_assert(test_right_shift<63, 6>(), ""); + static_assert(test_right_shift<64, 0, 6>(), ""); + static_assert(test_right_shift<64, 6>(), ""); + static_assert(test_right_shift<65, 0, 3>(), ""); + static_assert(test_right_shift<65, 3, 6>(), ""); + static_assert(test_right_shift<65, 6, 9>(), ""); + static_assert(test_right_shift<65, 9>(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/right_shift_eq.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/right_shift_eq.pass.cpp new file mode 100644 index 0000000000..a4e8327b70 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/right_shift_eq.pass.cpp @@ -0,0 +1,91 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=15000000 + +// bitset& operator<<=(size_t pos); // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +_CCCL_NV_DIAG_SUPPRESS(186) + +template (-1)> +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test_right_shift() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + if (Start >= 9) + { + assert(End >= cases.size()); + } + for (cuda::std::size_t c = Start; c != cases.size() && c != End; ++c) + { + for (cuda::std::size_t s = 0; s <= N + 1; ++s) + { + cuda::std::bitset v1(cases[c]); + cuda::std::bitset v2 = v1; + v1 >>= s; + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_ICC(186) + for (cuda::std::size_t i = 0; i < v1.size(); ++i) + { + if (i + s < v1.size()) + { + _CCCL_DIAG_POP + { + assert(v1[i] == v2[i + s]); + } + } + else + { + assert(v1[i] == 0); + } + } + } + } + return true; +} + +__host__ __device__ int main(int, char**) +{ + test_right_shift<0>(); + test_right_shift<1>(); + test_right_shift<31>(); + test_right_shift<32>(); + test_right_shift<33>(); + test_right_shift<63>(); + test_right_shift<64>(); + test_right_shift<65>(); + test_right_shift<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test_right_shift<0>(), ""); + static_assert(test_right_shift<1>(), ""); + static_assert(test_right_shift<31>(), ""); + static_assert(test_right_shift<32>(), ""); + static_assert(test_right_shift<33>(), ""); + static_assert(test_right_shift<63, 0, 3>(), ""); + static_assert(test_right_shift<63, 3, 6>(), ""); + static_assert(test_right_shift<63, 6, 9>(), ""); + static_assert(test_right_shift<63, 9>(), ""); + static_assert(test_right_shift<64, 0, 3>(), ""); + static_assert(test_right_shift<64, 3, 6>(), ""); + static_assert(test_right_shift<64, 6, 9>(), ""); + static_assert(test_right_shift<64, 9>(), ""); + static_assert(test_right_shift<65, 0, 3>(), ""); + static_assert(test_right_shift<65, 3, 6>(), ""); + static_assert(test_right_shift<65, 6, 9>(), ""); + static_assert(test_right_shift<65, 9>(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/set_all.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/set_all.pass.cpp new file mode 100644 index 0000000000..f08ff34b5b --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/set_all.pass.cpp @@ -0,0 +1,58 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// bitset& set(); // constexpr since C++23
+
+#include <cuda/std/bitset>
+#include <cuda/std/cassert>
+#include <cuda/std/cstddef>
+
+#include "test_macros.h"
+
+_CCCL_NV_DIAG_SUPPRESS(186)
+
+template <cuda::std::size_t N>
+__host__ __device__ TEST_CONSTEXPR_CXX14 void test_set_all()
+{
+  cuda::std::bitset<N> v;
+  v.set();
+  _CCCL_DIAG_PUSH
+  _CCCL_DIAG_SUPPRESS_ICC(186)
+  for (cuda::std::size_t i = 0; i < v.size(); ++i)
+  {
+    _CCCL_DIAG_POP
+    {
+      assert(v[i]);
+    }
+  }
+}
+
+__host__ __device__ TEST_CONSTEXPR_CXX14 bool test()
+{
+  test_set_all<0>();
+  test_set_all<1>();
+  test_set_all<31>();
+  test_set_all<32>();
+  test_set_all<33>();
+  test_set_all<63>();
+  test_set_all<64>();
+  test_set_all<65>();
+  test_set_all<1000>();
+
+  return true;
+}
+
+int main(int, char**)
+{
+  test();
+#if TEST_STD_VER >= 2014
+  static_assert(test(), "");
+#endif // TEST_STD_VER >= 2014
+
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/set_one.out_of_range.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/set_one.out_of_range.pass.cpp
new file mode 100644
index 0000000000..810f9210d8
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/set_one.out_of_range.pass.cpp
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: libcpp-no-exceptions
+
+// bitset& set(size_t pos, bool val = true); // constexpr since C++23
+
+// Make sure we throw ::std::out_of_range when calling set() on an OOB index.
+
+#include <cuda/std/bitset>
+#include <cuda/std/cassert>
+
+int main(int, char**)
+{
+  NV_IF_TARGET(
+    NV_IS_HOST,
+    {
+      cuda::std::bitset<0> v;
+      try
+      {
+        v.set(0);
+        assert(false);
+      }
+      catch (::std::out_of_range const&)
+      {}
+    } {
+      cuda::std::bitset<1> v("0");
+      try
+      {
+        v.set(2);
+        assert(false);
+      }
+      catch (::std::out_of_range const&)
+      {}
+    } {
+      cuda::std::bitset<10> v("0000000000");
+      try
+      {
+        v.set(10);
+        assert(false);
+      }
+      catch (::std::out_of_range const&)
+      {}
+    })
+
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/set_one.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/set_one.pass.cpp
new file mode 100644
index 0000000000..b619250c02
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/set_one.pass.cpp
@@ -0,0 +1,60 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset& set(size_t pos, bool val = true); // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_set_one() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c = 0; c != cases.size(); ++c) + { + cuda::std::bitset v(cases[c]); + if (v.size() > 0) + { + cuda::std::size_t middle = v.size() / 2; + v.set(middle); + assert(v[middle] == true); + v.set(middle, false); + assert(v[middle] == false); + } + } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_set_one<0>(); + test_set_one<1>(); + test_set_one<31>(); + test_set_one<32>(); + test_set_one<33>(); + test_set_one<63>(); + test_set_one<64>(); + test_set_one<65>(); + + return true; +} + +int main(int, char**) +{ + test(); + test_set_one<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/size.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/size.pass.cpp new file mode 100644 index 0000000000..0c65c13631 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/size.pass.cpp @@ -0,0 +1,46 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// size_t count() const; // constexpr since C++23 + +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_size() +{ + const cuda::std::bitset v; + assert(v.size() == N); +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_size<0>(); + test_size<1>(); + test_size<31>(); + test_size<32>(); + test_size<33>(); + test_size<63>(); + test_size<64>(); + test_size<65>(); + test_size<1000>(); + + return true; +} + +int main(int, char**) +{ + test(); +#if TEST_STD_VER >= 2014 + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/test.out_of_range.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/test.out_of_range.pass.cpp new file mode 100644 index 0000000000..efd0195344 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/test.out_of_range.pass.cpp @@ -0,0 +1,52 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: libcpp-no-exceptions + +// constexpr bool test(size_t pos) const; + +// Make sure we throw cuda::std::out_of_range when calling test() on an OOB index. + +#include +#include + +int main(int, char**) +{ + NV_IF_TARGET( + NV_IS_HOST, + { + cuda::std::bitset<0> v; + try + { + (void) v.test(0); + assert(false); + } + catch (::std::out_of_range const&) + {} + } { + cuda::std::bitset<1> v("0"); + try + { + (void) v.test(2); + assert(false); + } + catch (::std::out_of_range const&) + {} + } { + cuda::std::bitset<10> v("0000000000"); + try + { + (void) v.test(10); + assert(false); + } + catch (::std::out_of_range const&) + {} + }) + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/test.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/test.pass.cpp new file mode 100644 index 0000000000..909e798ea8 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/test.pass.cpp @@ -0,0 +1,60 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bool test(size_t pos) const; // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +_CCCL_NV_DIAG_SUPPRESS(186) + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_test() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c = 0; c != cases.size(); ++c) + { + cuda::std::bitset const v(cases[c]); + if (v.size() > 0) + { + cuda::std::size_t middle = v.size() / 2; + bool b = v.test(middle); + assert(b == v[middle]); + } + } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_test<0>(); + test_test<1>(); + test_test<31>(); + test_test<32>(); + test_test<33>(); + test_test<63>(); + test_test<64>(); + test_test<65>(); + + return true; +} + +int main(int, char**) +{ + test(); + test_test<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_string.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_string.pass.cpp new file mode 100644 index 0000000000..801a51535a --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_string.pass.cpp @@ -0,0 +1,185 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// test: + +// template +// basic_string +// to_string(charT zero = charT('0'), charT one = charT('1')) const; // constexpr since C++23 +// +// template +// basic_string > to_string() const; // constexpr since C++23 +// +// template +// basic_string, allocator > to_string() const; // constexpr since C++23 +// +// basic_string, allocator > to_string() const; // constexpr since C++23 + +#include + +#ifndef __LIBCUDACXX_HAS_STRING + +int main(int, char**) +{ + return 0; +} + +#else + +# include +# include +# include +# include // for cuda::std::allocator +# include +# include + +# include "../bitset_test_cases.h" +# include "test_macros.h" + +template +TEST_CONSTEXPR_CXX14 void +check_equal(cuda::std::basic_string const& s, cuda::std::bitset const& b, CharT zero, CharT one) +{ + assert(s.size() == b.size()); + for (cuda::std::size_t i = 0; i < b.size(); ++i) + { + if (b[i]) + { + assert(s[b.size() - 1 - i] == one); + } + else + { + assert(s[b.size() - 1 - i] == zero); + } + } +} + +template +TEST_CONSTEXPR_CXX14 bool test_to_string() +{ + cuda::std::vector> const cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c = 0; c != cases.size(); ++c) + { + cuda::std::bitset const v = cases[c]; + { + cuda::std::string s = v.template to_string(); + check_equal(s, v, '0', '1'); + } + { + cuda::std::string s = v.to_string(); + check_equal(s, v, '0', '1'); + } + { + cuda::std::string s = v.template to_string('0'); + check_equal(s, v, '0', '1'); + } + { + cuda::std::string s = v.to_string('0'); + check_equal(s, v, '0', '1'); + } + { + cuda::std::string s = v.template to_string('0', '1'); + check_equal(s, v, '0', '1'); + } + { + cuda::std::string s = v.to_string('0', '1'); + check_equal(s, v, '0', '1'); + } + { + cuda::std::string s = v.to_string('x', 'y'); + check_equal(s, v, 'x', 'y'); + } + } + return true; +} + +# ifndef TEST_HAS_NO_WIDE_CHARACTERS +template +TEST_CONSTEXPR_CXX14 bool test_to_string_wchar() +{ + cuda::std::vector> const cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c = 0; c != cases.size(); ++c) + { + cuda::std::bitset const v = cases[c]; + { + cuda::std::wstring s = + v.template to_string, cuda::std::allocator>(); + check_equal(s, v, L'0', L'1'); + } + { + cuda::std::wstring s = v.template to_string>(); + check_equal(s, v, L'0', L'1'); + } + { + cuda::std::wstring s = + v.template to_string, cuda::std::allocator>('0'); + check_equal(s, v, L'0', L'1'); + } + { + cuda::std::wstring s = v.template to_string>('0'); + check_equal(s, v, L'0', L'1'); + } + { + cuda::std::wstring s = + v.template to_string, cuda::std::allocator>('0', '1'); + check_equal(s, v, L'0', L'1'); + } + { + cuda::std::wstring s = v.template to_string>('0', '1'); + check_equal(s, v, L'0', L'1'); + } + } + return true; +} +# endif + +int main(int, char**) +{ + test_to_string<0>(); + test_to_string<1>(); + test_to_string<31>(); + test_to_string<32>(); + test_to_string<33>(); + test_to_string<63>(); + test_to_string<64>(); + test_to_string<65>(); + test_to_string<1000>(); // not in constexpr because of constexpr evaluation step limits +# if TEST_STD_VER >= 2023 + static_assert(test_to_string<0>(), ""); + static_assert(test_to_string<1>(), ""); + static_assert(test_to_string<31>(), ""); + static_assert(test_to_string<32>(), ""); + static_assert(test_to_string<33>(), ""); + static_assert(test_to_string<63>(), ""); 
+ static_assert(test_to_string<64>(), ""); + static_assert(test_to_string<65>(), ""); +# endif + +# ifndef TEST_HAS_NO_WIDE_CHARACTERS + test_to_string_wchar<0>(); + test_to_string_wchar<1>(); + test_to_string_wchar<31>(); + test_to_string_wchar<32>(); + test_to_string_wchar<33>(); + test_to_string_wchar<63>(); + test_to_string_wchar<64>(); + test_to_string_wchar<65>(); + test_to_string_wchar<1000>(); // not in constexpr because of constexpr evaluation step limits +# if TEST_STD_VER >= 2023 + static_assert(test_to_string_wchar<0>(), ""); + static_assert(test_to_string_wchar<1>(), ""); + static_assert(test_to_string_wchar<31>(), ""); + static_assert(test_to_string_wchar<32>(), ""); + static_assert(test_to_string_wchar<33>(), ""); + static_assert(test_to_string_wchar<63>(), ""); +# endif +# endif + return 0; +} + +#endif diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ullong.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ullong.pass.cpp new file mode 100644 index 0000000000..e76cfd4aed --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ullong.pass.cpp @@ -0,0 +1,75 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// unsigned long long to_ullong() const; // constexpr since C++23 + +#include +// #include +#include +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_to_ullong() +{ + const cuda::std::size_t M = sizeof(unsigned long long) * CHAR_BIT < N ? sizeof(unsigned long long) * CHAR_BIT : N; + const bool is_M_zero = cuda::std::integral_constant::value; // avoid compiler warnings + const cuda::std::size_t X = + is_M_zero ? sizeof(unsigned long long) * CHAR_BIT - 1 : sizeof(unsigned long long) * CHAR_BIT - M; + const unsigned long long max = is_M_zero ? 0 : (unsigned long long) (-1) >> X; + unsigned long long tests[] = { + 0, + cuda::std::min(1, max), + cuda::std::min(2, max), + cuda::std::min(3, max), + cuda::std::min(max, max - 3), + cuda::std::min(max, max - 2), + cuda::std::min(max, max - 1), + max}; + for (unsigned long long j : tests) + { + cuda::std::bitset v(j); + assert(j == v.to_ullong()); + } + { // test values bigger than can fit into the bitset + const unsigned long long val = 0x55AAAAFFFFAAAA55ULL; + const bool canFit = N < sizeof(unsigned long long) * CHAR_BIT; + const unsigned long long mask = canFit ? (1ULL << (canFit ? N : 0)) - 1 : (unsigned long long) (-1); // avoid + // compiler + // warnings + cuda::std::bitset v(val); + assert(v.to_ullong() == (val & mask)); // we shouldn't return bit patterns from outside the limits of the bitset. 
+ } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_to_ullong<0>(); + test_to_ullong<1>(); + test_to_ullong<31>(); + test_to_ullong<32>(); + test_to_ullong<33>(); + test_to_ullong<63>(); + test_to_ullong<64>(); + test_to_ullong<65>(); + test_to_ullong<1000>(); + + return true; +} + +int main(int, char**) +{ + test(); +#if TEST_STD_VER >= 2014 && (!defined(_CCCL_CUDACC_BELOW_11_8) || !defined(_CCCL_COMPILER_MSVC)) + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ulong.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ulong.pass.cpp new file mode 100644 index 0000000000..a4400a59fc --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ulong.pass.cpp @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// unsigned long to_ulong() const; // constexpr since C++23 + +#include +// #include +#include +#include +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_to_ulong() +{ + const cuda::std::size_t M = sizeof(unsigned long) * CHAR_BIT < N ? sizeof(unsigned long) * CHAR_BIT : N; + const bool is_M_zero = cuda::std::integral_constant::value; // avoid compiler warnings + const cuda::std::size_t X = is_M_zero ? sizeof(unsigned long) * CHAR_BIT - 1 : sizeof(unsigned long) * CHAR_BIT - M; + const cuda::std::size_t max = is_M_zero ? 0 : cuda::std::size_t(cuda::std::numeric_limits::max()) >> X; + cuda::std::size_t tests[] = { + 0, + cuda::std::min(1, max), + cuda::std::min(2, max), + cuda::std::min(3, max), + cuda::std::min(max, max - 3), + cuda::std::min(max, max - 2), + cuda::std::min(max, max - 1), + max}; + for (cuda::std::size_t j : tests) + { + cuda::std::bitset v(j); + assert(j == v.to_ulong()); + } + + { // test values bigger than can fit into the bitset + const unsigned long val = 0x5AFFFFA5UL; + const bool canFit = N < sizeof(unsigned long) * CHAR_BIT; + const unsigned long mask = canFit ? (1UL << (canFit ? N : 0)) - 1 : (unsigned long) (-1); // avoid compiler warnings + cuda::std::bitset v(val); + assert(v.to_ulong() == (val & mask)); // we shouldn't return bit patterns from outside the limits of the bitset. 
+ } +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_to_ulong<0>(); + test_to_ulong<1>(); + test_to_ulong<31>(); + test_to_ulong<32>(); + test_to_ulong<33>(); + test_to_ulong<63>(); + test_to_ulong<64>(); + test_to_ulong<65>(); + test_to_ulong<1000>(); + + return true; +} + +int main(int, char**) +{ + test(); +#if TEST_STD_VER >= 2014 && (!defined(_CCCL_CUDACC_BELOW_11_8) || !defined(_CCCL_COMPILER_MSVC)) + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/op_and.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/op_and.pass.cpp new file mode 100644 index 0000000000..c47ae6aeb9 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/op_and.pass.cpp @@ -0,0 +1,60 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset operator&(const bitset& lhs, const bitset& rhs); // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test_op_and() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c1 = 0; c1 != cases.size(); ++c1) + { + for (cuda::std::size_t c2 = 0; c2 != cases.size(); ++c2) + { + cuda::std::bitset v1(cases[c1]); + cuda::std::bitset v2(cases[c2]); + cuda::std::bitset v3 = v1; + assert((v1 & v2) == (v3 &= v2)); + } + } + + return true; +} + +int main(int, char**) +{ + test_op_and<0>(); + test_op_and<1>(); + test_op_and<31>(); + test_op_and<32>(); + test_op_and<33>(); + test_op_and<63>(); + test_op_and<64>(); + test_op_and<65>(); + test_op_and<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test_op_and<0>(), ""); + static_assert(test_op_and<1>(), ""); + static_assert(test_op_and<31>(), ""); + static_assert(test_op_and<32>(), ""); + static_assert(test_op_and<33>(), ""); + static_assert(test_op_and<63>(), ""); + static_assert(test_op_and<64>(), ""); + static_assert(test_op_and<65>(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/op_not.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/op_not.pass.cpp new file mode 100644 index 0000000000..3b2562c417 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/op_not.pass.cpp @@ -0,0 +1,60 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset operator^(const bitset& lhs, const bitset& rhs); // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test_op_not() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c1 = 0; c1 != cases.size(); ++c1) + { + for (cuda::std::size_t c2 = 0; c2 != cases.size(); ++c2) + { + cuda::std::bitset v1(cases[c1]); + cuda::std::bitset v2(cases[c2]); + cuda::std::bitset v3 = v1; + assert((v1 ^ v2) == (v3 ^= v2)); + } + } + + return true; +} + +int main(int, char**) +{ + test_op_not<0>(); + test_op_not<1>(); + test_op_not<31>(); + test_op_not<32>(); + test_op_not<33>(); + test_op_not<63>(); + test_op_not<64>(); + test_op_not<65>(); + test_op_not<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test_op_not<0>(), ""); + static_assert(test_op_not<1>(), ""); + static_assert(test_op_not<31>(), ""); + static_assert(test_op_not<32>(), ""); + static_assert(test_op_not<33>(), ""); + static_assert(test_op_not<63>(), ""); + static_assert(test_op_not<64>(), ""); + static_assert(test_op_not<65>(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/op_or.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/op_or.pass.cpp new file mode 100644 index 0000000000..5eb50ae733 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/op_or.pass.cpp @@ -0,0 +1,60 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// bitset operator|(const bitset& lhs, const bitset& rhs); // constexpr since C++23 + +#include +#include +#include + +#include "../bitset_test_cases.h" +#include "test_macros.h" + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test_op_or() +{ + auto const& cases = get_test_cases(cuda::std::integral_constant()); + for (cuda::std::size_t c1 = 0; c1 != cases.size(); ++c1) + { + for (cuda::std::size_t c2 = 0; c2 != cases.size(); ++c2) + { + cuda::std::bitset v1(cases[c1]); + cuda::std::bitset v2(cases[c2]); + cuda::std::bitset v3 = v1; + assert((v1 | v2) == (v3 |= v2)); + } + } + + return true; +} + +int main(int, char**) +{ + test_op_or<0>(); + test_op_or<1>(); + test_op_or<31>(); + test_op_or<32>(); + test_op_or<33>(); + test_op_or<63>(); + test_op_or<64>(); + test_op_or<65>(); + test_op_or<1000>(); // not in constexpr because of constexpr evaluation step limits +// 11.4 added support for constexpr device vars needed here +#if TEST_STD_VER >= 2014 && !defined(_CCCL_CUDACC_BELOW_11_4) + static_assert(test_op_or<0>(), ""); + static_assert(test_op_or<1>(), ""); + static_assert(test_op_or<31>(), ""); + static_assert(test_op_or<32>(), ""); + static_assert(test_op_or<33>(), ""); + static_assert(test_op_or<63>(), ""); + static_assert(test_op_or<64>(), ""); + static_assert(test_op_or<65>(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/stream_in.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/stream_in.pass.cpp new file mode 100644 index 0000000000..45ffa4bb59 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/stream_in.pass.cpp @@ -0,0 +1,100 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-localization + +// test: + +// template +// basic_istream& +// operator>>(basic_istream& is, bitset& x); + +#include + +#ifndef _LIBCUDACXX_HAS_SSTREAM +int main(int, char**) +{ + return 0; +} +#else + +# include +# include +# include + +# include "test_macros.h" + +int main(int, char**) +{ + { + cuda::std::istringstream in("01011010"); + cuda::std::bitset<8> b; + in >> b; + assert(b.to_ulong() == 0x5A); + } + { + // Make sure that input-streaming an empty bitset does not cause the + // failbit to be set (LWG 3199). 
+ cuda::std::istringstream in("01011010"); + cuda::std::bitset<0> b; + in >> b; + assert(b.to_string() == ""); + assert(!in.bad()); + assert(!in.fail()); + assert(!in.eof()); + assert(in.good()); + } +# ifndef TEST_HAS_NO_EXCEPTIONS + { + cuda::std::stringbuf sb; + cuda::std::istream is(&sb); + is.exceptions(cuda::std::ios::failbit); + + bool threw = false; + try + { + cuda::std::bitset<8> b; + is >> b; + } + catch (cuda::std::ios::failure const&) + { + threw = true; + } + + assert(!is.bad()); + assert(is.fail()); + assert(is.eof()); + assert(threw); + } + { + cuda::std::stringbuf sb; + cuda::std::istream is(&sb); + is.exceptions(cuda::std::ios::eofbit); + + bool threw = false; + try + { + cuda::std::bitset<8> b; + is >> b; + } + catch (cuda::std::ios::failure const&) + { + threw = true; + } + + assert(!is.bad()); + assert(is.fail()); + assert(is.eof()); + assert(threw); + } +# endif // TEST_HAS_NO_EXCEPTIONS + + return 0; +} + +#endif diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/stream_out.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/stream_out.pass.cpp new file mode 100644 index 0000000000..10c7392b95 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.operators/stream_out.pass.cpp @@ -0,0 +1,42 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-localization + +// test: + +// template +// basic_ostream& +// operator<<(basic_ostream& os, const bitset& x); + +#include + +#ifndef _LIBCUDACXX_HAS_SSTREAM +int main(int, char**) +{ + return 0; +} +#else + +# include +# include +# include + +# include "test_macros.h" + +int main(int, char**) +{ + cuda::std::ostringstream os; + cuda::std::bitset<8> b(0x5A); + os << b; + assert(os.str() == "01011010"); + + return 0; +} + +#endif diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset_test_cases.h b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset_test_cases.h new file mode 100644 index 0000000000..8351e87795 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset_test_cases.h @@ -0,0 +1,163 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef LIBCUDACXX_TEST_BITSET_TEST_CASES_H +#define LIBCUDACXX_TEST_BITSET_TEST_CASES_H + +#include +#include +#include + +#include "template_cost_testing.h" // for base cases of REPEAT_* +#include "test_macros.h" + +#if TEST_STD_VER == 2011 +# define BITSET_TEST_CONSTEXPR const +#else +# define BITSET_TEST_CONSTEXPR TEST_CONSTEXPR_GLOBAL +#endif + +#define NUMARGS(...) (::cuda::std::tuple_size::value) +#define DEFINE_CASES(N, ...) 
\ + __host__ __device__ BITSET_TEST_CONSTEXPR cuda::std::array get_test_cases( \ + cuda::std::integral_constant) \ + { \ + return {{__VA_ARGS__}}; \ + } + +DEFINE_CASES(0, "") + +DEFINE_CASES(1, "0", "1") + +DEFINE_CASES(2, "00", "01", "10", "11") + +DEFINE_CASES( + 31, + "0000000000000000000000000000000", + "0000000000000000000000000000001", + "1000000000000000000000000000000", + "1000000000000000000000000000001", + "1000000000000000000001000000001", + "0000000000000000111111111111111", + "1000000000000000111111111111111", + "1111111111111111000000000000000", + "1111111111111111000000000000001", + "1010101010101010101010101010101", + "0101010101010101010101010101010", + "1111111111111111111111111111111") + +DEFINE_CASES( + 32, + "00000000000000000000000000000000", + "00000000000000000000000000000001", + "10000000000000000000000000000000", + "10000000000000000000000000000001", + "10000000000000000000111000000001", + "00000000000000001111111111111111", + "10000000000000001111111111111111", + "11111111111111110000000000000000", + "11111111111111110000000000000001", + "10101010101010101010101010101010", + "01010101010101010101010101010101", + "11111111111111111111111111111111") + +DEFINE_CASES( + 33, + "000000000000000000000000000000000", + "000000000000000000000000000000001", + "100000000000000000000000000000000", + "100000000000000000000000000000001", + "100000000000000000001110000000001", + "000000000000000011111111111111111", + "100000000000000011111111111111111", + "111111111111111100000000000000000", + "111111111111111100000000000000001", + "101010101010101010101010101010101", + "010101010101010101010101010101010", + "111111111111111111111111111111111") + +DEFINE_CASES( + 63, + "000000000000000000000000000000000000000000000000000000000000000", + "000000000000000000000000000000000000000000000000000000000000001", + "100000000000000000000000000000000000000000000000000000000000000", + "100000000000000000000000000000000000000000000000000000000000001", + "100000000000000000000000001111100000000000000000000000000000001", + "000000000000000000000000000000001111111111111111111111111111111", + "100000000000000000000000000000001111111111111111111111111111111", + "111111111111111111111111111111110000000000000000000000000000000", + "111111111111111111111111111111110000000000000000000000000000001", + "101010101010101010101010101010101010101010101010101010101010101", + "010101010101010101010101010101010101010101010101010101010101010", + "111111111111111111111111111111111111111111111111111111111111111") + +DEFINE_CASES( + 64, + "0000000000000000000000000000000000000000000000000000000000000000", + "0000000000000000000000000000000000000000000000000000000000000001", + "1000000000000000000000000000000000000000000000000000000000000000", + "1000000000000000000000000000000000000000000000000000000000000001", + "1000000000000000000000000011111000000000000000000000000000000001", + "0000000000000000000000000000000011111111111111111111111111111111", + "1000000000000000000000000000000011111111111111111111111111111111", + "1111111111111111111111111111111100000000000000000000000000000000", + "1111111111111111111111111111111100000000000000000000000000000001", + "1010101010101010101010101010101010101010101010101010101010101010", + "0101010101010101010101010101010101010101010101010101010101010101", + "1111111111111111111111111111111111111111111111111111111111111111") + +DEFINE_CASES( + 65, + "00000000000000000000000000000000000000000000000000000000000000000", + 
"00000000000000000000000000000000000000000000000000000000000000001", + "10000000000000000000000000000000000000000000000000000000000000000", + "10000000000000000000000000000000000000000000000000000000000000001", + "10000000000000000000000000011111000000000000000000000000000000001", + "00000000000000000000000000000000011111111111111111111111111111111", + "10000000000000000000000000000000011111111111111111111111111111111", + "11111111111111111111111111111111000000000000000000000000000000000", + "11111111111111111111111111111111000000000000000000000000000000001", + "10101010101010101010101010101010101010101010101010101010101010101", + "01010101010101010101010101010101010101010101010101010101010101010", + "11111111111111111111111111111111111111111111111111111111111111111") + +#define BITSET_ZERO() "0" +#define BITSET_ONE() "1" +#define BITSET_ONEZERO() "10" +#define BITSET_ZEROONE() "10" + +#define REPEAT_8(DO_IT) DO_IT() DO_IT() DO_IT() DO_IT() DO_IT() DO_IT() DO_IT() DO_IT() +#define REPEAT_9(DO_IT) REPEAT_8(DO_IT) DO_IT() +#define REPEAT_90(DO_IT) \ + REPEAT_10(DO_IT) \ + REPEAT_10(DO_IT) \ + REPEAT_10(DO_IT) REPEAT_10(DO_IT) REPEAT_10(DO_IT) REPEAT_10(DO_IT) REPEAT_10(DO_IT) REPEAT_10(DO_IT) REPEAT_10(DO_IT) +#define REPEAT_99(DO_IT) REPEAT_90(DO_IT) REPEAT_9(DO_IT) +#define REPEAT_400(DO_IT) REPEAT_100(DO_IT) REPEAT_100(DO_IT) REPEAT_100(DO_IT) REPEAT_100(DO_IT) +#define REPEAT_499(DO_IT) REPEAT_400(DO_IT) REPEAT_99(DO_IT) +#define REPEAT_900(DO_IT) REPEAT_500(DO_IT) REPEAT_400(DO_IT) +#define REPEAT_998(DO_IT) REPEAT_900(DO_IT) REPEAT_90(DO_IT) REPEAT_8(DO_IT) +#define REPEAT_999(DO_IT) REPEAT_900(DO_IT) REPEAT_99(DO_IT) + +DEFINE_CASES( + 1000, + REPEAT_1000(BITSET_ZERO), + REPEAT_999(BITSET_ZERO) BITSET_ONE(), + BITSET_ONE() REPEAT_999(BITSET_ZERO), + BITSET_ONE() REPEAT_998(BITSET_ZERO) BITSET_ONE(), + BITSET_ONE() REPEAT_400(BITSET_ZERO) REPEAT_99(BITSET_ONE) REPEAT_499(BITSET_ZERO) BITSET_ONE(), + REPEAT_500(BITSET_ZERO) REPEAT_500(BITSET_ONE), + BITSET_ONE() REPEAT_499(BITSET_ZERO) REPEAT_500(BITSET_ONE), + REPEAT_500(BITSET_ONE) REPEAT_500(BITSET_ZERO), + REPEAT_500(BITSET_ONE) REPEAT_499(BITSET_ZERO) BITSET_ONE(), + REPEAT_500(BITSET_ONEZERO), + REPEAT_500(BITSET_ZEROONE), + REPEAT_1000(BITSET_ONE)) + +#endif // !LIBCUDACXX_TEST_BITSET_TEST_CASES_H diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/includes.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/includes.pass.cpp new file mode 100644 index 0000000000..6841824725 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/includes.pass.cpp @@ -0,0 +1,35 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// test that includes and + +#include + +#include "test_macros.h" + +template +__host__ __device__ void test_typedef() +{} + +int main(int, char**) +{ +#ifdef _LIBCUDACXX_HAS_STRING + { // test for + cuda::std::string s; + ((void) s); + } +#endif + { // test for + test_typedef(); + test_typedef(); + test_typedef(); + test_typedef(); + } + + return 0; +} diff --git a/libcudacxx/test/support/test_macros.h b/libcudacxx/test/support/test_macros.h index a066348d05..449f5fbbc2 100644 --- a/libcudacxx/test/support/test_macros.h +++ b/libcudacxx/test/support/test_macros.h @@ -4,6 +4,7 @@ // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// From 2e44b2c394c55740086132d83d7b31f92e62dd95 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 14 Aug 2024 10:26:01 +0200 Subject: [PATCH 22/33] Refactor placeholder operators (#2233) --- .../functional_placeholders_miscellaneous.cu | 15 + thrust/thrust/detail/functional/actor.h | 198 +++++++--- thrust/thrust/detail/functional/actor.inl | 94 ----- thrust/thrust/detail/functional/argument.h | 76 ---- thrust/thrust/detail/functional/composite.h | 123 ------ thrust/thrust/detail/functional/operators.h | 367 +++++++++++++++++- .../operators/arithmetic_operators.h | 263 ------------- .../operators/assignment_operator.h | 76 ---- .../functional/operators/bitwise_operators.h | 195 ---------- .../operators/compound_assignment_operators.h | 319 --------------- .../functional/operators/logical_operators.h | 94 ----- .../functional/operators/operator_adaptors.h | 126 ------ .../operators/relational_operators.h | 177 --------- thrust/thrust/detail/functional/placeholder.h | 45 --- thrust/thrust/detail/functional/value.h | 80 ---- thrust/thrust/functional.h | 2 +- 16 files changed, 516 insertions(+), 1734 deletions(-) delete mode 100644 thrust/thrust/detail/functional/actor.inl delete mode 100644 thrust/thrust/detail/functional/argument.h delete mode 100644 thrust/thrust/detail/functional/composite.h delete mode 100644 thrust/thrust/detail/functional/operators/arithmetic_operators.h delete mode 100644 thrust/thrust/detail/functional/operators/assignment_operator.h delete mode 100644 thrust/thrust/detail/functional/operators/bitwise_operators.h delete mode 100644 thrust/thrust/detail/functional/operators/compound_assignment_operators.h delete mode 100644 thrust/thrust/detail/functional/operators/logical_operators.h delete mode 100644 thrust/thrust/detail/functional/operators/operator_adaptors.h delete mode 100644 thrust/thrust/detail/functional/operators/relational_operators.h delete mode 100644 thrust/thrust/detail/functional/placeholder.h delete mode 100644 thrust/thrust/detail/functional/value.h diff --git a/thrust/testing/functional_placeholders_miscellaneous.cu b/thrust/testing/functional_placeholders_miscellaneous.cu index fffc4f7e94..9362e81d72 100644 --- a/thrust/testing/functional_placeholders_miscellaneous.cu +++ b/thrust/testing/functional_placeholders_miscellaneous.cu @@ -79,3 +79,18 @@ VectorUnitTest TestFunctionalPlaceholdersTransformIteratorInstanceHost; + +template +struct 
TestFunctionalPlaceholdersArgumentValueCategories +{ + void operator()() const + { + using namespace thrust::placeholders; + auto expr = _1 * _1 + _2 * _2; + T a = 2; + T b = 3; + ASSERT_ALMOST_EQUAL(expr(2, 3), 13); // pass pr-value + ASSERT_ALMOST_EQUAL(expr(a, b), 13); // pass l-value + ASSERT_ALMOST_EQUAL(expr(::cuda::std::move(a), ::cuda::std::move(b)), 13); // pass x-value + } +}; diff --git a/thrust/thrust/detail/functional/actor.h b/thrust/thrust/detail/functional/actor.h index e76d67153a..79484aabbe 100644 --- a/thrust/thrust/detail/functional/actor.h +++ b/thrust/thrust/detail/functional/actor.h @@ -1,5 +1,5 @@ /* - * Copyright 2008-2013 NVIDIA Corporation + * Copyright 2024 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,103 +34,183 @@ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header -#include -#include -#include -#include +#include #include +#include #include +#include +#include + THRUST_NAMESPACE_BEGIN namespace detail { namespace functional { +// An actor is a node in an expression template +template +struct actor : Eval +{ + constexpr actor() = default; + + _CCCL_HOST_DEVICE actor(const Eval& base) + : Eval(base) + {} + + template + _CCCL_HOST_DEVICE auto operator()(Ts&&... ts) const -> decltype(Eval::eval(THRUST_FWD(ts)...)) + { + return Eval::eval(THRUST_FWD(ts)...); + } + + template + _CCCL_HOST_DEVICE auto operator=(const T& _1) const -> decltype(do_assign(*this, _1)) + { + return do_assign(*this, _1); + } +}; + +template +struct is_actor : ::cuda::std::false_type +{}; -// eval_ref is -// - T when T is a subclass of thrust::reference -// - T& otherwise -// This is used to let thrust::references pass through actor evaluations. template -using eval_ref = typename std::conditional::value, T, T&>::type; +struct is_actor> : ::cuda::std::true_type +{}; -template -struct apply_actor +// a node selecting and returning one of the arguments to the entire expression template +template +struct argument { - using type = typename Action::template result::type; + template + _CCCL_HOST_DEVICE auto + eval(Ts&&... args) const -> decltype(thrust::get(thrust::tuple{THRUST_FWD(args)...})) + { + return thrust::get(thrust::tuple{THRUST_FWD(args)...}); + } }; -template -struct actor : Eval +template +struct placeholder { - using eval_type = Eval; + using type = actor>; +}; - constexpr actor() = default; +// composition of actors/nodes +template +struct composite; - _CCCL_HOST_DEVICE actor(const Eval& base); +template +struct composite +{ + // TODO(bgruber): drop ctor and use aggregate initialization in C++17 + _CCCL_HOST_DEVICE composite(const Eval& eval, const SubExpr& subexpr) + : m_eval(eval) + , m_subexpr(subexpr) + {} template - _CCCL_HOST_DEVICE typename apply_actor...>>::type operator()(Ts&&... ts) const; + _CCCL_HOST_DEVICE auto eval(Ts&&... 
args) const + -> decltype(::cuda::std::declval().eval(::cuda::std::declval().eval(THRUST_FWD(args)...))) + { + return m_eval.eval(m_subexpr.eval(THRUST_FWD(args)...)); + } - template - _CCCL_HOST_DEVICE typename assign_result::type operator=(const T& _1) const; -}; // end actor +private: + Eval m_eval; + SubExpr m_subexpr; +}; -// in general, as_actor should turn things into values -template -struct as_actor +template +struct composite { - using type = value; + // TODO(bgruber): drop ctor and use aggregate initialization in C++17 + _CCCL_HOST_DEVICE composite(const Eval& eval, const SubExpr1& subexpr1, const SubExpr2& subexpr2) + : m_eval(eval) + , m_subexpr1(subexpr1) + , m_subexpr2(subexpr2) + {} - static inline _CCCL_HOST_DEVICE type convert(const T& x) + template + _CCCL_HOST_DEVICE auto eval(Ts&&... args) const + -> decltype(::cuda::std::declval().eval(::cuda::std::declval().eval(THRUST_FWD(args)...), + ::cuda::std::declval().eval(THRUST_FWD(args)...))) { - return val(x); - } // end convert() -}; // end as_actor + return m_eval.eval(m_subexpr1.eval(THRUST_FWD(args)...), m_subexpr2.eval(THRUST_FWD(args)...)); + } + +private: + Eval m_eval; + SubExpr1 m_subexpr1; + SubExpr2 m_subexpr2; +}; -// specialization for things which are already actors template -struct as_actor> +struct actor; + +// Adapts a transparent unary functor from functional.h (e.g. thrust::negate<>) into the Eval interface. +template +struct operator_adaptor : F { - using type = actor; + _CCCL_HOST_DEVICE operator_adaptor(F f) + : F(::cuda::std::move(f)) + {} - static inline _CCCL_HOST_DEVICE const type& convert(const actor& x) + template + _CCCL_HOST_DEVICE auto eval(Ts&&... args) const -> decltype(F{}(THRUST_FWD(args)...)) { - return x; - } // end convert() -}; // end as_actor + return static_cast(*this)(THRUST_FWD(args)...); + } +}; +// a node returning a fixed value template -typename as_actor::type _CCCL_HOST_DEVICE make_actor(const T& x) +struct value { - return as_actor::convert(x); -} // end make_actor() + T m_val; -} // namespace functional + template + _CCCL_HOST_DEVICE T eval(Ts&&...) 
const + { + return m_val; + } +}; -// provide specializations for result_of for nullary, unary, and binary invocations of actor -template -struct result_of_adaptable_function()> +template +_CCCL_HOST_DEVICE auto make_actor(T&& x) -> actor>> { - using type = - typename thrust::detail::functional::apply_actor, thrust::tuple<>>::type; -}; // end result_of + return {{THRUST_FWD(x)}}; +} -template -struct result_of_adaptable_function(Arg1)> +template +_CCCL_HOST_DEVICE auto make_actor(actor x) -> actor { - using type = - typename thrust::detail::functional::apply_actor, thrust::tuple>::type; -}; // end result_of + return x; +} -template -struct result_of_adaptable_function(Arg1, Arg2)> +template +_CCCL_HOST_DEVICE auto compose(Eval e, const SubExpr& subexpr) + -> decltype(actor, decltype(make_actor(subexpr))>>{ + {{::cuda::std::move(e)}, make_actor(subexpr)}}) { - using type = typename thrust::detail::functional::apply_actor, - thrust::tuple>::type; -}; // end result_of + return actor, decltype(make_actor(subexpr))>>{ + {{::cuda::std::move(e)}, make_actor(subexpr)}}; +} + +template +_CCCL_HOST_DEVICE auto compose(Eval e, const SubExpr1& subexpr1, const SubExpr2& subexpr2) + -> decltype(actor, decltype(make_actor(subexpr1)), decltype(make_actor(subexpr2))>>{ + {{::cuda::std::move(e)}, make_actor(subexpr1), make_actor(subexpr2)}}) +{ + return actor, decltype(make_actor(subexpr1)), decltype(make_actor(subexpr2))>>{ + {{::cuda::std::move(e)}, make_actor(subexpr1), make_actor(subexpr2)}}; +} +} // namespace functional +template +struct result_of_adaptable_function(Args...)> +{ + using type = decltype(::cuda::std::declval>()(::cuda::std::declval()...)); +}; } // namespace detail THRUST_NAMESPACE_END - -#include diff --git a/thrust/thrust/detail/functional/actor.inl b/thrust/thrust/detail/functional/actor.inl deleted file mode 100644 index 64d367ed15..0000000000 --- a/thrust/thrust/detail/functional/actor.inl +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright 2008-2013 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Portions of this code are derived from -// -// Manjunath Kudlur's Carbon library -// -// and -// -// Based on Boost.Phoenix v1.2 -// Copyright (c) 2001-2002 Joel de Guzman - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header -#include -#include -#include -#include - -#include - -THRUST_NAMESPACE_BEGIN - -namespace detail -{ -namespace functional -{ - -template -_CCCL_HOST_DEVICE actor::actor(const Eval& base) - : eval_type(base) -{} - -// actor::operator() needs to construct a tuple of references to its -// arguments. To make this work with thrust::reference, we need to -// detect thrust proxy references and store them as T rather than T&. 
-// This check ensures that the forwarding references passed into -// actor::operator() are either: -// - T&& if and only if T is a thrust::reference, or -// - T& for any other types. -// This struct provides a nicer diagnostic for when these conditions aren't -// met. -template -using actor_check_ref_type = - ::cuda::std::integral_constant::value || thrust::detail::is_wrapped_reference::value)>; - -template -using actor_check_ref_types = thrust::conjunction...>; - -template -template -_CCCL_HOST_DEVICE typename apply_actor::eval_type, thrust::tuple...>>::type -actor::operator()(Ts&&... ts) const -{ - static_assert(actor_check_ref_types::value, - "Actor evaluations only support rvalue references to " - "thrust::reference subclasses."); - using tuple_type = thrust::tuple...>; - return eval_type::eval(tuple_type(THRUST_FWD(ts)...)); -} // end actor::operator() - -template -template -_CCCL_HOST_DEVICE typename assign_result::type actor::operator=(const T& _1) const -{ - return do_assign(*this, _1); -} // end actor::operator=() - -} // namespace functional -} // namespace detail -THRUST_NAMESPACE_END diff --git a/thrust/thrust/detail/functional/argument.h b/thrust/thrust/detail/functional/argument.h deleted file mode 100644 index b4fb100e80..0000000000 --- a/thrust/thrust/detail/functional/argument.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright 2008-2013 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Portions of this code are derived from -// -// Manjunath Kudlur's Carbon library -// -// and -// -// Based on Boost.Phoenix v1.2 -// Copyright (c) 2001-2002 Joel de Guzman - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header -#include - -THRUST_NAMESPACE_BEGIN -namespace detail -{ -namespace functional -{ - -template -struct argument_helper -{ - using type = typename thrust::tuple_element::type; -}; - -template -struct argument_helper> -{ - using type = thrust::tuple<>; -}; - -template -class argument -{ -public: - template - struct result : argument_helper - {}; - - _CCCL_HOST_DEVICE constexpr argument() {} - - template - _CCCL_HOST_DEVICE typename result::type eval(const Env& e) const - { - return thrust::get(e); - } // end eval() -}; // end argument - -} // namespace functional -} // namespace detail -THRUST_NAMESPACE_END diff --git a/thrust/thrust/detail/functional/composite.h b/thrust/thrust/detail/functional/composite.h deleted file mode 100644 index ad4c1c67af..0000000000 --- a/thrust/thrust/detail/functional/composite.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright 2008-2013 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Portions of this code are derived from -// -// Manjunath Kudlur's Carbon library -// -// and -// -// Based on Boost.Phoenix v1.2 -// Copyright (c) 2001-2002 Joel de Guzman - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include -#include -#include - -THRUST_NAMESPACE_BEGIN -namespace detail -{ -namespace functional -{ - -template -class composite; - -template -class composite -{ -public: - template - struct result - { - using type = typename Eval0::template result::type>>::type; - }; - - _CCCL_HOST_DEVICE composite(const Eval0& e0, const Eval1& e1) - : m_eval0(e0) - , m_eval1(e1) - {} - - template - _CCCL_HOST_DEVICE typename result::type eval(const Env& x) const - { - typename Eval1::template result::type result1 = m_eval1.eval(x); - return m_eval0.eval(thrust::tie(result1)); - } - -private: - Eval0 m_eval0; - Eval1 m_eval1; -}; // end composite - -template -class composite -{ -public: - template - struct result - { - using type = typename Eval0::template result< - thrust::tuple::type, typename Eval2::template result::type>>::type; - }; - - _CCCL_HOST_DEVICE composite(const Eval0& e0, const Eval1& e1, const Eval2& e2) - : m_eval0(e0) - , m_eval1(e1) - , m_eval2(e2) - {} - - template - _CCCL_HOST_DEVICE typename result::type eval(const Env& x) const - { - typename Eval1::template result::type result1 = m_eval1.eval(x); - typename Eval2::template result::type result2 = m_eval2.eval(x); - return m_eval0.eval(thrust::tie(result1, result2)); - } - -private: - Eval0 m_eval0; - Eval1 m_eval1; - Eval2 m_eval2; -}; // end composite - -template -_CCCL_HOST_DEVICE actor> compose(const Eval0& e0, const Eval1& e1) -{ - return actor>(composite(e0, e1)); -} - -template -_CCCL_HOST_DEVICE actor> compose(const Eval0& e0, const Eval1& e1, const Eval2& e2) -{ - return actor>(composite(e0, e1, e2)); -} - -} // namespace functional -} // namespace detail -THRUST_NAMESPACE_END diff --git a/thrust/thrust/detail/functional/operators.h b/thrust/thrust/detail/functional/operators.h index fe67ab7dd3..94347a82bc 100644 --- a/thrust/thrust/detail/functional/operators.h +++ b/thrust/thrust/detail/functional/operators.h @@ -1,5 +1,5 @@ /* - * Copyright 2008-2013 NVIDIA Corporation + * Copyright 2024 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,15 @@ * limitations under the License. 
*/ +// Portions of this code are derived from +// +// Manjunath Kudlur's Carbon library +// +// and +// +// Based on Boost.Phoenix v1.2 +// Copyright (c) 2001-2002 Joel de Guzman + #pragma once #include @@ -25,8 +34,354 @@ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header -#include -#include -#include -#include -#include +#include +#include +#include + +#include + +THRUST_NAMESPACE_BEGIN +namespace detail +{ +namespace functional +{ +// there's no standard plus_equal functional, so roll an ad hoc one here +struct plus_equal +{ + using is_transparent = void; + + _CCCL_EXEC_CHECK_DISABLE + template + _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const + noexcept(noexcept(THRUST_FWD(t1) += THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) += THRUST_FWD(t2)) + { + return THRUST_FWD(t1) += THRUST_FWD(t2); + } +}; + +// there's no standard minus_equal functional, so roll an ad hoc one here +struct minus_equal +{ + using is_transparent = void; + + _CCCL_EXEC_CHECK_DISABLE + template + _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const + noexcept(noexcept(THRUST_FWD(t1) -= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) -= THRUST_FWD(t2)) + { + return THRUST_FWD(t1) -= THRUST_FWD(t2); + } +}; + +// there's no standard multiplies_equal functional, so roll an ad hoc one here +struct multiplies_equal +{ + using is_transparent = void; + + _CCCL_EXEC_CHECK_DISABLE + template + _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const + noexcept(noexcept(THRUST_FWD(t1) *= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) *= THRUST_FWD(t2)) + { + return THRUST_FWD(t1) *= THRUST_FWD(t2); + } +}; + +// there's no standard divides_equal functional, so roll an ad hoc one here +struct divides_equal +{ + using is_transparent = void; + + _CCCL_EXEC_CHECK_DISABLE + template + _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const + noexcept(noexcept(THRUST_FWD(t1) /= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) /= THRUST_FWD(t2)) + { + return THRUST_FWD(t1) /= THRUST_FWD(t2); + } +}; + +// there's no standard modulus_equal functional, so roll an ad hoc one here +struct modulus_equal +{ + using is_transparent = void; + + _CCCL_EXEC_CHECK_DISABLE + template + _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const + noexcept(noexcept(THRUST_FWD(t1) %= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) %= THRUST_FWD(t2)) + { + return THRUST_FWD(t1) %= THRUST_FWD(t2); + } +}; + +// there's no standard bit_and_equal functional, so roll an ad hoc one here +struct bit_and_equal +{ + using is_transparent = void; + + _CCCL_EXEC_CHECK_DISABLE + template + _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const + noexcept(noexcept(THRUST_FWD(t1) &= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) &= THRUST_FWD(t2)) + { + return THRUST_FWD(t1) &= THRUST_FWD(t2); + } +}; + +// there's no standard bit_or_equal functional, so roll an ad hoc one here +struct bit_or_equal +{ + using is_transparent = void; + + _CCCL_EXEC_CHECK_DISABLE + template + _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const + noexcept(noexcept(THRUST_FWD(t1) |= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) |= THRUST_FWD(t2)) + { + return THRUST_FWD(t1) |= THRUST_FWD(t2); + } +}; + +// there's no standard bit_xor_equal functional, so roll an ad hoc one here +struct bit_xor_equal +{ + using is_transparent = void; + + _CCCL_EXEC_CHECK_DISABLE + template + _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const + 
noexcept(noexcept(THRUST_FWD(t1) ^= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) ^= THRUST_FWD(t2))
+  {
+    return THRUST_FWD(t1) ^= THRUST_FWD(t2);
+  }
+};
+
+// there's no standard bit_lshift_equal functional, so roll an ad hoc one here
+struct bit_lshift_equal
+{
+  using is_transparent = void;
+
+  _CCCL_EXEC_CHECK_DISABLE
+  template <typename T1, typename T2>
+  _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const
+    noexcept(noexcept(THRUST_FWD(t1) <<= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) <<= THRUST_FWD(t2))
+  {
+    return THRUST_FWD(t1) <<= THRUST_FWD(t2);
+  }
+};
+
+// there's no standard bit_rshift_equal functional, so roll an ad hoc one here
+struct bit_rshift_equal
+{
+  using is_transparent = void;
+
+  _CCCL_EXEC_CHECK_DISABLE
+  template <typename T1, typename T2>
+  _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const
+    noexcept(noexcept(THRUST_FWD(t1) >>= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) >>= THRUST_FWD(t2))
+  {
+    return THRUST_FWD(t1) >>= THRUST_FWD(t2);
+  }
+};
+
+// there's no standard bit_lshift functional, so roll an ad hoc one here
+struct bit_lshift
+{
+  using is_transparent = void;
+
+  _CCCL_EXEC_CHECK_DISABLE
+  template <typename T1, typename T2>
+  _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const
+    noexcept(noexcept(THRUST_FWD(t1) << THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) << THRUST_FWD(t2))
+  {
+    return THRUST_FWD(t1) << THRUST_FWD(t2);
+  }
+};
+
+// there's no standard bit_rshift functional, so roll an ad hoc one here
+struct bit_rshift
+{
+  using is_transparent = void;
+
+  _CCCL_EXEC_CHECK_DISABLE
+  template <typename T1, typename T2>
+  _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const
+    noexcept(noexcept(THRUST_FWD(t1) >> THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) >> THRUST_FWD(t2))
+  {
+    return THRUST_FWD(t1) >> THRUST_FWD(t2);
+  }
+};
+
+#define MAKE_BINARY_COMPOSITE(op, functor)                                                                         \
+  template <typename A, typename B, ::cuda::std::enable_if_t<is_actor<A>::value || is_actor<B>::value, int> = 0>  \
+  _CCCL_HOST_DEVICE auto operator op(const A& a, const B& b)->decltype(compose(functor{}, a, b))                  \
+  {                                                                                                                \
+    return compose(functor{}, a, b);                                                                               \
+  }
+
+MAKE_BINARY_COMPOSITE(==, thrust::equal_to<>)
+MAKE_BINARY_COMPOSITE(!=, thrust::not_equal_to<>)
+MAKE_BINARY_COMPOSITE(<, thrust::less<>)
+MAKE_BINARY_COMPOSITE(<=, thrust::less_equal<>)
+MAKE_BINARY_COMPOSITE(>, thrust::greater<>)
+MAKE_BINARY_COMPOSITE(>=, thrust::greater_equal<>)
+
+MAKE_BINARY_COMPOSITE(+, thrust::plus<>)
+MAKE_BINARY_COMPOSITE(-, thrust::minus<>)
+MAKE_BINARY_COMPOSITE(*, thrust::multiplies<>)
+MAKE_BINARY_COMPOSITE(/, thrust::divides<>)
+MAKE_BINARY_COMPOSITE(%, thrust::modulus<>)
+
+MAKE_BINARY_COMPOSITE(+=, plus_equal)
+MAKE_BINARY_COMPOSITE(-=, minus_equal)
+MAKE_BINARY_COMPOSITE(*=, multiplies_equal)
+MAKE_BINARY_COMPOSITE(/=, divides_equal)
+MAKE_BINARY_COMPOSITE(%=, modulus_equal)
+
+MAKE_BINARY_COMPOSITE(&&, thrust::logical_and<>)
+MAKE_BINARY_COMPOSITE(||, thrust::logical_or<>)
+
+MAKE_BINARY_COMPOSITE(&, thrust::bit_and<>)
+MAKE_BINARY_COMPOSITE(|, thrust::bit_or<>)
+MAKE_BINARY_COMPOSITE(^, thrust::bit_xor<>)
+MAKE_BINARY_COMPOSITE(<<, bit_lshift)
+MAKE_BINARY_COMPOSITE(>>, bit_rshift)
+
+MAKE_BINARY_COMPOSITE(&=, bit_and_equal)
+MAKE_BINARY_COMPOSITE(|=, bit_or_equal)
+MAKE_BINARY_COMPOSITE(^=, bit_xor_equal)
+MAKE_BINARY_COMPOSITE(<<=, bit_lshift_equal)
+MAKE_BINARY_COMPOSITE(>>=, bit_rshift_equal)
+
+#undef MAKE_BINARY_COMPOSITE
+
+// there's no standard unary_plus functional, so roll an ad hoc one here
+struct unary_plus
+{
+  using is_transparent = void;
+
+  _CCCL_EXEC_CHECK_DISABLE
+  template <typename T1>
+  _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1) const
+    noexcept(noexcept(+THRUST_FWD(t1))) -> decltype(+THRUST_FWD(t1))
+  {
+    return +THRUST_FWD(t1);
+  }
+};
+
+// there's no standard prefix_increment functional, so roll an ad hoc one here
+struct prefix_increment
+{
+  using is_transparent = void;
+
+  _CCCL_EXEC_CHECK_DISABLE
+  template <typename T1>
+  _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1) const
+    noexcept(noexcept(++THRUST_FWD(t1))) -> decltype(++THRUST_FWD(t1))
+  {
+    return ++THRUST_FWD(t1);
+  }
+}; // end prefix_increment
+
+// there's no standard postfix_increment functional, so roll an ad hoc one here
+struct postfix_increment
+{
+  using is_transparent = void;
+
+  _CCCL_EXEC_CHECK_DISABLE
+  template <typename T1>
+  _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1) const
+    noexcept(noexcept(THRUST_FWD(t1)++)) -> decltype(THRUST_FWD(t1)++)
+  {
+    return THRUST_FWD(t1)++;
+  }
+}; // end postfix_increment
+
+// there's no standard prefix_decrement functional, so roll an ad hoc one here
+struct prefix_decrement
+{
+  using is_transparent = void;
+
+  _CCCL_EXEC_CHECK_DISABLE
+  template <typename T1>
+  _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1) const
+    noexcept(noexcept(--THRUST_FWD(t1))) -> decltype(--THRUST_FWD(t1))
+  {
+    return --THRUST_FWD(t1);
+  }
+}; // end prefix_decrement
+
+// there's no standard postfix_decrement functional, so roll an ad hoc one here
+struct postfix_decrement
+{
+  using is_transparent = void;
+
+  _CCCL_EXEC_CHECK_DISABLE
+  template <typename T1>
+  _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1) const
+    noexcept(noexcept(THRUST_FWD(t1)--)) -> decltype(THRUST_FWD(t1)--)
+  {
+    return THRUST_FWD(t1)--;
+  }
+}; // end postfix_decrement
+
+// there's no standard bit_not functional, so roll an ad hoc one here
+struct bit_not
+{
+  using is_transparent = void;
+
+  _CCCL_EXEC_CHECK_DISABLE
+  template <typename T1>
+  _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1) const
+    noexcept(noexcept(~THRUST_FWD(t1))) -> decltype(~THRUST_FWD(t1))
+  {
+    return ~THRUST_FWD(t1);
+  }
+}; // end bit_not
+
+#define MAKE_UNARY_COMPOSITE(op, functor)                                        \
+  template <typename A, ::cuda::std::enable_if_t<is_actor<A>::value, int> = 0>  \
+  _CCCL_HOST_DEVICE auto operator op(const A& a)->decltype(compose(functor{}, a)) \
+  {                                                                              \
+    return compose(functor{}, a);                                                \
+  }
+
+MAKE_UNARY_COMPOSITE(+, unary_plus)
+MAKE_UNARY_COMPOSITE(-, thrust::negate<>)
+MAKE_UNARY_COMPOSITE(++, prefix_increment)
+MAKE_UNARY_COMPOSITE(--, prefix_decrement)
+MAKE_UNARY_COMPOSITE(!, thrust::logical_not<>)
+MAKE_UNARY_COMPOSITE(~, bit_not)
+
+#undef MAKE_UNARY_COMPOSITE
+
+#define MAKE_UNARY_COMPOSITE_POSTFIX(op, functor)                                \
+  template <typename A, ::cuda::std::enable_if_t<is_actor<A>::value, int> = 0>  \
+  _CCCL_HOST_DEVICE auto operator op(const A& a, int)->decltype(compose(functor{}, a)) \
+  {                                                                              \
+    return compose(functor{}, a);                                                \
+  }
+
+MAKE_UNARY_COMPOSITE_POSTFIX(++, postfix_increment)
+MAKE_UNARY_COMPOSITE_POSTFIX(--, postfix_decrement)
+
+#undef MAKE_UNARY_COMPOSITE_POSTFIX
+
+// there's no standard assign functional, so roll an ad hoc one here
+struct assign
+{
+  _CCCL_EXEC_CHECK_DISABLE
+  template <typename T1, typename T2>
+  _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const
+    THRUST_DECLTYPE_RETURNS(THRUST_FWD(t1) = THRUST_FWD(t2))
+};
+
+template <typename Eval, typename T>
+_CCCL_HOST_DEVICE auto do_assign(const actor<Eval>& _1, const T& _2) -> decltype(compose(assign{}, _1, _2))
+{
+  return compose(assign{}, _1, _2);
+}
+} // namespace functional
+} // namespace detail
+THRUST_NAMESPACE_END
diff --git a/thrust/thrust/detail/functional/operators/arithmetic_operators.h b/thrust/thrust/detail/functional/operators/arithmetic_operators.h
deleted file mode 100644
index 024b0e0d95..0000000000
---
a/thrust/thrust/detail/functional/operators/arithmetic_operators.h +++ /dev/null @@ -1,263 +0,0 @@ -/* - * Copyright 2008-2013 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header -#include -#include -#include -#include - -THRUST_NAMESPACE_BEGIN -namespace detail -{ -namespace functional -{ - -template -_CCCL_HOST_DEVICE actor>, actor>> _CCCL_HOST_DEVICE -operator-(const actor& _1) -{ - return compose(transparent_unary_operator>(), _1); -} // end operator-() - -// there's no standard unary_plus functional, so roll an ad hoc one here -struct unary_plus -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1) const - noexcept(noexcept(+THRUST_FWD(t1))) -> decltype(+THRUST_FWD(t1)) - { - return +THRUST_FWD(t1); - } -}; - -template -_CCCL_HOST_DEVICE actor, actor>> operator+(const actor& _1) -{ - return compose(transparent_unary_operator(), _1); -} // end operator+() - -template -_CCCL_HOST_DEVICE actor>, actor, typename as_actor::type>> -operator+(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator+() - -template -_CCCL_HOST_DEVICE actor>, typename as_actor::type, actor>> -operator+(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator+() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator+(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator+() - -template -_CCCL_HOST_DEVICE actor>, typename as_actor::type, actor>> -operator-(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator-() - -template -_CCCL_HOST_DEVICE actor>, actor, typename as_actor::type>> -operator-(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator-() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator-(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator-() - -template -_CCCL_HOST_DEVICE -actor>, typename as_actor::type, actor>> -operator*(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator*() - -template -_CCCL_HOST_DEVICE -actor>, actor, typename as_actor::type>> -operator*(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator*() - -template -_CCCL_HOST_DEVICE actor>, 
actor, actor>> -operator*(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator*() - -template -_CCCL_HOST_DEVICE -actor>, actor, typename as_actor::type>> -operator/(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator/() - -template -_CCCL_HOST_DEVICE -actor>, typename as_actor::type, actor>> -operator/(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator/() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator/(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator/() - -template -_CCCL_HOST_DEVICE -actor>, actor, typename as_actor::type>> -operator%(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator%() - -template -_CCCL_HOST_DEVICE -actor>, typename as_actor::type, actor>> -operator%(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator%() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator%(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator%() - -// there's no standard prefix_increment functional, so roll an ad hoc one here -struct prefix_increment -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1) const - noexcept(noexcept(++THRUST_FWD(t1))) -> decltype(++THRUST_FWD(t1)) - { - return ++THRUST_FWD(t1); - } -}; // end prefix_increment - -template -_CCCL_HOST_DEVICE actor, actor>> -operator++(const actor& _1) -{ - return compose(transparent_unary_operator(), _1); -} // end operator++() - -// there's no standard postfix_increment functional, so roll an ad hoc one here -struct postfix_increment -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1) const - noexcept(noexcept(THRUST_FWD(t1)++)) -> decltype(THRUST_FWD(t1)++) - { - return THRUST_FWD(t1)++; - } -}; // end postfix_increment - -template -_CCCL_HOST_DEVICE actor, actor>> -operator++(const actor& _1, int) -{ - return compose(transparent_unary_operator(), _1); -} // end operator++() - -// there's no standard prefix_decrement functional, so roll an ad hoc one here -struct prefix_decrement -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1) const - noexcept(noexcept(--THRUST_FWD(t1))) -> decltype(--THRUST_FWD(t1)) - { - return --THRUST_FWD(t1); - } -}; // end prefix_decrement - -template -_CCCL_HOST_DEVICE actor, actor>> -operator--(const actor& _1) -{ - return compose(transparent_unary_operator(), _1); -} // end operator--() - -// there's no standard postfix_decrement functional, so roll an ad hoc one here -struct postfix_decrement -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1) const - noexcept(noexcept(THRUST_FWD(t1)--)) -> decltype(THRUST_FWD(t1)--) - { - return THRUST_FWD(t1)--; - } -}; // end prefix_increment - -template -_CCCL_HOST_DEVICE actor, actor>> -operator--(const actor& _1, int) -{ - return 
compose(transparent_unary_operator(), _1); -} // end operator--() - -} // namespace functional -} // namespace detail -THRUST_NAMESPACE_END diff --git a/thrust/thrust/detail/functional/operators/assignment_operator.h b/thrust/thrust/detail/functional/operators/assignment_operator.h deleted file mode 100644 index 990bc601b7..0000000000 --- a/thrust/thrust/detail/functional/operators/assignment_operator.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright 2008-2013 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header -#include -#include -#include -#include - -THRUST_NAMESPACE_BEGIN - -// XXX WAR circular inclusion with this forward declaration -template -struct binary_function; - -namespace detail -{ -namespace functional -{ - -// XXX WAR circular inclusion with this forward declaration -template -struct as_actor; - -// there's no standard assign functional, so roll an ad hoc one here -struct assign -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const - noexcept(noexcept(THRUST_FWD(t1) = THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) = THRUST_FWD(t2)) - { - return THRUST_FWD(t1) = THRUST_FWD(t2); - } -}; - -template -struct assign_result -{ - using type = actor, actor, typename as_actor::type>>; -}; // end assign_result - -template -_CCCL_HOST_DEVICE typename assign_result::type do_assign(const actor& _1, const T& _2) -{ - return compose(transparent_binary_operator(), _1, as_actor::convert(_2)); -} // end do_assign() - -} // namespace functional -} // namespace detail -THRUST_NAMESPACE_END diff --git a/thrust/thrust/detail/functional/operators/bitwise_operators.h b/thrust/thrust/detail/functional/operators/bitwise_operators.h deleted file mode 100644 index c41250a79e..0000000000 --- a/thrust/thrust/detail/functional/operators/bitwise_operators.h +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright 2008-2013 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header -#include -#include -#include -#include - -THRUST_NAMESPACE_BEGIN -namespace detail -{ -namespace functional -{ - -template -_CCCL_HOST_DEVICE actor>, actor, typename as_actor::type>> -operator&(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator&() - -template -_CCCL_HOST_DEVICE actor>, typename as_actor::type, actor>> -operator&(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator&() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator&(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator&() - -template -_CCCL_HOST_DEVICE actor>, actor, typename as_actor::type>> -operator|(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator|() - -template -_CCCL_HOST_DEVICE actor>, typename as_actor::type, actor>> -operator|(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator|() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator|(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator|() - -template -_CCCL_HOST_DEVICE actor>, actor, typename as_actor::type>> -operator^(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator^() - -template -_CCCL_HOST_DEVICE actor>, typename as_actor::type, actor>> -operator^(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator^() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator^(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator^() - -// there's no standard bit_not functional, so roll an ad hoc one here -struct bit_not -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1) const - noexcept(noexcept(~THRUST_FWD(t1))) -> decltype(~THRUST_FWD(t1)) - { - return ~THRUST_FWD(t1); - } -}; // end prefix_increment - -template -_CCCL_HOST_DEVICE actor, actor>> _CCCL_HOST_DEVICE -operator~(const actor& _1) -{ - return compose(transparent_unary_operator(), _1); -} // end operator~() - -// there's no standard bit_lshift functional, so roll an ad hoc one here -struct bit_lshift -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const - noexcept(noexcept(THRUST_FWD(t1) << THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) << THRUST_FWD(t2)) - { - return THRUST_FWD(t1) << THRUST_FWD(t2); - } -}; - -template -_CCCL_HOST_DEVICE actor, actor, typename as_actor::type>> -operator<<(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator<<() - -template -_CCCL_HOST_DEVICE actor, typename as_actor::type, 
actor>> -operator<<(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator<<() - -template -_CCCL_HOST_DEVICE actor, actor, actor>> -operator<<(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator<<() - -// there's no standard bit_rshift functional, so roll an ad hoc one here -struct bit_rshift -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1& t1, T2&& t2) const - noexcept(noexcept(THRUST_FWD(t1) >> THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) >> THRUST_FWD(t2)) - { - return THRUST_FWD(t1) >> THRUST_FWD(t2); - } -}; - -template -_CCCL_HOST_DEVICE actor, actor, typename as_actor::type>> -operator>>(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator>>() - -template -_CCCL_HOST_DEVICE actor, typename as_actor::type, actor>> -operator>>(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator>>() - -template -_CCCL_HOST_DEVICE actor, actor, actor>> -operator>>(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator>>() - -} // namespace functional -} // namespace detail -THRUST_NAMESPACE_END diff --git a/thrust/thrust/detail/functional/operators/compound_assignment_operators.h b/thrust/thrust/detail/functional/operators/compound_assignment_operators.h deleted file mode 100644 index 5163ba5cc9..0000000000 --- a/thrust/thrust/detail/functional/operators/compound_assignment_operators.h +++ /dev/null @@ -1,319 +0,0 @@ -/* - * Copyright 2008-2013 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header -#include -#include -#include - -THRUST_NAMESPACE_BEGIN -namespace detail -{ -namespace functional -{ - -// there's no standard plus_equal functional, so roll an ad hoc one here -struct plus_equal -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const - noexcept(noexcept(THRUST_FWD(t1) += THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) += THRUST_FWD(t2)) - { - return THRUST_FWD(t1) += THRUST_FWD(t2); - } -}; - -template -_CCCL_HOST_DEVICE actor, actor, typename as_actor::type>> -operator+=(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator+=() - -template -_CCCL_HOST_DEVICE actor, actor, actor>> -operator+=(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator+=() - -// there's no standard minus_equal functional, so roll an ad hoc one here -struct minus_equal -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const - noexcept(noexcept(THRUST_FWD(t1) -= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) -= THRUST_FWD(t2)) - { - return THRUST_FWD(t1) -= THRUST_FWD(t2); - } -}; - -template -_CCCL_HOST_DEVICE actor, actor, typename as_actor::type>> -operator-=(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator-=() - -template -_CCCL_HOST_DEVICE actor, actor, actor>> -operator-=(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator-=() - -// there's no standard multiplies_equal functional, so roll an ad hoc one here -struct multiplies_equal -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const - noexcept(noexcept(THRUST_FWD(t1) *= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) *= THRUST_FWD(t2)) - { - return THRUST_FWD(t1) *= THRUST_FWD(t2); - } -}; - -template -_CCCL_HOST_DEVICE actor, actor, typename as_actor::type>> -operator*=(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator*=() - -template -_CCCL_HOST_DEVICE actor, actor, actor>> -operator*=(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator*=() - -// there's no standard divides_equal functional, so roll an ad hoc one here -struct divides_equal -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const - noexcept(noexcept(THRUST_FWD(t1) /= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) /= THRUST_FWD(t2)) - { - return THRUST_FWD(t1) /= THRUST_FWD(t2); - } -}; - -template -_CCCL_HOST_DEVICE actor, actor, typename as_actor::type>> -operator/=(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator/=() - -template -_CCCL_HOST_DEVICE actor, actor, actor>> 
-operator/=(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator/=() - -// there's no standard modulus_equal functional, so roll an ad hoc one here -struct modulus_equal -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const - noexcept(noexcept(THRUST_FWD(t1) %= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) %= THRUST_FWD(t2)) - { - return THRUST_FWD(t1) %= THRUST_FWD(t2); - } -}; - -template -_CCCL_HOST_DEVICE actor, actor, typename as_actor::type>> -operator%=(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator%=() - -template -_CCCL_HOST_DEVICE actor, actor, actor>> -operator%=(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator%=() - -// there's no standard bit_and_equal functional, so roll an ad hoc one here -struct bit_and_equal -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const - noexcept(noexcept(THRUST_FWD(t1) &= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) &= THRUST_FWD(t2)) - { - return THRUST_FWD(t1) &= THRUST_FWD(t2); - } -}; - -template -_CCCL_HOST_DEVICE actor, actor, typename as_actor::type>> -operator&=(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator&=() - -template -_CCCL_HOST_DEVICE actor, actor, actor>> -operator&=(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator&=() - -// there's no standard bit_or_equal functional, so roll an ad hoc one here -struct bit_or_equal -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const - noexcept(noexcept(THRUST_FWD(t1) |= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) |= THRUST_FWD(t2)) - { - return THRUST_FWD(t1) |= THRUST_FWD(t2); - } -}; - -template -_CCCL_HOST_DEVICE actor, actor, typename as_actor::type>> -operator|=(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator|=() - -template -_CCCL_HOST_DEVICE actor, actor, actor>> -operator|=(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator|=() - -// there's no standard bit_xor_equal functional, so roll an ad hoc one here -struct bit_xor_equal -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const - noexcept(noexcept(THRUST_FWD(t1) ^= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) ^= THRUST_FWD(t2)) - { - return THRUST_FWD(t1) ^= THRUST_FWD(t2); - } -}; - -template -_CCCL_HOST_DEVICE actor, actor, typename as_actor::type>> -operator^=(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator|=() - -template -_CCCL_HOST_DEVICE actor, actor, actor>> -operator^=(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator|=() - -// there's no standard bit_lshift_equal functional, so roll an ad hoc one here -struct 
bit_lshift_equal -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const - noexcept(noexcept(THRUST_FWD(t1) <<= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) <<= THRUST_FWD(t2)) - { - return THRUST_FWD(t1) <<= THRUST_FWD(t2); - } -}; -template -_CCCL_HOST_DEVICE actor, actor, typename as_actor::type>> -operator<<=(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator<<=() - -template -_CCCL_HOST_DEVICE actor, actor, actor>> -operator<<=(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator<<=() - -// there's no standard bit_rshift_equal functional, so roll an ad hoc one here -struct bit_rshift_equal -{ - using is_transparent = void; - - _CCCL_EXEC_CHECK_DISABLE - template - _CCCL_HOST_DEVICE constexpr auto operator()(T1&& t1, T2&& t2) const - noexcept(noexcept(THRUST_FWD(t1) >>= THRUST_FWD(t2))) -> decltype(THRUST_FWD(t1) >>= THRUST_FWD(t2)) - { - return THRUST_FWD(t1) >>= THRUST_FWD(t2); - } -}; - -template -_CCCL_HOST_DEVICE actor, actor, typename as_actor::type>> -operator>>=(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator>>=() - -template -_CCCL_HOST_DEVICE actor, actor, actor>> -operator>>=(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator(), make_actor(_1), make_actor(_2)); -} // end operator>>=() - -} // namespace functional -} // namespace detail -THRUST_NAMESPACE_END diff --git a/thrust/thrust/detail/functional/operators/logical_operators.h b/thrust/thrust/detail/functional/operators/logical_operators.h deleted file mode 100644 index 75ed46cc96..0000000000 --- a/thrust/thrust/detail/functional/operators/logical_operators.h +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright 2008-2013 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header -#include -#include -#include -#include - -THRUST_NAMESPACE_BEGIN -namespace detail -{ -namespace functional -{ - -template -_CCCL_HOST_DEVICE -actor>, actor, typename as_actor::type>> -operator&&(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator&&() - -template -_CCCL_HOST_DEVICE -actor>, typename as_actor::type, actor>> -operator&&(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator&&() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator&&(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator&&() - -template -_CCCL_HOST_DEVICE -actor>, actor, typename as_actor::type>> -operator||(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator&&() - -template -_CCCL_HOST_DEVICE -actor>, typename as_actor::type, actor>> -operator||(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator&&() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator||(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator&&() - -template -_CCCL_HOST_DEVICE actor>, actor>> -operator!(const actor& _1) -{ - return compose(transparent_unary_operator>(), _1); -} // end operator!() - -} // namespace functional -} // namespace detail -THRUST_NAMESPACE_END diff --git a/thrust/thrust/detail/functional/operators/operator_adaptors.h b/thrust/thrust/detail/functional/operators/operator_adaptors.h deleted file mode 100644 index 31587ac734..0000000000 --- a/thrust/thrust/detail/functional/operators/operator_adaptors.h +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright 2008-2013 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header -#include -#include -#include -#include - -#include - -THRUST_NAMESPACE_BEGIN -namespace detail -{ -namespace functional -{ - -// Adapts a transparent unary functor from functional.h (e.g. thrust::negate<>) -// into the Eval interface. 
-template -struct transparent_unary_operator -{ - template - using operator_type = UnaryFunctor; - - template - using argument = - typename thrust::detail::eval_if::value != 1, - thrust::detail::identity_>, - thrust::detail::functional::argument_helper<0, Env>>::type; - - template - struct result_type_impl - { - using type = decltype(std::declval()(std::declval>())); - }; - - template - using result_type = - typename thrust::detail::eval_if, argument>::value, - thrust::detail::identity_>, - result_type_impl>::type; - - template - struct result - { - using op_type = UnaryFunctor; - using type = result_type; - }; - - template - _CCCL_HOST_DEVICE result_type eval(Env&& e) const THRUST_RETURNS(UnaryFunctor{}(thrust::get<0>(THRUST_FWD(e)))) -}; - -// Adapts a transparent binary functor from functional.h (e.g. thrust::less<>) -// into the Eval interface. -template -struct transparent_binary_operator -{ - template - using operator_type = BinaryFunctor; - - template - using first_argument = - typename thrust::detail::eval_if::value != 2, - thrust::detail::identity_>, - thrust::detail::functional::argument_helper<0, Env>>::type; - - template - using second_argument = - typename thrust::detail::eval_if::value != 2, - thrust::detail::identity_>, - thrust::detail::functional::argument_helper<1, Env>>::type; - - template - struct result_type_impl - { - using type = decltype(std::declval()( - std::declval>(), std::declval>())); - }; - - template - using result_type = - typename thrust::detail::eval_if<(std::is_same, first_argument>::value - || std::is_same, second_argument>::value), - thrust::detail::identity_>, - result_type_impl>::type; - - template - struct result - { - using op_type = BinaryFunctor; - using type = result_type; - }; - - template - _CCCL_HOST_DEVICE result_type eval(Env&& e) const - THRUST_RETURNS(BinaryFunctor{}(thrust::get<0>(e), thrust::get<1>(e))) -}; - -} // namespace functional -} // namespace detail -THRUST_NAMESPACE_END diff --git a/thrust/thrust/detail/functional/operators/relational_operators.h b/thrust/thrust/detail/functional/operators/relational_operators.h deleted file mode 100644 index d58c2fb67f..0000000000 --- a/thrust/thrust/detail/functional/operators/relational_operators.h +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright 2008-2013 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header -#include -#include -#include -#include - -THRUST_NAMESPACE_BEGIN -namespace detail -{ -namespace functional -{ - -template -_CCCL_HOST_DEVICE -actor>, actor, typename as_actor::type>> -operator==(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator==() - -template -_CCCL_HOST_DEVICE -actor>, typename as_actor::type, actor>> -operator==(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator==() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator==(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator==() - -template -_CCCL_HOST_DEVICE -actor>, actor, typename as_actor::type>> -operator!=(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator!=() - -template -_CCCL_HOST_DEVICE -actor>, typename as_actor::type, actor>> -operator!=(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator!=() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator!=(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator!=() - -template -_CCCL_HOST_DEVICE -actor>, actor, typename as_actor::type>> -operator>(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator>() - -template -_CCCL_HOST_DEVICE -actor>, typename as_actor::type, actor>> -operator>(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator>() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator>(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator>() - -template -_CCCL_HOST_DEVICE actor>, actor, typename as_actor::type>> -operator<(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator<() - -template -_CCCL_HOST_DEVICE actor>, typename as_actor::type, actor>> -operator<(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator<() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator<(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator<() - -template -_CCCL_HOST_DEVICE -actor>, actor, typename as_actor::type>> -operator>=(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator>=() - -template -_CCCL_HOST_DEVICE -actor>, typename as_actor::type, actor>> -operator>=(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator>=() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator>=(const actor& _1, const 
actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator>=() - -template -_CCCL_HOST_DEVICE -actor>, actor, typename as_actor::type>> -operator<=(const actor& _1, const T2& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator<=() - -template -_CCCL_HOST_DEVICE -actor>, typename as_actor::type, actor>> -operator<=(const T1& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator<=() - -template -_CCCL_HOST_DEVICE actor>, actor, actor>> -operator<=(const actor& _1, const actor& _2) -{ - return compose(transparent_binary_operator>(), make_actor(_1), make_actor(_2)); -} // end operator<=() - -} // namespace functional -} // namespace detail -THRUST_NAMESPACE_END diff --git a/thrust/thrust/detail/functional/placeholder.h b/thrust/thrust/detail/functional/placeholder.h deleted file mode 100644 index a95d4d506a..0000000000 --- a/thrust/thrust/detail/functional/placeholder.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright 2008-2013 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header -#include -#include - -THRUST_NAMESPACE_BEGIN -namespace detail -{ -namespace functional -{ - -template -struct placeholder -{ - using type = actor>; -}; - -} // namespace functional -} // namespace detail -THRUST_NAMESPACE_END diff --git a/thrust/thrust/detail/functional/value.h b/thrust/thrust/detail/functional/value.h deleted file mode 100644 index e2ce136b04..0000000000 --- a/thrust/thrust/detail/functional/value.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright 2008-2013 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// Portions of this code are derived from -// -// Manjunath Kudlur's Carbon library -// -// and -// -// Based on Boost.Phoenix v1.2 -// Copyright (c) 2001-2002 Joel de Guzman - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header -#include - -THRUST_NAMESPACE_BEGIN -namespace detail -{ -namespace functional -{ - -template -struct actor; - -template -class value -{ -public: - template - struct result - { - using type = T; - }; - - _CCCL_HOST_DEVICE value(const T& arg) - : m_val(arg) - {} - - template - _CCCL_HOST_DEVICE T eval(const Env&) const - { - return m_val; - } - -private: - T m_val; -}; // end value - -template -_CCCL_HOST_DEVICE actor> val(const T& x) -{ - return value(x); -} // end val() - -} // namespace functional -} // namespace detail -THRUST_NAMESPACE_END diff --git a/thrust/thrust/functional.h b/thrust/thrust/functional.h index 4b88f46954..9c8d8d2938 100644 --- a/thrust/thrust/functional.h +++ b/thrust/thrust/functional.h @@ -29,7 +29,7 @@ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header -#include +#include #include From 352638b4125af488608ed032e8d9652b5080eb23 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 14 Aug 2024 13:45:19 +0200 Subject: [PATCH 23/33] Add missing annotations for deprecated debug_sync APIs (#2212) --- cub/cub/device/dispatch/dispatch_histogram.cuh | 1 + cub/cub/device/dispatch/dispatch_rle.cuh | 1 + 2 files changed, 2 insertions(+) diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index c623cda9a2..aa8cc2f5c0 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -1593,6 +1593,7 @@ public: } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t DispatchEven( void* d_temp_storage, size_t& temp_storage_bytes, diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index 917b5df37b..2ca3527b60 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -544,6 +544,7 @@ struct DeviceRleDispatch } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, From dded5f1ac6c48c71215c70835f9ed0babaad4a3a Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 14 Aug 2024 13:56:43 +0200 Subject: [PATCH 24/33] Test thrust headers for disabled half/bf16 support (#2219) --- cub/cmake/CubHeaderTesting.cmake | 2 ++ thrust/cmake/ThrustHeaderTesting.cmake | 37 ++++++++++++++++++++------ thrust/cmake/header_test.in | 15 +++++++++++ 3 files changed, 46 insertions(+), 8 deletions(-) diff --git a/cub/cmake/CubHeaderTesting.cmake b/cub/cmake/CubHeaderTesting.cmake index f0ca17186c..fdf9be3be4 100644 --- a/cub/cmake/CubHeaderTesting.cmake +++ b/cub/cmake/CubHeaderTesting.cmake @@ -42,12 +42,14 @@ set(header_definitions "CUB_WRAPPED_NAMESPACE=wrapped_cub") cub_add_header_test(base "${header_definitions}") +# Check that BF16 support can be disabled set(header_definitions 
"THRUST_WRAPPED_NAMESPACE=wrapped_thrust" "CUB_WRAPPED_NAMESPACE=wrapped_cub" "CCCL_DISABLE_BF16_SUPPORT") cub_add_header_test(bf16 "${header_definitions}") +# Check that half support can be disabled set(header_definitions "THRUST_WRAPPED_NAMESPACE=wrapped_thrust" "CUB_WRAPPED_NAMESPACE=wrapped_cub" diff --git a/thrust/cmake/ThrustHeaderTesting.cmake b/thrust/cmake/ThrustHeaderTesting.cmake index ad438b0f87..4c1d07f744 100644 --- a/thrust/cmake/ThrustHeaderTesting.cmake +++ b/thrust/cmake/ThrustHeaderTesting.cmake @@ -7,7 +7,7 @@ # Meta target for all configs' header builds: add_custom_target(thrust.all.headers) -foreach(thrust_target IN LISTS THRUST_TARGETS) +function(thrust_add_header_test thrust_target label definitions) thrust_get_target_property(config_host ${thrust_target} HOST) thrust_get_target_property(config_device ${thrust_target} DEVICE) thrust_get_target_property(config_prefix ${thrust_target} PREFIX) @@ -115,14 +115,10 @@ foreach(thrust_target IN LISTS THRUST_TARGETS) list(APPEND headertest_srcs "${headertest_src}") endforeach() - set(headertest_target ${config_prefix}.headers) + set(headertest_target ${config_prefix}.headers.${label}) add_library(${headertest_target} OBJECT ${headertest_srcs}) target_link_libraries(${headertest_target} PUBLIC ${thrust_target}) - # Wrap Thrust/CUB in a custom namespace to check proper use of ns macros: - target_compile_definitions(${headertest_target} PRIVATE - "THRUST_WRAPPED_NAMESPACE=wrapped_thrust" - "CUB_WRAPPED_NAMESPACE=wrapped_cub" - ) + target_compile_definitions(${headertest_target} PRIVATE ${header_definitions}) thrust_clone_target_properties(${headertest_target} ${thrust_target}) if ("CUDA" STREQUAL "${config_device}") @@ -141,4 +137,29 @@ foreach(thrust_target IN LISTS THRUST_TARGETS) add_dependencies(thrust.all.headers ${headertest_target}) add_dependencies(${config_prefix}.all ${headertest_target}) -endforeach() +endfunction() + +foreach(thrust_target IN LISTS THRUST_TARGETS) + # Wrap Thrust/CUB in a custom namespace to check proper use of ns macros: + set(header_definitions + "THRUST_WRAPPED_NAMESPACE=wrapped_thrust" + "CUB_WRAPPED_NAMESPACE=wrapped_cub") + thrust_add_header_test(${thrust_target} base "${header_definitions}") + + thrust_get_target_property(config_device ${thrust_target} DEVICE) + if ("CUDA" STREQUAL "${config_device}") + # Check that BF16 support can be disabled + set(header_definitions + "THRUST_WRAPPED_NAMESPACE=wrapped_thrust" + "CUB_WRAPPED_NAMESPACE=wrapped_cub" + "CCCL_DISABLE_BF16_SUPPORT") + thrust_add_header_test(${thrust_target} bf16 "${header_definitions}") + + # Check that half support can be disabled + set(header_definitions + "THRUST_WRAPPED_NAMESPACE=wrapped_thrust" + "CUB_WRAPPED_NAMESPACE=wrapped_cub" + "CCCL_DISABLE_FP16_SUPPORT") + thrust_add_header_test(${thrust_target} half "${header_definitions}") + endif() +endforeach () diff --git a/thrust/cmake/header_test.in b/thrust/cmake/header_test.in index 59e44e03c1..236cb9bde4 100644 --- a/thrust/cmake/header_test.in +++ b/thrust/cmake/header_test.in @@ -64,3 +64,18 @@ #endif // THRUST_IGNORE_MACRO_CHECKS #include + +#if defined(CCCL_DISABLE_BF16_SUPPORT) +#if defined(__CUDA_BF16_TYPES_EXIST__) +#error Thrust should not include cuda_bf16.h when BF16 support is disabled +#endif // __CUDA_BF16_TYPES_EXIST__ +#endif // CCCL_DISABLE_BF16_SUPPORT + +#if defined(CCCL_DISABLE_FP16_SUPPORT) +#if defined(__CUDA_FP16_TYPES_EXIST__) +#error Thrust should not include cuda_fp16.h when half support is disabled +#endif // __CUDA_FP16_TYPES_EXIST__ +#if 
defined(__CUDA_BF16_TYPES_EXIST__) +#error Thrust should not include cuda_bf16.h when half support is disabled +#endif // __CUDA_BF16_TYPES_EXIST__ +#endif // CCCL_DISABLE_FP16_SUPPORT From 1981c4972c0fc95b4180c16cf3b39f3fd87c1c25 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 14 Aug 2024 15:00:35 +0200 Subject: [PATCH 25/33] Make cuda::std::max constexpr in C++11 (#2107) --- libcudacxx/include/cuda/std/__algorithm/comp.h | 2 +- libcudacxx/include/cuda/std/__algorithm/max.h | 4 ++-- .../std/algorithms/alg.sorting/alg.min.max/max.pass.cpp | 4 ++++ .../std/algorithms/alg.sorting/alg.min.max/max_comp.pass.cpp | 4 ++++ 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/libcudacxx/include/cuda/std/__algorithm/comp.h b/libcudacxx/include/cuda/std/__algorithm/comp.h index 5427fc7e16..2e5c81ed45 100644 --- a/libcudacxx/include/cuda/std/__algorithm/comp.h +++ b/libcudacxx/include/cuda/std/__algorithm/comp.h @@ -46,7 +46,7 @@ struct __is_trivial_equality_predicate<__equal_to, _Lhs, _Rhs> : true_type struct __less { template - _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 bool + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr bool operator()(const _Tp& __lhs, const _Up& __rhs) const noexcept(noexcept(__lhs < __rhs)) { return __lhs < __rhs; diff --git a/libcudacxx/include/cuda/std/__algorithm/max.h b/libcudacxx/include/cuda/std/__algorithm/max.h index 4fec573393..28677d6b7a 100644 --- a/libcudacxx/include/cuda/std/__algorithm/max.h +++ b/libcudacxx/include/cuda/std/__algorithm/max.h @@ -30,14 +30,14 @@ _CCCL_PUSH_MACROS _LIBCUDACXX_BEGIN_NAMESPACE_STD template -_CCCL_NODISCARD inline _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 const _Tp& +_CCCL_NODISCARD inline _LIBCUDACXX_INLINE_VISIBILITY constexpr const _Tp& max(const _Tp& __a, const _Tp& __b, _Compare __comp) { return __comp(__a, __b) ? 
__b : __a; } template -_CCCL_NODISCARD inline _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 const _Tp& max(const _Tp& __a, const _Tp& __b) +_CCCL_NODISCARD inline _LIBCUDACXX_INLINE_VISIBILITY constexpr const _Tp& max(const _Tp& __a, const _Tp& __b) { return _CUDA_VSTD::max(__a, __b, __less{}); } diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/max.pass.cpp b/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/max.pass.cpp index 15631df845..c1b6ef02de 100644 --- a/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/max.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/max.pass.cpp @@ -53,6 +53,10 @@ int main(int, char**) test(); #if TEST_STD_VER >= 2014 static_assert(test(), ""); +#else // TEST_STD_VER >= 2014 + constexpr int x = 0; + constexpr int y = 1; + static_assert(&cuda::std::max(x, y) == &y, ""); #endif // TEST_STD_VER >= 2014 return 0; diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/max_comp.pass.cpp b/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/max_comp.pass.cpp index d24b1ffe30..526f692f08 100644 --- a/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/max_comp.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/max_comp.pass.cpp @@ -55,6 +55,10 @@ int main(int, char**) test(); #if TEST_STD_VER >= 2014 static_assert(test(), ""); +#else // TEST_STD_VER >= 2014 + constexpr int x = 0; + constexpr int y = 1; + static_assert(&cuda::std::max(x, y, cuda::std::greater()) == &x, ""); #endif // TEST_STD_VER >= 2014 return 0; From 73df2b0eecab0f3d7b2693aff590c7ef139e51bf Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 14 Aug 2024 15:20:11 +0200 Subject: [PATCH 26/33] Fix ForEachCopyN for non-contiguous iterators (#2220) By falling back to a non-load-vectorizing code path. 
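A minimal usage sketch (not taken from the patch) of what this enables, mirroring the counting-iterator tests added below: passing a non-contiguous iterator such as cub::CountingInputIterator to cub::DeviceFor::ForEachCopyN now compiles and dispatches to the non-vectorized path instead of hitting the previous contiguous-iterator static_assert. The names mark_op, visit_all, flags, and n are illustrative placeholders.

    #include <thrust/device_vector.h>

    #include <cub/device/device_for.cuh>
    #include <cub/iterator/counting_input_iterator.cuh>

    // Device-side functor: marks every index it is invoked with.
    struct mark_op
    {
      int* flags;
      __device__ void operator()(int i) const
      {
        flags[i] = 1;
      }
    };

    cudaError_t visit_all(int n)
    {
      thrust::device_vector<int> flags(n, 0);
      cub::CountingInputIterator<int> first(0); // non-contiguous input iterator
      cudaError_t error =
        cub::DeviceFor::ForEachCopyN(first, n, mark_op{thrust::raw_pointer_cast(flags.data())});
      if (error != cudaSuccess)
      {
        return error;
      }
      // Synchronize before flags goes out of scope, since ForEachCopyN runs asynchronously.
      return cudaDeviceSynchronize();
    }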
Fixes: #2207 --- cub/cub/device/device_for.cuh | 28 +++++++------------------ cub/test/catch2_test_device_for.cu | 21 +++++++++++++++++++ cub/test/catch2_test_device_for_copy.cu | 21 +++++++++++++++++++ 3 files changed, 50 insertions(+), 20 deletions(-) diff --git a/cub/cub/device/device_for.cuh b/cub/cub/device/device_for.cuh index 5384748942..0e0bcaa36c 100644 --- a/cub/cub/device/device_for.cuh +++ b/cub/cub/device/device_for.cuh @@ -145,13 +145,12 @@ private: return detail::for_each::dispatch_t::dispatch(num_items, wrapped_op_t{first, op}, stream); } - template + template CUB_RUNTIME_FUNCTION static cudaError_t for_each_n( - RandomAccessIteratorT first, OffsetT num_items, OpT op, cudaStream_t stream, ::cuda::std::true_type /* vectorize */) + ContiguousIteratorT first, OffsetT num_items, OpT op, cudaStream_t stream, ::cuda::std::true_type /* vectorize */) { - auto unwrapped_first = THRUST_NS_QUALIFIER::raw_pointer_cast(&*first); - using wrapped_op_t = - detail::for_each::op_wrapper_vectorized_t>; + auto* unwrapped_first = THRUST_NS_QUALIFIER::unwrap_contiguous_iterator(first); + using wrapped_op_t = detail::for_each::op_wrapper_vectorized_t>; if (is_aligned(unwrapped_first)) { // Vectorize loads @@ -587,14 +586,12 @@ private: CUB_RUNTIME_FUNCTION static cudaError_t ForEachNNoNVTX(RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {}) { - using offset_t = NumItemsT; - using use_vectorization_t = ::cuda::std::integral_constant; - + using offset_t = NumItemsT; // Disable auto-vectorization for now: // constexpr bool use_vectorization = // detail::for_each::can_regain_copy_freedom, OpT>::value // && THRUST_NS_QUALIFIER::is_contiguous_iterator::value; - + using use_vectorization_t = ::cuda::std::bool_constant; return for_each_n(first, num_items, op, stream, use_vectorization_t{}); } @@ -717,12 +714,8 @@ private: CUB_RUNTIME_FUNCTION static cudaError_t ForEachCopyNNoNVTX(RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {}) { - static_assert(THRUST_NS_QUALIFIER::is_contiguous_iterator::value, - "Iterator must be contiguous"); - using offset_t = NumItemsT; - using use_vectorization_t = ::cuda::std::integral_constant; - + using use_vectorization_t = THRUST_NS_QUALIFIER::is_contiguous_iterator; return for_each_n(first, num_items, op, stream, use_vectorization_t{}); } @@ -837,13 +830,8 @@ public: ForEachCopy(RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {}) { CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceFor::ForEachCopy"); - static_assert(THRUST_NS_QUALIFIER::is_contiguous_iterator::value, - "Iterator must be contiguous"); - - using offset_t = typename THRUST_NS_QUALIFIER::iterator_traits::difference_type; - + using offset_t = typename THRUST_NS_QUALIFIER::iterator_traits::difference_type; const auto num_items = static_cast(THRUST_NS_QUALIFIER::distance(first, last)); - return ForEachCopyNNoNVTX(first, num_items, op, stream); } }; diff --git a/cub/test/catch2_test_device_for.cu b/cub/test/catch2_test_device_for.cu index 62ccfec02c..e54eb3ebc8 100644 --- a/cub/test/catch2_test_device_for.cu +++ b/cub/test/catch2_test_device_for.cu @@ -29,6 +29,7 @@ // above header needs to be included first #include +#include #include #include @@ -246,3 +247,23 @@ CUB_TEST("Device for each n works with unaligned vectors", "[for][device]", offs REQUIRE(num_of_once_marked_items == num_items); } + +CUB_TEST("Device for each works with couting iterator", "[for][device]") +{ + using offset_t = int; + constexpr offset_t 
max_items = 5000000; + constexpr offset_t min_items = 1; + const offset_t num_items = GENERATE_COPY( + take(3, random(min_items, max_items)), + values({ + min_items, + max_items, + })); + + const auto it = cub::CountingInputIterator{0}; + c2h::device_vector counts(num_items); + device_for_each(it, it + num_items, incrementer_t{thrust::raw_pointer_cast(counts.data())}); + + const auto num_of_once_marked_items = static_cast(thrust::count(counts.begin(), counts.end(), 1)); + REQUIRE(num_of_once_marked_items == num_items); +} diff --git a/cub/test/catch2_test_device_for_copy.cu b/cub/test/catch2_test_device_for_copy.cu index 2263b3987e..fdb117eff6 100644 --- a/cub/test/catch2_test_device_for_copy.cu +++ b/cub/test/catch2_test_device_for_copy.cu @@ -29,6 +29,7 @@ // above header needs to be included first #include +#include #include #include @@ -186,3 +187,23 @@ CUB_TEST("Device for each n works with unaligned vectors", "[for_copy][device]", REQUIRE(num_of_once_marked_items == num_items); } + +CUB_TEST("Device for each works with couting iterator", "[for][device]") +{ + using offset_t = int; + constexpr offset_t max_items = 5000000; + constexpr offset_t min_items = 1; + const offset_t num_items = GENERATE_COPY( + take(3, random(min_items, max_items)), + values({ + min_items, + max_items, + })); + + const auto it = cub::CountingInputIterator{0}; + c2h::device_vector counts(num_items); + device_for_each_copy(it, it + num_items, incrementer_t{thrust::raw_pointer_cast(counts.data())}); + + const auto num_of_once_marked_items = static_cast(thrust::count(counts.begin(), counts.end(), 1)); + REQUIRE(num_of_once_marked_items == num_items); +} From cbce14b74a25abd2fcee5581d52b655e6f75e24b Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 14 Aug 2024 18:44:48 +0200 Subject: [PATCH 27/33] Configure CUB/Thrust for C++17 by default (#2217) --- cub/cmake/CubBuildTargetList.cmake | 10 +++++----- thrust/cmake/ThrustMultiConfig.cmake | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cub/cmake/CubBuildTargetList.cmake b/cub/cmake/CubBuildTargetList.cmake index 5277f59e99..2a0827a894 100644 --- a/cub/cmake/CubBuildTargetList.cmake +++ b/cub/cmake/CubBuildTargetList.cmake @@ -40,12 +40,12 @@ set(CUB_CPP_DIALECT_OPTIONS ) define_property(TARGET PROPERTY _CUB_DIALECT - BRIEF_DOCS "A target's C++ dialect: 11, 14, or 17." - FULL_DOCS "A target's C++ dialect: 11, 14, or 17." + BRIEF_DOCS "A target's C++ dialect: 11, 14, 17 or 20." + FULL_DOCS "A target's C++ dialect: 11, 14, 17 or 20." ) define_property(TARGET PROPERTY _CUB_PREFIX - BRIEF_DOCS "A prefix describing the config, eg. 'cub.cpp14'." - FULL_DOCS "A prefix describing the config, eg. 'cub.cpp14'." + BRIEF_DOCS "A prefix describing the config, eg. 'cub.cpp17'." + FULL_DOCS "A prefix describing the config, eg. 'cub.cpp17'." 
) function(cub_set_target_properties target_name dialect prefix) @@ -134,7 +134,7 @@ function(cub_build_target_list) foreach (dialect IN LISTS CUB_CPP_DIALECT_OPTIONS) # Create CMake options: set(default_value OFF) - if (dialect EQUAL 14) # Default to just 14 on: + if (dialect EQUAL 17) # Default to just 17 on: set(default_value ON) endif() option(CUB_ENABLE_DIALECT_CPP${dialect} diff --git a/thrust/cmake/ThrustMultiConfig.cmake b/thrust/cmake/ThrustMultiConfig.cmake index aa9fc02266..46bffc761c 100644 --- a/thrust/cmake/ThrustMultiConfig.cmake +++ b/thrust/cmake/ThrustMultiConfig.cmake @@ -15,7 +15,7 @@ function(thrust_configure_multiconfig) # Handle dialect options: foreach (dialect IN LISTS THRUST_CPP_DIALECT_OPTIONS) set(default_value OFF) - if (dialect EQUAL 14) # Default to just 14 on: + if (dialect EQUAL 17) # Default to just 17 on: set(default_value ON) endif() option(THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${dialect} @@ -112,7 +112,7 @@ function(thrust_configure_multiconfig) set_property(CACHE THRUST_DEVICE_SYSTEM PROPERTY TYPE STRING) endif() - set(THRUST_CPP_DIALECT 14 + set(THRUST_CPP_DIALECT 17 CACHE STRING "The C++ standard to target: ${THRUST_CPP_DIALECT_OPTIONS}" ) set_property(CACHE THRUST_CPP_DIALECT From e42341248f5340bf46430e7af3987ec98011afe6 Mon Sep 17 00:00:00 2001 From: Stephen Nicholas Swatman Date: Thu, 15 Aug 2024 15:15:57 +0200 Subject: [PATCH 28/33] Allow installing components when downstream (#2096) In the @acts-project we adopt an (admittedly somewhat unconventional) build system in which software A depends on B, and B depends on CCCL. The setup is that we want to install B into a prefix, and then try to build A against B. The problem arises because we are using CMake to dynamically fetch CCCL using the so-called "FetchContent" mechanism, which downloads CCCL and then adds it as a subdirectory. The core problem is that installing software B which has included CCCL does not actually install CCCL in the same prefix, so software A cannot then load software B as CCCL is not installed. The reason this happens is that CMakeLists.txt:28 (at the time of writing) returns from the CMake configuration stage early, and leaves the CUB, Thrust, and libcudacxx directories unincluded (see lines 70 to 72). Although this is, again, an unconventional and rare scenario, it should be easy to add support for this kind of build, and I hope the CCCL devs would agree that it might be worth doing. In this commit, I remove the early return and replace it with additional if-statements. This commit should leave any existing workflows completely untouched, but should make it easier to use CCCL in the way we do in @acts-project. --- CMakeLists.txt | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d1ae64e2f5..198727dc5d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,17 +25,18 @@ endif() # Support adding CCCL to a parent project via add_subdirectory. if (NOT CCCL_TOPLEVEL_PROJECT) include(cmake/CCCLAddSubdir.cmake) - return() endif() # We require a higher cmake version for dev builds -cmake_minimum_required(VERSION 3.21) +if (CCCL_TOPLEVEL_PROJECT) + cmake_minimum_required(VERSION 3.21) +endif() -option(CCCL_ENABLE_LIBCUDACXX "Enable the libcu++ developer build." ON) -option(CCCL_ENABLE_CUB "Enable the CUB developer build." ON) -option(CCCL_ENABLE_THRUST "Enable the Thrust developer build." ON) -option(CCCL_ENABLE_TESTING "Enable CUDA C++ Core Library tests."
ON) -option(CCCL_ENABLE_EXAMPLES "Enable CUDA C++ Core Library examples." ON) +option(CCCL_ENABLE_LIBCUDACXX "Enable the libcu++ developer build." ${CCCL_TOPLEVEL_PROJECT}) +option(CCCL_ENABLE_CUB "Enable the CUB developer build." ${CCCL_TOPLEVEL_PROJECT}) +option(CCCL_ENABLE_THRUST "Enable the Thrust developer build." ${CCCL_TOPLEVEL_PROJECT}) +option(CCCL_ENABLE_TESTING "Enable CUDA C++ Core Library tests." ${CCCL_TOPLEVEL_PROJECT}) +option(CCCL_ENABLE_EXAMPLES "Enable CUDA C++ Core Library examples." ${CCCL_TOPLEVEL_PROJECT}) option(CCCL_ENABLE_BENCHMARKS "Enable CUDA C++ Core Library benchmarks." OFF) option(CCCL_ENABLE_UNSTABLE "Enable targets and developer build options for unstable projects." OFF) @@ -44,27 +45,28 @@ if (CCCL_ENABLE_UNSTABLE) option(CCCL_ENABLE_CUDAX "Enable the CUDA Experimental developer build." ON) endif() - include(CTest) enable_testing() -include(cmake/CCCLUtilities.cmake) # include this first -include(cmake/CCCLClangdCompileInfo.cmake) +if (CCCL_TOPLEVEL_PROJECT) + include(cmake/CCCLUtilities.cmake) # include this first + include(cmake/CCCLClangdCompileInfo.cmake) +endif() if (CCCL_ENABLE_LIBCUDACXX) - set(LIBCUDACXX_TOPLEVEL_PROJECT ON) + set(LIBCUDACXX_TOPLEVEL_PROJECT ${CCCL_TOPLEVEL_PROJECT}) endif() if (CCCL_ENABLE_CUB) - set(CUB_TOPLEVEL_PROJECT ON) + set(CUB_TOPLEVEL_PROJECT ${CCCL_TOPLEVEL_PROJECT}) endif() if (CCCL_ENABLE_THRUST) - set(THRUST_TOPLEVEL_PROJECT ON) + set(THRUST_TOPLEVEL_PROJECT ${CCCL_TOPLEVEL_PROJECT}) endif() if (CCCL_ENABLE_CUDAX) - set(cudax_TOPLEVEL_PROJECT ON) + set(cudax_TOPLEVEL_PROJECT ${CCCL_TOPLEVEL_PROJECT}) endif() add_subdirectory(libcudacxx) From 532ff47db0aeae4a3fd4a6e12514e89dc0550a31 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Thu, 15 Aug 2024 20:57:48 +0200 Subject: [PATCH 29/33] Rename the memory resources to drop the superfluous prefix `cuda_` (#2243) --- cudax/test/containers/uninitialized_buffer.cu | 4 +- ...ry_resource.h => device_memory_resource.h} | 69 +++++++++-------- ...y_resource.h => managed_memory_resource.h} | 77 +++++++++---------- ...ry_resource.h => pinned_memory_resource.h} | 77 +++++++++---------- libcudacxx/include/cuda/memory_resource | 6 +- .../allocate.pass.cpp | 2 +- .../equality.pass.cpp | 18 ++--- .../traits.pass.cpp | 2 +- .../allocate.pass.cpp | 2 +- .../equality.pass.cpp | 18 ++--- .../traits.pass.cpp | 2 +- .../allocate.pass.cpp | 2 +- .../equality.pass.cpp | 18 ++--- .../traits.pass.cpp | 2 +- 14 files changed, 150 insertions(+), 149 deletions(-) rename libcudacxx/include/cuda/__memory_resource/{cuda_memory_resource.h => device_memory_resource.h} (61%) rename libcudacxx/include/cuda/__memory_resource/{cuda_managed_memory_resource.h => managed_memory_resource.h} (58%) rename libcudacxx/include/cuda/__memory_resource/{cuda_pinned_memory_resource.h => pinned_memory_resource.h} (58%) rename libcudacxx/test/libcudacxx/cuda/memory_resource/{cuda_memory_resource => device_memory_resource}/allocate.pass.cpp (98%) rename libcudacxx/test/libcudacxx/cuda/memory_resource/{cuda_memory_resource => device_memory_resource}/equality.pass.cpp (88%) rename libcudacxx/test/libcudacxx/cuda/memory_resource/{cuda_memory_resource => device_memory_resource}/traits.pass.cpp (96%) rename libcudacxx/test/libcudacxx/cuda/memory_resource/{cuda_managed_memory_resource => managed_memory_resource}/allocate.pass.cpp (97%) rename libcudacxx/test/libcudacxx/cuda/memory_resource/{cuda_managed_memory_resource => managed_memory_resource}/equality.pass.cpp (84%) rename 
libcudacxx/test/libcudacxx/cuda/memory_resource/{cuda_pinned_memory_resource => managed_memory_resource}/traits.pass.cpp (95%) rename libcudacxx/test/libcudacxx/cuda/memory_resource/{cuda_pinned_memory_resource => pinned_memory_resource}/allocate.pass.cpp (97%) rename libcudacxx/test/libcudacxx/cuda/memory_resource/{cuda_pinned_memory_resource => pinned_memory_resource}/equality.pass.cpp (83%) rename libcudacxx/test/libcudacxx/cuda/memory_resource/{cuda_managed_memory_resource => pinned_memory_resource}/traits.pass.cpp (95%) diff --git a/cudax/test/containers/uninitialized_buffer.cu b/cudax/test/containers/uninitialized_buffer.cu index 73b2a93887..56872e0e54 100644 --- a/cudax/test/containers/uninitialized_buffer.cu +++ b/cudax/test/containers/uninitialized_buffer.cu @@ -61,7 +61,7 @@ TEMPLATE_TEST_CASE( static_assert(!cuda::std::is_copy_constructible::value, ""); static_assert(!cuda::std::is_copy_assignable::value, ""); - cuda::mr::cuda_memory_resource resource{}; + cuda::mr::device_memory_resource resource{}; SECTION("construction") { @@ -89,7 +89,7 @@ TEMPLATE_TEST_CASE( { static_assert(!cuda::std::is_copy_assignable::value, ""); { - cuda::mr::cuda_managed_memory_resource other_resource{}; + cuda::mr::managed_memory_resource other_resource{}; uninitialized_buffer input{other_resource, 42}; uninitialized_buffer buf{resource, 1337}; const auto* old_ptr = buf.data(); diff --git a/libcudacxx/include/cuda/__memory_resource/cuda_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/device_memory_resource.h similarity index 61% rename from libcudacxx/include/cuda/__memory_resource/cuda_memory_resource.h rename to libcudacxx/include/cuda/__memory_resource/device_memory_resource.h index 289dc8c8b3..02e367e041 100644 --- a/libcudacxx/include/cuda/__memory_resource/cuda_memory_resource.h +++ b/libcudacxx/include/cuda/__memory_resource/device_memory_resource.h @@ -39,20 +39,20 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_MR -//! @brief cuda_memory_resource uses `cudaMalloc` / `cudaFree` for allocation / deallocation. +//! @brief device_memory_resource uses `cudaMalloc` / `cudaFree` for allocation / deallocation. //! By default uses device 0 to allocate memory -class cuda_memory_resource +class device_memory_resource { private: int __device_id_{0}; public: - //! @brief default constructs a cuda_memory_resource allocating memory on device 0 - cuda_memory_resource() = default; + //! @brief default constructs a device_memory_resource allocating memory on device 0 + device_memory_resource() = default; - //! @brief default constructs a cuda_memory_resource allocating memory on device \p __device_id + //! @brief default constructs a device_memory_resource allocating memory on device \p __device_id //! @param __device_id The id of the device we are allocating memory on - constexpr cuda_memory_resource(const int __device_id) noexcept + constexpr device_memory_resource(const int __device_id) noexcept : __device_id_(__device_id) {} @@ -85,65 +85,65 @@ class cuda_memory_resource { // We need to ensure that the provided alignment matches the minimal provided alignment _LIBCUDACXX_ASSERT(__is_valid_alignment(__alignment), - "Invalid alignment passed to cuda_memory_resource::deallocate."); - _CCCL_ASSERT_CUDA_API(::cudaFree, "cuda_memory_resource::deallocate failed", __ptr); + "Invalid alignment passed to device_memory_resource::deallocate."); + _CCCL_ASSERT_CUDA_API(::cudaFree, "device_memory_resource::deallocate failed", __ptr); (void) __alignment; } - //! 
@brief Equality comparison with another \c cuda_memory_resource - //! @param __other The other \c cuda_memory_resource + //! @brief Equality comparison with another \c device_memory_resource + //! @param __other The other \c device_memory_resource //! @return true, if both resources hold the same device id - _CCCL_NODISCARD constexpr bool operator==(cuda_memory_resource const& __other) const noexcept + _CCCL_NODISCARD constexpr bool operator==(device_memory_resource const& __other) const noexcept { return __device_id_ == __other.__device_id_; } # if _CCCL_STD_VER <= 2017 - //! @brief Inequality comparison with another \c cuda_memory_resource - //! @param __other The other \c cuda_memory_resource + //! @brief Inequality comparison with another \c device_memory_resource + //! @param __other The other \c device_memory_resource //! @return true, if both resources hold different device id's - _CCCL_NODISCARD constexpr bool operator!=(cuda_memory_resource const& __other) const noexcept + _CCCL_NODISCARD constexpr bool operator!=(device_memory_resource const& __other) const noexcept { return __device_id_ != __other.__device_id_; } # endif // _CCCL_STD_VER <= 2017 - //! @brief Equality comparison between a \c cuda_memory_resource and another resource - //! @param __lhs The \c cuda_memory_resource + //! @brief Equality comparison between a \c device_memory_resource and another resource + //! @param __lhs The \c device_memory_resource //! @param __rhs The resource to compare to //! @return If the underlying types are equality comparable, returns the result of equality comparison of both //! resources. Otherwise, returns false. template - _CCCL_NODISCARD_FRIEND auto operator==(cuda_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) + _CCCL_NODISCARD_FRIEND auto operator==(device_memory_resource const& __lhs, _Resource const& __rhs) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) { - return resource_ref<>{const_cast(__lhs)} == resource_ref<>{const_cast<_Resource&>(__rhs)}; + return resource_ref<>{const_cast(__lhs)} == resource_ref<>{const_cast<_Resource&>(__rhs)}; } # if _CCCL_STD_VER <= 2017 - //! @copydoc cuda_memory_resource::operator==<_Resource>(cuda_memory_resource const&, _Resource const&) + //! @copydoc device_memory_resource::operator==<_Resource>(device_memory_resource const&, _Resource const&) template - _CCCL_NODISCARD_FRIEND auto operator==(_Resource const& __rhs, cuda_memory_resource const& __lhs) noexcept - _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) + _CCCL_NODISCARD_FRIEND auto operator==(_Resource const& __rhs, device_memory_resource const& __lhs) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) { - return resource_ref<>{const_cast(__lhs)} == resource_ref<>{const_cast<_Resource&>(__rhs)}; + return resource_ref<>{const_cast(__lhs)} == resource_ref<>{const_cast<_Resource&>(__rhs)}; } - //! @copydoc cuda_memory_resource::operator==<_Resource>(cuda_memory_resource const&, _Resource const&) + //! 
@copydoc device_memory_resource::operator==<_Resource>(device_memory_resource const&, _Resource const&) template - _CCCL_NODISCARD_FRIEND auto operator!=(cuda_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) + _CCCL_NODISCARD_FRIEND auto operator!=(device_memory_resource const& __lhs, _Resource const& __rhs) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) { - return resource_ref<>{const_cast(__lhs)} != resource_ref<>{const_cast<_Resource&>(__rhs)}; + return resource_ref<>{const_cast(__lhs)} != resource_ref<>{const_cast<_Resource&>(__rhs)}; } - //! @copydoc cuda_memory_resource::operator==<_Resource>(cuda_memory_resource const&, _Resource const&) + //! @copydoc device_memory_resource::operator==<_Resource>(device_memory_resource const&, _Resource const&) template - _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const& __rhs, cuda_memory_resource const& __lhs) noexcept - _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) + _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const& __rhs, device_memory_resource const& __lhs) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) { - return resource_ref<>{const_cast(__lhs)} != resource_ref<>{const_cast<_Resource&>(__rhs)}; + return resource_ref<>{const_cast(__lhs)} != resource_ref<>{const_cast<_Resource&>(__rhs)}; } # endif // _CCCL_STD_VER <= 2017 //! @brief Enables the \c device_accessible property - friend constexpr void get_property(cuda_memory_resource const&, device_accessible) noexcept {} + friend constexpr void get_property(device_memory_resource const&, device_accessible) noexcept {} //! @brief Checks whether the passed in alignment is valid static constexpr bool __is_valid_alignment(const size_t __alignment) noexcept @@ -151,7 +151,10 @@ class cuda_memory_resource return __alignment <= default_cuda_malloc_alignment && (default_cuda_malloc_alignment % __alignment == 0); } }; -static_assert(resource_with, ""); +static_assert(resource_with, ""); + +// For backward compatability +using cuda_memory_resource _LIBCUDACXX_DEPRECATED = device_memory_resource; _LIBCUDACXX_END_NAMESPACE_CUDA_MR diff --git a/libcudacxx/include/cuda/__memory_resource/cuda_managed_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/managed_memory_resource.h similarity index 58% rename from libcudacxx/include/cuda/__memory_resource/cuda_managed_memory_resource.h rename to libcudacxx/include/cuda/__memory_resource/managed_memory_resource.h index a8a42841de..d899ab95a2 100644 --- a/libcudacxx/include/cuda/__memory_resource/cuda_managed_memory_resource.h +++ b/libcudacxx/include/cuda/__memory_resource/managed_memory_resource.h @@ -38,8 +38,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_MR -//! @brief \c cuda_managed_memory_resource uses `cudaMallocManaged` / `cudaFree` for allocation / deallocation. -class cuda_managed_memory_resource +//! @brief \c managed_memory_resource uses `cudaMallocManaged` / `cudaFree` for allocation / deallocation. 
+class managed_memory_resource { private: unsigned int __flags_ = cudaMemAttachGlobal; @@ -47,10 +47,10 @@ class cuda_managed_memory_resource static constexpr unsigned int __available_flags = cudaMemAttachGlobal | cudaMemAttachHost; public: - constexpr cuda_managed_memory_resource(const unsigned int __flags = cudaMemAttachGlobal) noexcept + constexpr managed_memory_resource(const unsigned int __flags = cudaMemAttachGlobal) noexcept : __flags_(__flags & __available_flags) { - _LIBCUDACXX_ASSERT(__flags_ == __flags, "Unexpected flags passed to cuda_managed_memory_resource"); + _LIBCUDACXX_ASSERT(__flags_ == __flags, "Unexpected flags passed to managed_memory_resource"); } //! @brief Allocate CUDA unified memory of size at least \p __bytes. @@ -80,74 +80,70 @@ class cuda_managed_memory_resource { // We need to ensure that the provided alignment matches the minimal provided alignment _LIBCUDACXX_ASSERT(__is_valid_alignment(__alignment), - "Invalid alignment passed to cuda_managed_memory_resource::deallocate."); - _CCCL_ASSERT_CUDA_API(::cudaFree, "cuda_managed_memory_resource::deallocate failed", __ptr); + "Invalid alignment passed to managed_memory_resource::deallocate."); + _CCCL_ASSERT_CUDA_API(::cudaFree, "managed_memory_resource::deallocate failed", __ptr); (void) __alignment; } - //! @brief Equality comparison with another \c cuda_managed_memory_resource - //! @param __other The other \c cuda_managed_memory_resource - //! @return Whether both \c cuda_managed_memory_resource were constructed with the same flags - _CCCL_NODISCARD constexpr bool operator==(cuda_managed_memory_resource const& __other) const noexcept + //! @brief Equality comparison with another \c managed_memory_resource + //! @param __other The other \c managed_memory_resource + //! @return Whether both \c managed_memory_resource were constructed with the same flags + _CCCL_NODISCARD constexpr bool operator==(managed_memory_resource const& __other) const noexcept { return __flags_ == __other.__flags_; } # if _CCCL_STD_VER <= 2017 - //! @brief Inequality comparison with another \c cuda_managed_memory_resource - //! @param __other The other \c cuda_managed_memory_resource - //! @return Whether both \c cuda_managed_memory_resource were constructed with different flags - _CCCL_NODISCARD constexpr bool operator!=(cuda_managed_memory_resource const& __other) const noexcept + //! @brief Inequality comparison with another \c managed_memory_resource + //! @param __other The other \c managed_memory_resource + //! @return Whether both \c managed_memory_resource were constructed with different flags + _CCCL_NODISCARD constexpr bool operator!=(managed_memory_resource const& __other) const noexcept { return __flags_ != __other.__flags_; } # endif // _CCCL_STD_VER <= 2017 - //! @brief Equality comparison between a \c cuda_managed_memory_resource and another resource - //! @param __lhs The \c cuda_managed_memory_resource + //! @brief Equality comparison between a \c managed_memory_resource and another resource + //! @param __lhs The \c managed_memory_resource //! @param __rhs The resource to compare to //! @return If the underlying types are equality comparable, returns the result of equality comparison of both //! resources. Otherwise, returns false. 
template - _CCCL_NODISCARD_FRIEND auto operator==(cuda_managed_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) + _CCCL_NODISCARD_FRIEND auto operator==(managed_memory_resource const& __lhs, _Resource const& __rhs) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) { - return resource_ref<>{const_cast(__lhs)} - == resource_ref<>{const_cast<_Resource&>(__rhs)}; + return resource_ref<>{const_cast(__lhs)} == resource_ref<>{const_cast<_Resource&>(__rhs)}; } # if _CCCL_STD_VER <= 2017 - //! @copydoc cuda_managed_memory_resource::operator<_Resource>==(cuda_managed_memory_resource const&, _Resource + //! @copydoc managed_memory_resource::operator<_Resource>==(managed_memory_resource const&, _Resource //! const&) template - _CCCL_NODISCARD_FRIEND auto operator==(_Resource const& __rhs, cuda_managed_memory_resource const& __lhs) noexcept - _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) + _CCCL_NODISCARD_FRIEND auto operator==(_Resource const& __rhs, managed_memory_resource const& __lhs) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) { - return resource_ref<>{const_cast(__lhs)} - == resource_ref<>{const_cast<_Resource&>(__rhs)}; + return resource_ref<>{const_cast(__lhs)} == resource_ref<>{const_cast<_Resource&>(__rhs)}; } - //! @copydoc cuda_managed_memory_resource::operator<_Resource>==(cuda_managed_memory_resource const&, _Resource + //! @copydoc managed_memory_resource::operator<_Resource>==(managed_memory_resource const&, _Resource //! const&) template - _CCCL_NODISCARD_FRIEND auto operator!=(cuda_managed_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) + _CCCL_NODISCARD_FRIEND auto operator!=(managed_memory_resource const& __lhs, _Resource const& __rhs) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) { - return resource_ref<>{const_cast(__lhs)} - != resource_ref<>{const_cast<_Resource&>(__rhs)}; + return resource_ref<>{const_cast(__lhs)} != resource_ref<>{const_cast<_Resource&>(__rhs)}; } - //! @copydoc cuda_managed_memory_resource::operator<_Resource>==(cuda_managed_memory_resource const&, _Resource + //! @copydoc managed_memory_resource::operator<_Resource>==(managed_memory_resource const&, _Resource //! const&) template - _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const& __rhs, cuda_managed_memory_resource const& __lhs) noexcept - _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) + _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const& __rhs, managed_memory_resource const& __lhs) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) { - return resource_ref<>{const_cast(__lhs)} - != resource_ref<>{const_cast<_Resource&>(__rhs)}; + return resource_ref<>{const_cast(__lhs)} != resource_ref<>{const_cast<_Resource&>(__rhs)}; } # endif // _CCCL_STD_VER <= 2017 //! @brief Enables the \c device_accessible property - friend constexpr void get_property(cuda_managed_memory_resource const&, device_accessible) noexcept {} + friend constexpr void get_property(managed_memory_resource const&, device_accessible) noexcept {} //! @brief Enables the \c host_accessible property - friend constexpr void get_property(cuda_managed_memory_resource const&, host_accessible) noexcept {} + friend constexpr void get_property(managed_memory_resource const&, host_accessible) noexcept {} //! 
@brief Checks whether the passed in alignment is valid static constexpr bool __is_valid_alignment(const size_t __alignment) noexcept @@ -155,8 +151,11 @@ class cuda_managed_memory_resource return __alignment <= default_cuda_malloc_alignment && (default_cuda_malloc_alignment % __alignment == 0); } }; -static_assert(resource_with, ""); -static_assert(resource_with, ""); +static_assert(resource_with, ""); +static_assert(resource_with, ""); + +// For backward compatability +using cuda_managed_memory_resource _LIBCUDACXX_DEPRECATED = managed_memory_resource; _LIBCUDACXX_END_NAMESPACE_CUDA_MR diff --git a/libcudacxx/include/cuda/__memory_resource/cuda_pinned_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/pinned_memory_resource.h similarity index 58% rename from libcudacxx/include/cuda/__memory_resource/cuda_pinned_memory_resource.h rename to libcudacxx/include/cuda/__memory_resource/pinned_memory_resource.h index f8fc3a25ce..c33ad10235 100644 --- a/libcudacxx/include/cuda/__memory_resource/cuda_pinned_memory_resource.h +++ b/libcudacxx/include/cuda/__memory_resource/pinned_memory_resource.h @@ -39,8 +39,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_MR -//! @brief cuda_pinned_memory_resource uses `cudaMallocHost` / `cudaFreeHost` for allocation / deallocation. -class cuda_pinned_memory_resource +//! @brief pinned_memory_resource uses `cudaMallocHost` / `cudaFreeHost` for allocation / deallocation. +class pinned_memory_resource { private: unsigned int __flags_ = cudaHostAllocDefault; @@ -49,10 +49,10 @@ class cuda_pinned_memory_resource cudaHostAllocDefault | cudaHostAllocPortable | cudaHostAllocMapped | cudaHostAllocWriteCombined; public: - constexpr cuda_pinned_memory_resource(const unsigned int __flags = cudaHostAllocDefault) noexcept + constexpr pinned_memory_resource(const unsigned int __flags = cudaHostAllocDefault) noexcept : __flags_(__flags & __available_flags) { - _LIBCUDACXX_ASSERT(__flags_ == __flags, "Unexpected flags passed to cuda_pinned_memory_resource"); + _LIBCUDACXX_ASSERT(__flags_ == __flags, "Unexpected flags passed to pinned_memory_resource"); } //! @brief Allocate host memory of size at least \p __bytes. @@ -82,71 +82,67 @@ class cuda_pinned_memory_resource { // We need to ensure that the provided alignment matches the minimal provided alignment _LIBCUDACXX_ASSERT(__is_valid_alignment(__alignment), - "Invalid alignment passed to cuda_pinned_memory_resource::deallocate."); - _CCCL_ASSERT_CUDA_API(::cudaFreeHost, "cuda_pinned_memory_resource::deallocate failed", __ptr); + "Invalid alignment passed to pinned_memory_resource::deallocate."); + _CCCL_ASSERT_CUDA_API(::cudaFreeHost, "pinned_memory_resource::deallocate failed", __ptr); (void) __alignment; } - //! @brief Equality comparison with another \c cuda_pinned_memory_resource - //! @param __other The other \c cuda_pinned_memory_resource - //! @return Whether both \c cuda_pinned_memory_resource were constructed with the same flags - _CCCL_NODISCARD constexpr bool operator==(cuda_pinned_memory_resource const& __other) const noexcept + //! @brief Equality comparison with another \c pinned_memory_resource + //! @param __other The other \c pinned_memory_resource + //! @return Whether both \c pinned_memory_resource were constructed with the same flags + _CCCL_NODISCARD constexpr bool operator==(pinned_memory_resource const& __other) const noexcept { return __flags_ == __other.__flags_; } # if _CCCL_STD_VER <= 2017 - //! @brief Equality comparison with another \c cuda_pinned_memory_resource - //! 
@param __other The other \c cuda_pinned_memory_resource - //! @return Whether both \c cuda_pinned_memory_resource were constructed with different flags - _CCCL_NODISCARD constexpr bool operator!=(cuda_pinned_memory_resource const& __other) const noexcept + //! @brief Equality comparison with another \c pinned_memory_resource + //! @param __other The other \c pinned_memory_resource + //! @return Whether both \c pinned_memory_resource were constructed with different flags + _CCCL_NODISCARD constexpr bool operator!=(pinned_memory_resource const& __other) const noexcept { return __flags_ != __other.__flags_; } # endif // _CCCL_STD_VER <= 2017 - //! @brief Equality comparison between a \c cuda_pinned_memory_resource and another resource - //! @param __lhs The \c cuda_pinned_memory_resource + //! @brief Equality comparison between a \c pinned_memory_resource and another resource + //! @param __lhs The \c pinned_memory_resource //! @param __rhs The resource to compare to //! @return If the underlying types are equality comparable, returns the result of equality comparison of both //! resources. Otherwise, returns false. template - _CCCL_NODISCARD_FRIEND auto operator==(cuda_pinned_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) + _CCCL_NODISCARD_FRIEND auto operator==(pinned_memory_resource const& __lhs, _Resource const& __rhs) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) { - return resource_ref<>{const_cast(__lhs)} - == resource_ref<>{const_cast<_Resource&>(__rhs)}; + return resource_ref<>{const_cast(__lhs)} == resource_ref<>{const_cast<_Resource&>(__rhs)}; } # if _CCCL_STD_VER <= 2017 - //! @copydoc cuda_pinned_memory_resource::operator<_Resource>==(cuda_pinned_memory_resource const&, _Resource const&) + //! @copydoc pinned_memory_resource::operator<_Resource>==(pinned_memory_resource const&, _Resource const&) template - _CCCL_NODISCARD_FRIEND auto operator==(_Resource const& __rhs, cuda_pinned_memory_resource const& __lhs) noexcept - _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) + _CCCL_NODISCARD_FRIEND auto operator==(_Resource const& __rhs, pinned_memory_resource const& __lhs) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) { - return resource_ref<>{const_cast(__lhs)} - == resource_ref<>{const_cast<_Resource&>(__rhs)}; + return resource_ref<>{const_cast(__lhs)} == resource_ref<>{const_cast<_Resource&>(__rhs)}; } - //! @copydoc cuda_pinned_memory_resource::operator<_Resource>==(cuda_pinned_memory_resource const&, _Resource const&) + //! @copydoc pinned_memory_resource::operator<_Resource>==(pinned_memory_resource const&, _Resource const&) template - _CCCL_NODISCARD_FRIEND auto operator!=(cuda_pinned_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) + _CCCL_NODISCARD_FRIEND auto operator!=(pinned_memory_resource const& __lhs, _Resource const& __rhs) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) { - return resource_ref<>{const_cast(__lhs)} - != resource_ref<>{const_cast<_Resource&>(__rhs)}; + return resource_ref<>{const_cast(__lhs)} != resource_ref<>{const_cast<_Resource&>(__rhs)}; } - //! @copydoc cuda_pinned_memory_resource::operator<_Resource>==(cuda_pinned_memory_resource const&, _Resource const&) + //! 
@copydoc pinned_memory_resource::operator<_Resource>==(pinned_memory_resource const&, _Resource const&) template - _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const& __rhs, cuda_pinned_memory_resource const& __lhs) noexcept - _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) + _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const& __rhs, pinned_memory_resource const& __lhs) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(bool)(__different_resource) { - return resource_ref<>{const_cast(__lhs)} - != resource_ref<>{const_cast<_Resource&>(__rhs)}; + return resource_ref<>{const_cast(__lhs)} != resource_ref<>{const_cast<_Resource&>(__rhs)}; } # endif // _CCCL_STD_VER <= 2017 //! @brief Enables the \c device_accessible property - friend constexpr void get_property(cuda_pinned_memory_resource const&, device_accessible) noexcept {} + friend constexpr void get_property(pinned_memory_resource const&, device_accessible) noexcept {} //! @brief Enables the \c host_accessible property - friend constexpr void get_property(cuda_pinned_memory_resource const&, host_accessible) noexcept {} + friend constexpr void get_property(pinned_memory_resource const&, host_accessible) noexcept {} //! @brief Checks whether the passed in alignment is valid static constexpr bool __is_valid_alignment(const size_t __alignment) noexcept @@ -154,8 +150,11 @@ class cuda_pinned_memory_resource return __alignment <= default_cuda_malloc_host_alignment && (default_cuda_malloc_host_alignment % __alignment == 0); } }; -static_assert(resource_with, ""); -static_assert(resource_with, ""); +static_assert(resource_with, ""); +static_assert(resource_with, ""); + +// For backward compatability +using cuda_pinned_memory_resource _LIBCUDACXX_DEPRECATED = pinned_memory_resource; _LIBCUDACXX_END_NAMESPACE_CUDA_MR diff --git a/libcudacxx/include/cuda/memory_resource b/libcudacxx/include/cuda/memory_resource index a6aced1179..d3c1ae1f91 100644 --- a/libcudacxx/include/cuda/memory_resource +++ b/libcudacxx/include/cuda/memory_resource @@ -31,10 +31,10 @@ //! 
//!@endrst -#include -#include -#include +#include #include +#include +#include #include #include #include diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/allocate.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/allocate.pass.cpp similarity index 98% rename from libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/allocate.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/allocate.pass.cpp index 073de36074..51c4a5e830 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/allocate.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/allocate.pass.cpp @@ -30,7 +30,7 @@ void ensure_device_ptr(void* ptr) void test() { - cuda::mr::cuda_memory_resource res{}; + cuda::mr::device_memory_resource res{}; { // allocate / deallocate auto* ptr = res.allocate(42); diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/equality.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/equality.pass.cpp similarity index 88% rename from libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/equality.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/equality.pass.cpp index 50fd7476ba..770e0d71d7 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/equality.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/equality.pass.cpp @@ -67,9 +67,9 @@ static_assert(cuda::mr::async_resource_with, ""); @@ -77,15 +77,15 @@ static_assert(cuda::mr::resource, ""); void test() { - cuda::mr::cuda_memory_resource first{}; - { // comparison against a plain cuda_memory_resource - cuda::mr::cuda_memory_resource second{}; + cuda::mr::device_memory_resource first{}; + { // comparison against a plain device_memory_resource + cuda::mr::device_memory_resource second{}; assert(first == second); assert(!(first != second)); } - { // comparison against a cuda_memory_resource wrapped inside a resource_ref - cuda::mr::cuda_memory_resource second{}; + { // comparison against a device_memory_resource wrapped inside a resource_ref + cuda::mr::device_memory_resource second{}; cuda::mr::resource_ref second_ref{second}; assert(first == second_ref); assert(!(first != second_ref)); @@ -93,8 +93,8 @@ void test() assert(!(second_ref != first)); } - { // comparison against a cuda_memory_resource wrapped inside a resource_ref<> - cuda::mr::cuda_memory_resource second{}; + { // comparison against a device_memory_resource wrapped inside a resource_ref<> + cuda::mr::device_memory_resource second{}; cuda::mr::resource_ref<> second_ref{second}; assert(first == second_ref); assert(!(first != second_ref)); diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/traits.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/traits.pass.cpp similarity index 96% rename from libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/traits.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/traits.pass.cpp index a8ae126fce..d642b83bf0 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/traits.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/traits.pass.cpp @@ -15,7 +15,7 @@ #include #include -using resource = cuda::mr::cuda_memory_resource; +using 
resource = cuda::mr::device_memory_resource; static_assert(!cuda::std::is_trivial::value, ""); static_assert(!cuda::std::is_trivially_default_constructible::value, ""); static_assert(cuda::std::is_trivially_copy_constructible::value, ""); diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_managed_memory_resource/allocate.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/allocate.pass.cpp similarity index 97% rename from libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_managed_memory_resource/allocate.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/allocate.pass.cpp index 6e9fd76f8e..df0652d5a1 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_managed_memory_resource/allocate.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/allocate.pass.cpp @@ -30,7 +30,7 @@ void ensure_managed_ptr(void* ptr) void test(const unsigned int flag) { - cuda::mr::cuda_managed_memory_resource res{flag}; + cuda::mr::managed_memory_resource res{flag}; { // allocate / deallocate auto* ptr = res.allocate(42); diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_managed_memory_resource/equality.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/equality.pass.cpp similarity index 84% rename from libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_managed_memory_resource/equality.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/equality.pass.cpp index f2e14578f7..9acc1e3813 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_managed_memory_resource/equality.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/equality.pass.cpp @@ -57,29 +57,29 @@ static_assert(cuda::mr::async_resource>, static_assert(cuda::mr::async_resource>, ""); // test for cccl#2214: https://github.com/NVIDIA/cccl/issues/2214 -struct derived_managed_resource : cuda::mr::cuda_managed_memory_resource +struct derived_managed_resource : cuda::mr::managed_memory_resource { - using cuda::mr::cuda_managed_memory_resource::cuda_managed_memory_resource; + using cuda::mr::managed_memory_resource::managed_memory_resource; }; static_assert(cuda::mr::resource, ""); void test() { - cuda::mr::cuda_managed_memory_resource first{}; - { // comparison against a plain cuda_managed_memory_resource - cuda::mr::cuda_managed_memory_resource second{}; + cuda::mr::managed_memory_resource first{}; + { // comparison against a plain managed_memory_resource + cuda::mr::managed_memory_resource second{}; assert(first == second); assert(!(first != second)); } - { // comparison against a plain cuda_managed_memory_resource with a different flag set - cuda::mr::cuda_managed_memory_resource second{cudaMemAttachHost}; + { // comparison against a plain managed_memory_resource with a different flag set + cuda::mr::managed_memory_resource second{cudaMemAttachHost}; assert(!(first == second)); assert((first != second)); } - { // comparison against a cuda_managed_memory_resource wrapped inside a resource_ref<> - cuda::mr::cuda_managed_memory_resource second{}; + { // comparison against a managed_memory_resource wrapped inside a resource_ref<> + cuda::mr::managed_memory_resource second{}; assert(first == cuda::mr::resource_ref<>{second}); assert(!(first != cuda::mr::resource_ref<>{second})); assert(cuda::mr::resource_ref<>{second} == first); diff --git 
a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_pinned_memory_resource/traits.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/traits.pass.cpp similarity index 95% rename from libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_pinned_memory_resource/traits.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/traits.pass.cpp index 3909ac7238..02b9bd0294 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_pinned_memory_resource/traits.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/traits.pass.cpp @@ -15,7 +15,7 @@ #include #include -using resource = cuda::mr::cuda_pinned_memory_resource; +using resource = cuda::mr::managed_memory_resource; static_assert(!cuda::std::is_trivial::value, ""); static_assert(!cuda::std::is_trivially_default_constructible::value, ""); static_assert(cuda::std::is_trivially_copy_constructible::value, ""); diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_pinned_memory_resource/allocate.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/allocate.pass.cpp similarity index 97% rename from libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_pinned_memory_resource/allocate.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/allocate.pass.cpp index 7b9e374805..3ad0ae106b 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_pinned_memory_resource/allocate.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/allocate.pass.cpp @@ -30,7 +30,7 @@ void ensure_pinned_host_ptr(void* ptr) void test(const unsigned int flag) { - cuda::mr::cuda_pinned_memory_resource res{flag}; + cuda::mr::pinned_memory_resource res{flag}; { // allocate / deallocate auto* ptr = res.allocate(42); diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_pinned_memory_resource/equality.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/equality.pass.cpp similarity index 83% rename from libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_pinned_memory_resource/equality.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/equality.pass.cpp index dd480cc9f7..1d60ea1ecb 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_pinned_memory_resource/equality.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/equality.pass.cpp @@ -57,29 +57,29 @@ static_assert(cuda::mr::async_resource>, static_assert(cuda::mr::async_resource>, ""); // test for cccl#2214: https://github.com/NVIDIA/cccl/issues/2214 -struct derived_pinned_resource : cuda::mr::cuda_pinned_memory_resource +struct derived_pinned_resource : cuda::mr::pinned_memory_resource { - using cuda::mr::cuda_pinned_memory_resource::cuda_pinned_memory_resource; + using cuda::mr::pinned_memory_resource::pinned_memory_resource; }; static_assert(cuda::mr::resource, ""); void test() { - cuda::mr::cuda_pinned_memory_resource first{}; - { // comparison against a plain cuda_pinned_memory_resource - cuda::mr::cuda_pinned_memory_resource second{cudaHostAllocDefault}; + cuda::mr::pinned_memory_resource first{}; + { // comparison against a plain pinned_memory_resource + cuda::mr::pinned_memory_resource second{cudaHostAllocDefault}; assert(first == second); assert(!(first != second)); } - { // comparison against a plain cuda_pinned_memory_resource with a different flag set - 
cuda::mr::cuda_pinned_memory_resource second{cudaHostAllocPortable}; + { // comparison against a plain pinned_memory_resource with a different flag set + cuda::mr::pinned_memory_resource second{cudaHostAllocPortable}; assert(!(first == second)); assert((first != second)); } - { // comparison against a cuda_pinned_memory_resource wrapped inside a resource_ref<> - cuda::mr::cuda_pinned_memory_resource second{}; + { // comparison against a pinned_memory_resource wrapped inside a resource_ref<> + cuda::mr::pinned_memory_resource second{}; cuda::mr::resource_ref<> second_ref{second}; assert(first == second_ref); assert(!(first != second_ref)); diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_managed_memory_resource/traits.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/traits.pass.cpp similarity index 95% rename from libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_managed_memory_resource/traits.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/traits.pass.cpp index 299247ff2e..b0bbae9526 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_managed_memory_resource/traits.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/traits.pass.cpp @@ -15,7 +15,7 @@ #include #include -using resource = cuda::mr::cuda_managed_memory_resource; +using resource = cuda::mr::pinned_memory_resource; static_assert(!cuda::std::is_trivial::value, ""); static_assert(!cuda::std::is_trivially_default_constructible::value, ""); static_assert(cuda::std::is_trivially_copy_constructible::value, ""); From 16d4fd3c96225366c826f60e947ec1c472ef3082 Mon Sep 17 00:00:00 2001 From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com> Date: Fri, 16 Aug 2024 04:44:21 -0700 Subject: [PATCH 30/33] Fix and simplify (#2197) * Fix and simplify * Make logic for non-constant evaluation simpler in C++14 and greater in * Remove use of `std::` in `` Co-authored-by: Michael Schellenberger Costa * Change bitops tests to prevent constant folding of runtime checks * Move bit and split implementation details from main header * Remove volatile from tests in bitops * Make Windows happy by using `unsigned long` * Work around being unable to use {} in c++ constexpr functions * Add a 'default to constexpr' interpretation of is_constant_evaluated for internal use in bitops * Make windows happy by reusing the default to constexpr hack * Make bitops tests definitely actually do runtime * Move fallbacks into relevant headers * Fix fallbacks being guarded by MSVC ifdef. 
* Keep the license --------- Co-authored-by: Michael Schellenberger Costa --- libcudacxx/include/cuda/std/__bit/clz.h | 153 ++++ libcudacxx/include/cuda/std/__bit/ctz.h | 155 ++++ libcudacxx/include/cuda/std/__bit/popc.h | 118 +++ .../std/__type_traits/is_constant_evaluated.h | 10 + libcudacxx/include/cuda/std/bit | 348 +++++++- .../cuda/std/detail/libcxx/include/bit | 815 ------------------ .../bit/bitops.count/countl_one.pass.cpp | 77 +- .../bit/bitops.count/countl_zero.pass.cpp | 55 +- .../bit/bitops.count/countr_one.pass.cpp | 73 +- .../bit/bitops.count/countr_zero.pass.cpp | 75 +- .../bit/bitops.count/popcount.pass.cpp | 73 +- 11 files changed, 990 insertions(+), 962 deletions(-) create mode 100644 libcudacxx/include/cuda/std/__bit/clz.h create mode 100644 libcudacxx/include/cuda/std/__bit/ctz.h create mode 100644 libcudacxx/include/cuda/std/__bit/popc.h delete mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/bit diff --git a/libcudacxx/include/cuda/std/__bit/clz.h b/libcudacxx/include/cuda/std/__bit/clz.h new file mode 100644 index 0000000000..84dbcd686a --- /dev/null +++ b/libcudacxx/include/cuda/std/__bit/clz.h @@ -0,0 +1,153 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX__BIT_CLZ_H +#define _LIBCUDACXX__BIT_CLZ_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#if defined(_CCCL_COMPILER_MSVC) +# include +#endif + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __binary_clz2(uint64_t __x, int __c) +{ + return !!(~__x & 0x2) ^ __c; +} +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __binary_clz4(uint64_t __x, int __c) +{ + return __binary_clz2(__x >> 2 * !!(__x & 0xC), __c + 2 * !(__x & 0xC)); +} +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __binary_clz8(uint64_t __x, int __c) +{ + return __binary_clz4(__x >> 4 * !!(__x & 0xF0), __c + 4 * !(__x & 0xF0)); +} +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __binary_clz16(uint64_t __x, int __c) +{ + return __binary_clz8(__x >> 8 * !!(__x & 0xFF00), __c + 8 * !(__x & 0xFF00)); +} +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __binary_clz32(uint64_t __x, int __c) +{ + return __binary_clz16(__x >> 16 * !!(__x & 0xFFFF0000), __c + 16 * !(__x & 0xFFFF0000)); +} +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __binary_clz64(uint64_t __x) +{ + return __binary_clz32(__x >> 32 * !!(__x & 0xFFFFFFFF00000000), 32 * !(__x & 0xFFFFFFFF00000000)); +} + +#if !defined(_CCCL_COMPILER_MSVC) + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __constexpr_clz(uint32_t __x) noexcept +{ +# if defined(__CUDA_ARCH__) + return __binary_clz32(static_cast(__x), 0); // no device constexpr builtins +# else + return __builtin_clz(__x); +# endif +} + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __constexpr_clz(uint64_t __x) noexcept +{ +# if defined(__CUDA_ARCH__) + 
return __binary_clz64(__x); // no device constexpr builtins +# else + return __builtin_clzll(__x); +# endif +} + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_clz(uint32_t __x) noexcept +{ +# if _CCCL_STD_VER >= 2014 + if (!__libcpp_default_is_constant_evaluated()) + { + NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __clz(__x);), (return __builtin_clz(__x);)) + } +# endif + return __constexpr_clz(__x); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_clz(uint64_t __x) noexcept +{ +# if _CCCL_STD_VER >= 2014 + if (!__libcpp_default_is_constant_evaluated()) + { + NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __clzll(__x);), (return __builtin_clzll(__x);)) + } +# endif + return __constexpr_clz(__x); +} + +#else // defined(_CCCL_COMPILER_MSVC) + +// Precondition: __x != 0 +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_clz(uint32_t __x) +{ +# if !defined(__CUDA_ARCH__) + if (!__libcpp_default_is_constant_evaluated()) + { + unsigned long __where = 0; + if (_BitScanReverse(&__where, __x)) + { + return static_cast(31 - __where); + } + return 32; // Undefined Behavior. + } +# endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__) + + return __binary_clz32(static_cast(__x), 0); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_clz(uint64_t __x) +{ +# if !defined(__CUDA_ARCH__) + if (!__libcpp_default_is_constant_evaluated()) + { + unsigned long __where = 0; +# if defined(_LIBCUDACXX_HAS_BITSCAN64) + if (_BitScanReverse64(&__where, __x)) + { + return static_cast(63 - __where); + } +# else + // Win32 doesn't have _BitScanReverse64 so emulate it with two 32 bit calls. + if (_BitScanReverse(&__where, static_cast(__x >> 32))) + { + return static_cast(63 - (__where + 32)); + } + if (_BitScanReverse(&__where, static_cast(__x))) + { + return static_cast(63 - __where); + } +# endif + return 64; // Undefined Behavior. + } +# endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__) + + return __binary_clz64(static_cast(__x)); +} + +#endif + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX__BIT_CLZ_H diff --git a/libcudacxx/include/cuda/std/__bit/ctz.h b/libcudacxx/include/cuda/std/__bit/ctz.h new file mode 100644 index 0000000000..4715386921 --- /dev/null +++ b/libcudacxx/include/cuda/std/__bit/ctz.h @@ -0,0 +1,155 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX__BIT_CTZ_H +#define _LIBCUDACXX__BIT_CTZ_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#if defined(_CCCL_COMPILER_MSVC) +# include +#endif + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __binary_ctz2(uint64_t __x, int __c) noexcept +{ + return __c + !(__x & 0x1); +} +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __binary_ctz4(uint64_t __x, int __c) noexcept +{ + return __binary_ctz2(__x >> 2 * !(__x & 0x3), __c + 2 * !(__x & 0x3)); +} +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __binary_ctz8(uint64_t __x, int __c) noexcept +{ + return __binary_ctz4(__x >> 4 * !(__x & 0x0F), __c + 4 * !(__x & 0x0F)); +} +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __binary_ctz16(uint64_t __x, int __c) noexcept +{ + return __binary_ctz8(__x >> 8 * !(__x & 0x00FF), __c + 8 * !(__x & 0x00FF)); +} +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __binary_ctz32(uint64_t __x, int __c) noexcept +{ + return __binary_ctz16(__x >> 16 * !(__x & 0x0000FFFF), __c + 16 * !(__x & 0x0000FFFF)); +} +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __binary_ctz64(uint64_t __x) noexcept +{ + return __binary_ctz32(__x >> 32 * !(__x & 0x00000000FFFFFFFF), 32 * !(__x & 0x00000000FFFFFFFF)); +} + +#if !defined(_CCCL_COMPILER_MSVC) + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __constexpr_ctz(uint32_t __x) noexcept +{ +# if defined(__CUDA_ARCH__) + return __binary_ctz32(static_cast(__x), 0); // no device constexpr builtins +# else + return __builtin_ctz(__x); +# endif +} + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __constexpr_ctz(uint64_t __x) noexcept +{ +# if defined(__CUDA_ARCH__) + return __binary_ctz64(__x); // no device constexpr builtins +# else + return __builtin_ctzll(__x); +# endif +} + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_ctz(uint32_t __x) noexcept +{ +# if _CCCL_STD_VER >= 2014 + if (!__libcpp_default_is_constant_evaluated()) + { + NV_IF_ELSE_TARGET( + NV_IS_DEVICE, (return (!__x) ? (sizeof(uint32_t) * 8) : (__ffs(__x) - 1);), (return __builtin_ctz(__x);)) + } +# endif + return __constexpr_ctz(__x); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_ctz(uint64_t __x) noexcept +{ +# if _CCCL_STD_VER >= 2014 + if (!__libcpp_default_is_constant_evaluated()) + { + NV_IF_ELSE_TARGET( + NV_IS_DEVICE, (return (!__x) ? 
(sizeof(uint64_t) * 8) : (__ffsll(__x) - 1);), (return __builtin_ctzll(__x);)) + } +# endif + return __constexpr_ctz(__x); +} + +#else // defined(_CCCL_COMPILER_MSVC) + +// Precondition: __x != 0 +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_ctz(uint32_t __x) +{ +# if !defined(__CUDA_ARCH__) + if (!__libcpp_default_is_constant_evaluated()) + { + unsigned long __where = 0; + if (_BitScanForward(&__where, __x)) + { + return static_cast(__where); + } + return 32; + } +# endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__) + + return __binary_ctz32(static_cast(__x), 0); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_ctz(uint64_t __x) +{ +# if !defined(__CUDA_ARCH__) + if (!__libcpp_default_is_constant_evaluated()) + { + unsigned long __where = 0; +# if defined(_LIBCUDACXX_HAS_BITSCAN64) && (defined(_M_AMD64) || defined(__x86_64__)) + if (_BitScanForward64(&__where, __x)) + { + return static_cast(__where); + } +# else + // Win32 doesn't have _BitScanForward64 so emulate it with two 32 bit calls. + if (_BitScanForward(&__where, static_cast(__x))) + { + return static_cast(__where); + } + if (_BitScanForward(&__where, static_cast(__x >> 32))) + { + return static_cast(__where + 32); + } +# endif + return 64; + } +# endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__) + + return __binary_ctz64(__x); +} + +#endif + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX__BIT_CTZ_H diff --git a/libcudacxx/include/cuda/std/__bit/popc.h b/libcudacxx/include/cuda/std/__bit/popc.h new file mode 100644 index 0000000000..23b24a2bb0 --- /dev/null +++ b/libcudacxx/include/cuda/std/__bit/popc.h @@ -0,0 +1,118 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX__BIT_POPC_H +#define _LIBCUDACXX__BIT_POPC_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#if defined(_CCCL_COMPILER_MSVC) +# include +#endif + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __fallback_popc8(uint64_t __x) +{ + return static_cast((__x * 0x0101010101010101) >> 56); +} +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __fallback_popc16(uint64_t __x) +{ + return __fallback_popc8((__x + (__x >> 4)) & 0x0f0f0f0f0f0f0f0f); +} +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __fallback_popc32(uint64_t __x) +{ + return __fallback_popc16((__x & 0x3333333333333333) + ((__x >> 2) & 0x3333333333333333)); +} +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __fallback_popc64(uint64_t __x) +{ + return __fallback_popc32(__x - ((__x >> 1) & 0x5555555555555555)); +} + +#if !defined(_CCCL_COMPILER_MSVC) + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __constexpr_popcount(uint32_t __x) noexcept +{ +# if defined(__CUDA_ARCH__) + return __fallback_popc64(static_cast(__x)); // no device constexpr builtins +# else + return __builtin_popcount(__x); +# endif +} + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __constexpr_popcount(uint64_t __x) noexcept +{ +# if defined(__CUDA_ARCH__) + return __fallback_popc64(static_cast(__x)); // no device constexpr builtins +# else + return __builtin_popcountll(__x); +# endif +} + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_popc(uint32_t __x) noexcept +{ +# if _CCCL_STD_VER >= 2014 + if (!__libcpp_default_is_constant_evaluated()) + { + NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __popc(__x);), (return __builtin_popcount(__x);)) + } +# endif + return __constexpr_popcount(static_cast(__x)); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_popc(uint64_t __x) noexcept +{ +# if _CCCL_STD_VER >= 2014 + if (!__libcpp_default_is_constant_evaluated()) + { + NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __popcll(__x);), (return __builtin_popcountll(__x);)) + } +# endif + return __constexpr_popcount(static_cast(__x)); +} + +#else // defined(_CCCL_COMPILER_MSVC) + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_popc(uint32_t __x) +{ + if (!__libcpp_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_HOST, (return static_cast(__popcnt(__x));)) + } + + return __fallback_popc64(static_cast(__x)); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_popc(uint64_t __x) +{ + if (!__libcpp_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_HOST, (return static_cast(__popcnt64(__x));)) + } + + return __fallback_popc64(static_cast(__x)); +} + +#endif // MSVC + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX__BIT_POPC_H diff --git a/libcudacxx/include/cuda/std/__type_traits/is_constant_evaluated.h b/libcudacxx/include/cuda/std/__type_traits/is_constant_evaluated.h index 6d667ab45f..577561a6b2 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_constant_evaluated.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_constant_evaluated.h @@ -12,6 +12,8 @@ #include +#include "cuda/std/detail/libcxx/include/__config" + #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif 
defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -32,11 +34,19 @@ inline constexpr _LIBCUDACXX_INLINE_VISIBILITY bool __libcpp_is_constant_evaluat { return _LIBCUDACXX_IS_CONSTANT_EVALUATED(); } +inline constexpr _LIBCUDACXX_INLINE_VISIBILITY bool __libcpp_default_is_constant_evaluated() noexcept +{ + return _LIBCUDACXX_IS_CONSTANT_EVALUATED(); +} #else // ^^^ _LIBCUDACXX_IS_CONSTANT_EVALUATED ^^^ / vvv !_LIBCUDACXX_IS_CONSTANT_EVALUATED vvv inline constexpr _LIBCUDACXX_INLINE_VISIBILITY bool __libcpp_is_constant_evaluated() noexcept { return false; } +inline constexpr _LIBCUDACXX_INLINE_VISIBILITY bool __libcpp_default_is_constant_evaluated() noexcept +{ + return true; +} #endif // !_LIBCUDACXX_IS_CONSTANT_EVALUATED _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/bit b/libcudacxx/include/cuda/std/bit index 0460e078d7..9106fa588f 100644 --- a/libcudacxx/include/cuda/std/bit +++ b/libcudacxx/include/cuda/std/bit @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// @@ -21,9 +21,353 @@ # pragma system_header #endif // no system header +#include +#include +#include +#include +#include +#include +#include // all public C++ headers provide the assertion handler +#include +#include +#include + _CCCL_PUSH_MACROS -#include +#if defined(_CCCL_COMPILER_IBM) +# include +#endif // _CCCL_COMPILER_IBM + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr _Tp __rotl(_Tp __t, uint32_t __cnt) noexcept +{ + static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__rotl requires unsigned"); + using __nlt = numeric_limits<_Tp>; + + return ((__cnt % __nlt::digits) == 0) + ? __t + : (__t << (__cnt % __nlt::digits)) | (__t >> (__nlt::digits - (__cnt % __nlt::digits))); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr _Tp __rotr(_Tp __t, uint32_t __cnt) noexcept +{ + static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__rotr requires unsigned"); + using __nlt = numeric_limits<_Tp>; + + return ((__cnt % __nlt::digits) == 0) + ? __t + : (__t >> (__cnt % __nlt::digits)) | (__t << (__nlt::digits - (__cnt % __nlt::digits))); +} + +// Forward decl for recursive use in split word operations +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr int __countr_zero(_Tp __t) noexcept; + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t +__countr_zero_dispatch(_Tp __t) noexcept +{ + return __libcpp_ctz(static_cast(__t)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t +__countr_zero_dispatch(_Tp __t) noexcept +{ + return __libcpp_ctz(static_cast(__t)); +} + +template +struct __countr_zero_rsh_impl +{ + static _LIBCUDACXX_INLINE_VISIBILITY constexpr int __short_circuit(_Tp __t, int __cur, int __count) + { + // Stops processing early if non-zero + return (__cur == numeric_limits::digits) + ? 
__countr_zero_rsh_impl<_Tp, _St - 1>::__count(__t, __cur + __count) + : __cur + __count; + } + + static _LIBCUDACXX_INLINE_VISIBILITY constexpr int __count(_Tp __t, int __count) + { + return __short_circuit(__t >> numeric_limits::digits, __countr_zero(static_cast(__t)), __count); + } +}; + +template +struct __countr_zero_rsh_impl<_Tp, 1> +{ + static _LIBCUDACXX_INLINE_VISIBILITY constexpr int __count(_Tp __t, int __count) + { + return __count + __countr_zero(static_cast(__t)); + } +}; + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<(sizeof(_Tp) > sizeof(uint64_t)), int> +__countr_zero_dispatch(_Tp __t) noexcept +{ + return __countr_zero_rsh_impl<_Tp>::__count(__t, 0); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr int __countr_zero(_Tp __t) noexcept +{ + static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countr_zero requires unsigned"); + + return __t ? __countr_zero_dispatch(__t) : numeric_limits<_Tp>::digits; +} + +// Forward decl for recursive use in split word operations +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr int __countl_zero(_Tp __t) noexcept; + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t +__countl_zero_dispatch(_Tp __t) noexcept +{ + return __libcpp_clz(static_cast(__t)) - (numeric_limits::digits - numeric_limits<_Tp>::digits); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t +__countl_zero_dispatch(_Tp __t) noexcept +{ + return __libcpp_clz(static_cast(__t)) - (numeric_limits::digits - numeric_limits<_Tp>::digits); +} + +template +struct __countl_zero_rotl_impl +{ + static _LIBCUDACXX_INLINE_VISIBILITY constexpr int __short_circuit(_Tp __t, int __cur) + { + // This stops processing early if the current word is not empty + return (__cur == numeric_limits::digits) + ? __cur + __countl_zero_rotl_impl<_Tp, _St - 1>::__count(__t) + : __cur; + } + + static _LIBCUDACXX_INLINE_VISIBILITY constexpr int __countl_iter(_Tp __t) + { + // After rotating pass result of clz to another step for processing + return __short_circuit(__t, __countl_zero(static_cast(__t))); + } + + static _LIBCUDACXX_INLINE_VISIBILITY constexpr int __count(_Tp __t) + { + return __countl_iter(__rotl(__t, numeric_limits::digits)); + } +}; + +template +struct __countl_zero_rotl_impl<_Tp, 1> +{ + static _LIBCUDACXX_INLINE_VISIBILITY constexpr int __count(_Tp __t) + { + return __countl_zero(static_cast(__rotl(__t, numeric_limits::digits))); + } +}; + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<(sizeof(_Tp) > sizeof(uint64_t)), int> +__countl_zero_dispatch(_Tp __t) noexcept +{ + return __countl_zero_rotl_impl<_Tp>::__count(__t); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr int __countl_zero(_Tp __t) noexcept +{ + static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countl_zero requires unsigned"); + return __t ? __countl_zero_dispatch(__t) : numeric_limits<_Tp>::digits; +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr int __countl_one(_Tp __t) noexcept +{ + static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countl_one requires unsigned"); + return __t != numeric_limits<_Tp>::max() ? __countl_zero(static_cast<_Tp>(~__t)) : numeric_limits<_Tp>::digits; +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr int __countr_one(_Tp __t) noexcept +{ + static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countr_one requires unsigned"); + return __t != numeric_limits<_Tp>::max() ? 
__countr_zero(static_cast<_Tp>(~__t)) : numeric_limits<_Tp>::digits; +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t +__popcount_dispatch(_Tp __t) noexcept +{ + return __libcpp_popc(static_cast(__t)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t +__popcount_dispatch(_Tp __t) noexcept +{ + return __libcpp_popc(static_cast(__t)); +} + +template +struct __popcount_rsh_impl +{ + static _LIBCUDACXX_INLINE_VISIBILITY constexpr int __count(_Tp __t) + { + return __popcount_rsh_impl<_Tp, _St - 1>::__count(__t >> numeric_limits::digits) + + __libcpp_popc(static_cast(__t)); + } +}; + +template +struct __popcount_rsh_impl<_Tp, 1> +{ + static _LIBCUDACXX_INLINE_VISIBILITY constexpr int __count(_Tp __t) + { + return __libcpp_popc(static_cast(__t)); + } +}; + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<(sizeof(_Tp) > sizeof(uint64_t)), int> +__popcount_dispatch(_Tp __t) noexcept +{ + return __popcount_rsh_impl<_Tp>::__count(__t); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr int __popcount(_Tp __t) noexcept +{ + static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__libcpp_popcount requires unsigned"); + + return __popcount_dispatch(__t); +} + +// integral log base 2 +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr uint32_t __bit_log2(_Tp __t) noexcept +{ + static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__bit_log2 requires unsigned"); + return numeric_limits<_Tp>::digits - 1 - __countl_zero(__t); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr bool __has_single_bit(_Tp __t) noexcept +{ + static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__has_single_bit requires unsigned"); + return __t != 0 && (((__t & (__t - 1)) == 0)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t= sizeof(uint32_t), _Tp> __ceil2(_Tp __t) noexcept +{ + return _Tp{1} << (numeric_limits<_Tp>::digits - __countl_zero((_Tp) (__t - 1u))); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t __ceil2(_Tp __t) noexcept +{ + return (_Tp) ((1u << ((numeric_limits<_Tp>::digits - __countl_zero((_Tp) (__t - 1u))) + + (numeric_limits::digits - numeric_limits<_Tp>::digits))) + >> (numeric_limits::digits - numeric_limits<_Tp>::digits)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> +rotl(_Tp __t, uint32_t __cnt) noexcept +{ + return __rotl(__t, __cnt); +} + +// rotr +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> +rotr(_Tp __t, uint32_t __cnt) noexcept +{ + return __rotr(__t, __cnt); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> +countl_zero(_Tp __t) noexcept +{ + return __countl_zero(__t); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> +countl_one(_Tp __t) noexcept +{ + return __countl_one(__t); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> +countr_zero(_Tp __t) noexcept +{ + return __countr_zero(__t); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> +countr_one(_Tp __t) noexcept +{ + return __countr_one(__t); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> +popcount(_Tp __t) noexcept +{ + return __popcount(__t); +} + +template 
+_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, bool> +has_single_bit(_Tp __t) noexcept +{ + return __has_single_bit(__t); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> +bit_floor(_Tp __t) noexcept +{ + return __t == 0 ? 0 : static_cast<_Tp>(_Tp{1} << __bit_log2(__t)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> +bit_ceil(_Tp __t) noexcept +{ + return (__t < 2) ? 1 : static_cast<_Tp>(__ceil2(__t)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> +bit_width(_Tp __t) noexcept +{ + return __t == 0 ? 0 : static_cast<_Tp>(__bit_log2(__t) + 1); +} + +enum class endian +{ + little = 0xDEAD, + big = 0xFACE, +#if defined(_LIBCUDACXX_LITTLE_ENDIAN) + native = little +#elif defined(_LIBCUDACXX_BIG_ENDIAN) + native = big +#else + native = 0xCAFE +#endif +}; + +_LIBCUDACXX_END_NAMESPACE_STD _CCCL_POP_MACROS diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/bit b/libcudacxx/include/cuda/std/detail/libcxx/include/bit deleted file mode 100644 index 641a743832..0000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/bit +++ /dev/null @@ -1,815 +0,0 @@ -// -*- C++ -*- -//===------------------------------ bit ----------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===---------------------------------------------------------------------===// - -#ifndef _LIBCUDACXX_BIT -#define _LIBCUDACXX_BIT - -/* - bit synopsis - -namespace std { - - template - constexpr bool has_single_bit(T x) noexcept; // C++20 - template - constexpr T bit_ceil(T x); // C++20 - template - constexpr T bit_floor(T x) noexcept; // C++20 - template - constexpr T bit_width(T x) noexcept; // C++20 - - // 23.20.2, rotating - template - constexpr T rotl(T x, unsigned int s) noexcept; // C++20 - template - constexpr T rotr(T x, unsigned int s) noexcept; // C++20 - - // 23.20.3, counting - template - constexpr int countl_zero(T x) noexcept; // C++20 - template - constexpr int countl_one(T x) noexcept; // C++20 - template - constexpr int countr_zero(T x) noexcept; // C++20 - template - constexpr int countr_one(T x) noexcept; // C++20 - template - constexpr int popcount(T x) noexcept; // C++20 - - // 20.15.9, endian - enum class endian { - little = see below, // C++20 - big = see below, // C++20 - native = see below // C++20 -}; - -} // namespace std - -*/ - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include -#include // all public C++ headers provide the assertion handler -#include -#include -#include -#include - -_CCCL_PUSH_MACROS - -#if defined(_CCCL_COMPILER_MSVC) -# include -#endif // _CCCL_COMPILER_MSVC - -#if defined(_CCCL_COMPILER_IBM) -# include -#endif // _CCCL_COMPILER_IBM - -_LIBCUDACXX_BEGIN_NAMESPACE_STD - -#define _LIBCUDACXX_BIT_CONSTEXPR constexpr - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __binary_ctz2(uint64_t __x, int __c) noexcept -{ - return (__x & 0x1) ? 
__c : __c + 1; -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __binary_ctz4(uint64_t __x, int __c) noexcept -{ - return __binary_ctz2(__x >> 2 * !(__x & 0x3), __c + 2 * !(__x & 0x3)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __binary_ctz8(uint64_t __x, int __c) noexcept -{ - return __binary_ctz4(__x >> 4 * !(__x & 0x0F), __c + 4 * !(__x & 0x0F)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __binary_ctz16(uint64_t __x, int __c) noexcept -{ - return __binary_ctz8(__x >> 8 * !(__x & 0x00FF), __c + 8 * !(__x & 0x00FF)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __binary_ctz32(uint64_t __x, int __c) noexcept -{ - return __binary_ctz16(__x >> 16 * !(__x & 0x0000FFFF), __c + 16 * !(__x & 0x0000FFFF)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __binary_ctz64(uint64_t __x) noexcept -{ - return __binary_ctz32(__x >> 32 * !(__x & 0x00000000FFFFFFFF), 32 * !(__x & 0x00000000FFFFFFFF)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __binary_clz2(uint64_t __x, int __c) -{ - return !!(~__x & 0x2) ^ __c; -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __binary_clz4(uint64_t __x, int __c) -{ - return __binary_clz2(__x >> 2 * !!(__x & 0xC), __c + 2 * !(__x & 0xC)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __binary_clz8(uint64_t __x, int __c) -{ - return __binary_clz4(__x >> 4 * !!(__x & 0xF0), __c + 4 * !(__x & 0xF0)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __binary_clz16(uint64_t __x, int __c) -{ - return __binary_clz8(__x >> 8 * !!(__x & 0xFF00), __c + 8 * !(__x & 0xFF00)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __binary_clz32(uint64_t __x, int __c) -{ - return __binary_clz16(__x >> 16 * !!(__x & 0xFFFF0000), __c + 16 * !(__x & 0xFFFF0000)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __binary_clz64(uint64_t __x) -{ - return __binary_clz32(__x >> 32 * !!(__x & 0xFFFFFFFF00000000), 32 * !(__x & 0xFFFFFFFF00000000)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __fallback_popc8(uint64_t __x) -{ - return static_cast((__x * 0x0101010101010101) >> 56); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __fallback_popc16(uint64_t __x) -{ - return __fallback_popc8((__x + (__x >> 4)) & 0x0f0f0f0f0f0f0f0f); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __fallback_popc32(uint64_t __x) -{ - return __fallback_popc16((__x & 0x3333333333333333) + ((__x >> 2) & 0x3333333333333333)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __fallback_popc64(uint64_t __x) -{ - return __fallback_popc32(__x - ((__x >> 1) & 0x5555555555555555)); -} - -#ifndef _CCCL_COMPILER_MSVC - -inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_ctz(unsigned __x) noexcept -{ -# if defined(_CCCL_COMPILER_NVRTC) || (defined(_CCCL_CUDACC_BELOW_11_3)) -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - if (!__libcpp_is_constant_evaluated()) - { - NV_IF_ELSE_TARGET( - NV_IS_DEVICE, (return (!__x) ? 
sizeof(unsigned) * 8 : __ffs(__x) - 1;), (return __builtin_ctz(__x);)) - } -# endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - - return __binary_ctz32(static_cast(__x), 0); -# else // ^^^ _CCCL_COMPILER_NVRTC || nvcc < 11.3 ^^^ / vvv !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 vvv - return __builtin_ctz(__x); -# endif // !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __libcpp_ctz(unsigned long __x) noexcept -{ -# if defined(_CCCL_COMPILER_NVRTC) || (defined(_CCCL_CUDACC_BELOW_11_3)) -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - if (!__libcpp_is_constant_evaluated()) - { - NV_IF_ELSE_TARGET( - NV_IS_DEVICE, (return (!__x) ? sizeof(unsigned long) * 8 : __ffsll(__x) - 1;), (return __builtin_ctzl(__x);)) - } -# endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - - return __binary_ctz64(static_cast(__x)); -# else // ^^^ _CCCL_COMPILER_NVRTC || nvcc < 11.3 ^^^ / vvv !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 vvv - return __builtin_ctzl(__x); -# endif // !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __libcpp_ctz(unsigned long long __x) noexcept -{ -// For whatever reason __builtin_ctzll does not compile although it should -# if 1 // def _CCCL_COMPILER_NVRTC -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - if (!__libcpp_is_constant_evaluated()) - { - NV_IF_ELSE_TARGET(NV_IS_DEVICE, - (return (!__x) ? sizeof(unsigned long long) * 8 : __ffsll(__x) - 1;), - (return __builtin_ctzll(__x);)) - } -# endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - - return __binary_ctz64(static_cast(__x)); -# else // 0 - return __builtin_ctzll(__x); -# endif // !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __libcpp_clz(unsigned __x) noexcept -{ -# if defined(_CCCL_COMPILER_NVRTC) || (defined(_CCCL_CUDACC_BELOW_11_3)) -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - if (!__libcpp_is_constant_evaluated()) - { - NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __clz(__x);), (return __builtin_clz(__x);)) - } -# endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - return __binary_clz32(static_cast(__x), 0); -# else // ^^^ _CCCL_COMPILER_NVRTC || nvcc < 11.3 ^^^ / vvv !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 vvv - return __builtin_clz(__x); -# endif // !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __libcpp_clz(unsigned long __x) noexcept -{ -# if defined(_CCCL_COMPILER_NVRTC) || (defined(_CCCL_CUDACC_BELOW_11_3)) -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - if (!__libcpp_is_constant_evaluated()) - { - NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __clzll(__x);), (return __builtin_clzl(__x);)) - } -# endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - - return __binary_clz64(static_cast(__x)); -# else // ^^^ _CCCL_COMPILER_NVRTC || nvcc < 11.3 ^^^ / vvv !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 vvv - return __builtin_clzl(__x); -# endif // !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __libcpp_clz(unsigned long long __x) noexcept -{ -# if defined(_CCCL_COMPILER_NVRTC) || (defined(_CCCL_CUDACC_BELOW_11_3)) -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 
2014) - if (!__libcpp_is_constant_evaluated()) - { - NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __clzll(__x);), (return __builtin_clzll(__x);)) - } -# endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - - return __binary_clz64(static_cast(__x)); -# else // ^^^ _CCCL_COMPILER_NVRTC || nvcc < 11.3 ^^^ / vvv !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 vvv - return __builtin_clzll(__x); -# endif // !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __libcpp_popcount(unsigned __x) noexcept -{ -# if defined(_CCCL_COMPILER_NVRTC) || (defined(_CCCL_CUDACC_BELOW_11_3)) -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - if (!__libcpp_is_constant_evaluated()) - { - NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __popc(__x);), (return __builtin_popcount(__x);)) - } -# endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - - return __fallback_popc64(static_cast(__x)); -# else // ^^^ _CCCL_COMPILER_NVRTC || nvcc < 11.3 ^^^ / vvv !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 vvv - return __builtin_popcount(__x); -# endif // !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __libcpp_popcount(unsigned long __x) noexcept -{ -# if defined(_CCCL_COMPILER_NVRTC) || (defined(_CCCL_CUDACC_BELOW_11_3)) -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - if (!__libcpp_is_constant_evaluated()) - { - NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __popcll(__x);), (return __builtin_popcountl(__x);)) - } -# endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - - return __fallback_popc64(static_cast(__x)); -# else // ^^^ _CCCL_COMPILER_NVRTC || nvcc < 11.3 ^^^ / vvv !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 vvv - return __builtin_popcountl(__x); -# endif // !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 -} - -inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __libcpp_popcount(unsigned long long __x) noexcept -{ -# if defined(_CCCL_COMPILER_NVRTC) || (defined(_CCCL_CUDACC_BELOW_11_3)) -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - if (!__libcpp_is_constant_evaluated()) - { - NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __popcll(__x);), (return __builtin_popcountll(__x);)) - } -# endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_CCCL_STD_VER >= 2014) - - return __fallback_popc64(static_cast(__x)); -# else // ^^^ _CCCL_COMPILER_NVRTC || nvcc < 11.3 ^^^ / vvv !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 vvv - return __builtin_popcountll(__x); -# endif // !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 -} - -#else // _CCCL_COMPILER_MSVC - -// Precondition: __x != 0 -inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_ctz(unsigned __x) -{ - static_assert(sizeof(unsigned) == sizeof(unsigned long), ""); - static_assert(sizeof(unsigned long) == 4, ""); -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && !defined(__CUDA_ARCH__) - if (!__libcpp_is_constant_evaluated()) - { - unsigned long __where = 0; - if (_BitScanForward(&__where, __x)) - { - return static_cast(__where); - } - return 32; - } -# endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__) - - return __binary_ctz32(static_cast(__x), 0); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_ctz(unsigned long __x) -{ - static_assert(sizeof(unsigned long) == sizeof(unsigned), ""); - return __libcpp_ctz(static_cast(__x)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int 
__libcpp_ctz(unsigned long long __x) -{ -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && !defined(__CUDA_ARCH__) - if (!__libcpp_is_constant_evaluated()) - { - unsigned long __where = 0; -# if defined(_LIBCUDACXX_HAS_BITSCAN64) && (defined(_M_AMD64) || defined(__x86_64__)) - if (_BitScanForward64(&__where, __x)) - { - return static_cast(__where); - } -# else - // Win32 doesn't have _BitScanForward64 so emulate it with two 32 bit calls. - if (_BitScanForward(&__where, static_cast(__x))) - { - return static_cast(__where); - } - if (_BitScanForward(&__where, static_cast(__x >> 32))) - { - return static_cast(__where + 32); - } -# endif - return 64; - } -# endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__) - - return __binary_ctz64(__x); -} - -// Precondition: __x != 0 -inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_clz(unsigned __x) -{ - static_assert(sizeof(unsigned) == sizeof(unsigned long), ""); - static_assert(sizeof(unsigned long) == 4, ""); -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && !defined(__CUDA_ARCH__) - if (!__libcpp_is_constant_evaluated()) - { - unsigned long __where = 0; - if (_BitScanReverse(&__where, __x)) - { - return static_cast(31 - __where); - } - return 32; // Undefined Behavior. - } -# endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__) - - return __binary_clz32(static_cast(__x), 0); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_clz(unsigned long __x) -{ - static_assert(sizeof(unsigned) == sizeof(unsigned long), ""); - return __libcpp_clz(static_cast(__x)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_clz(unsigned long long __x) -{ -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && !defined(__CUDA_ARCH__) - if (!__libcpp_is_constant_evaluated()) - { - unsigned long __where = 0; -# if defined(_LIBCUDACXX_HAS_BITSCAN64) - if (_BitScanReverse64(&__where, __x)) - { - return static_cast(63 - __where); - } -# else - // Win32 doesn't have _BitScanReverse64 so emulate it with two 32 bit calls. - if (_BitScanReverse(&__where, static_cast(__x >> 32))) - { - return static_cast(63 - (__where + 32)); - } - if (_BitScanReverse(&__where, static_cast(__x))) - { - return static_cast(63 - __where); - } -# endif - return 64; // Undefined Behavior. 
- } -# endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__) - - return __binary_clz64(static_cast(__x)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_popcount(unsigned __x) -{ - static_assert(sizeof(unsigned) == 4, ""); -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && !defined(__CUDA_ARCH__) - if (!__libcpp_is_constant_evaluated()) - { - return static_cast(__popcnt(__x)); - } -# endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__) - - return __fallback_popc64(static_cast(__x)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_popcount(unsigned long __x) -{ - static_assert(sizeof(unsigned long) == 4, ""); -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && !defined(__CUDA_ARCH__) - if (!__libcpp_is_constant_evaluated()) - { - return static_cast(__popcnt(__x)); - } -# endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__) - - return __fallback_popc64(static_cast(__x)); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY constexpr int __libcpp_popcount(unsigned long long __x) -{ - static_assert(sizeof(unsigned long long) == 8, ""); -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && !defined(__CUDA_ARCH__) - if (!__libcpp_is_constant_evaluated()) - { - return static_cast(__popcnt64(__x)); - } -# endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__ - - return __fallback_popc64(static_cast(__x)); -} - -#endif // _CCCL_COMPILER_MSVC - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR _Tp __rotl(_Tp __t, unsigned int __cnt) noexcept -{ - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__rotl requires unsigned"); - using __nlt = numeric_limits<_Tp>; - - return ((__cnt % __nlt::digits) == 0) - ? __t - : (__t << (__cnt % __nlt::digits)) | (__t >> (__nlt::digits - (__cnt % __nlt::digits))); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR _Tp __rotr(_Tp __t, unsigned int __cnt) noexcept -{ - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__rotr requires unsigned"); - using __nlt = numeric_limits<_Tp>; - - return ((__cnt % __nlt::digits) == 0) - ? __t - : (__t >> (__cnt % __nlt::digits)) | (__t << (__nlt::digits - (__cnt % __nlt::digits))); -} - -// Forward decl for recursive use in split word operations -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __countr_zero(_Tp __t) noexcept; - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t -__countr_zero_dispatch(_Tp __t) noexcept -{ - return __libcpp_ctz(static_cast(__t)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t -__countr_zero_dispatch(_Tp __t) noexcept -{ - return __libcpp_ctz(static_cast(__t)); -} - -template -struct __countr_zero_rsh_impl -{ - static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __short_circuit(_Tp __t, int __cur, int __count) - { - // Stops processing early if non-zero - return (__cur == numeric_limits::digits) - ? 
__countr_zero_rsh_impl<_Tp, _St - 1>::__count(__t, __cur + __count) - : __cur + __count; - } - - static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __count(_Tp __t, int __count) - { - return __short_circuit( - __t >> numeric_limits::digits, __countr_zero(static_cast(__t)), __count); - } -}; - -template -struct __countr_zero_rsh_impl<_Tp, 1> -{ - static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __count(_Tp __t, int __count) - { - return __count + __countr_zero(static_cast(__t)); - } -}; - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t<(sizeof(_Tp) > sizeof(unsigned long long)), int> -__countr_zero_dispatch(_Tp __t) noexcept -{ - return __countr_zero_rsh_impl<_Tp>::__count(__t, 0); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __countr_zero(_Tp __t) noexcept -{ - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countr_zero requires unsigned"); - - return __t ? __countr_zero_dispatch(__t) : numeric_limits<_Tp>::digits; -} - -// Forward decl for recursive use in split word operations -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __countl_zero(_Tp __t) noexcept; - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t -__countl_zero_dispatch(_Tp __t) noexcept -{ - return __libcpp_clz(static_cast(__t)) - - (numeric_limits::digits - numeric_limits<_Tp>::digits); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t -__countl_zero_dispatch(_Tp __t) noexcept -{ - return __libcpp_clz(static_cast(__t)) - - (numeric_limits::digits - numeric_limits<_Tp>::digits); -} - -template -struct __countl_zero_rotl_impl -{ - static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __short_circuit(_Tp __t, int __cur) - { - // This stops processing early if the current word is not empty - return (__cur == numeric_limits::digits) - ? __cur + __countl_zero_rotl_impl<_Tp, _St - 1>::__count(__t) - : __cur; - } - - static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __countl_iter(_Tp __t) - { - // After rotating pass result of clz to another step for processing - return __short_circuit(__t, __countl_zero(static_cast(__t))); - } - - static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __count(_Tp __t) - { - return __countl_iter(__rotl(__t, numeric_limits::digits)); - } -}; - -template -struct __countl_zero_rotl_impl<_Tp, 1> -{ - static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __count(_Tp __t) - { - return __countl_zero(static_cast(__rotl(__t, numeric_limits::digits))); - } -}; - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t<(sizeof(_Tp) > sizeof(unsigned long long)), int> -__countl_zero_dispatch(_Tp __t) noexcept -{ - return __countl_zero_rotl_impl<_Tp>::__count(__t); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __countl_zero(_Tp __t) noexcept -{ - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countl_zero requires unsigned"); - return __t ? __countl_zero_dispatch(__t) : numeric_limits<_Tp>::digits; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __countl_one(_Tp __t) noexcept -{ - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countl_one requires unsigned"); - return __t != numeric_limits<_Tp>::max() ? 
__countl_zero(static_cast<_Tp>(~__t)) : numeric_limits<_Tp>::digits; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __countr_one(_Tp __t) noexcept -{ - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countr_one requires unsigned"); - return __t != numeric_limits<_Tp>::max() ? __countr_zero(static_cast<_Tp>(~__t)) : numeric_limits<_Tp>::digits; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t -__popcount_dispatch(_Tp __t) noexcept -{ - return __libcpp_popcount(static_cast(__t)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t -__popcount_dispatch(_Tp __t) noexcept -{ - return __libcpp_popcount(static_cast(__t)); -} - -template -struct __popcount_rsh_impl -{ - static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __count(_Tp __t) - { - return __popcount_rsh_impl<_Tp, _St - 1>::__count(__t >> numeric_limits::digits) - + __libcpp_popcount(static_cast(__t)); - } -}; - -template -struct __popcount_rsh_impl<_Tp, 1> -{ - static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __count(_Tp __t) - { - return __libcpp_popcount(static_cast(__t)); - } -}; - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t<(sizeof(_Tp) > sizeof(unsigned long long)), int> -__popcount_dispatch(_Tp __t) noexcept -{ - return __popcount_rsh_impl<_Tp>::__count(__t); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR int __popcount(_Tp __t) noexcept -{ - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__libcpp_popcount requires unsigned"); - - return __popcount_dispatch(__t); -} - -// integral log base 2 -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR unsigned __bit_log2(_Tp __t) noexcept -{ - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__bit_log2 requires unsigned"); - return std::numeric_limits<_Tp>::digits - 1 - __countl_zero(__t); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR bool __has_single_bit(_Tp __t) noexcept -{ - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__has_single_bit requires unsigned"); - return __t != 0 && (((__t & (__t - 1)) == 0)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t= sizeof(unsigned), _Tp> -__ceil2(_Tp __t) noexcept -{ - // const unsigned __n = numeric_limits<_Tp>::digits - countl_zero((_Tp)(__t - 1u)); - // _LIBCUDACXX_DEBUG_ASSERT(__libcpp_is_constant_evaluated() || __n != numeric_limits<_Tp>::digits, "Bad input to - // ceil2"); - return _Tp{1} << (numeric_limits<_Tp>::digits - __countl_zero((_Tp) (__t - 1u))); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t -__ceil2(_Tp __t) noexcept -{ - // const unsigned __n = numeric_limits<_Tp>::digits - countl_zero((_Tp)(__t - 1u)); - // _LIBCUDACXX_DEBUG_ASSERT(__libcpp_is_constant_evaluated() || __n != numeric_limits<_Tp>::digits, "Bad input to - // ceil2"); - - // const unsigned __extra = numeric_limits::digits - numeric_limits<_Tp>::digits; - // const unsigned __retVal = 1u << (__n + __extra); - return (_Tp) ((1u << ((numeric_limits<_Tp>::digits - __countl_zero((_Tp) (__t - 1u))) - + (numeric_limits::digits - numeric_limits<_Tp>::digits))) - >> (numeric_limits::digits - numeric_limits<_Tp>::digits)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> -rotl(_Tp __t, unsigned int __cnt) noexcept -{ - return __rotl(__t, __cnt); -} 
- -// rotr -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> -rotr(_Tp __t, unsigned int __cnt) noexcept -{ - return __rotr(__t, __cnt); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> -countl_zero(_Tp __t) noexcept -{ - return __countl_zero(__t); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> -countl_one(_Tp __t) noexcept -{ - return __countl_one(__t); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> -countr_zero(_Tp __t) noexcept -{ - return __countr_zero(__t); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> -countr_one(_Tp __t) noexcept -{ - return __countr_one(__t); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> -popcount(_Tp __t) noexcept -{ - return __popcount(__t); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, bool> -has_single_bit(_Tp __t) noexcept -{ - return __has_single_bit(__t); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> -bit_floor(_Tp __t) noexcept -{ - return __t == 0 ? 0 : static_cast<_Tp>(_Tp{1} << __bit_log2(__t)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> -bit_ceil(_Tp __t) noexcept -{ - return (__t < 2) ? 1 : static_cast<_Tp>(__ceil2(__t)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR __enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> -bit_width(_Tp __t) noexcept -{ - return __t == 0 ? 
0 : static_cast<_Tp>(__bit_log2(__t) + 1); -} - -enum class endian -{ - little = 0xDEAD, - big = 0xFACE, -#if defined(_LIBCUDACXX_LITTLE_ENDIAN) - native = little -#elif defined(_LIBCUDACXX_BIG_ENDIAN) - native = big -#else - native = 0xCAFE -#endif -}; - -_LIBCUDACXX_END_NAMESPACE_STD - -_CCCL_POP_MACROS - -#endif // _LIBCUDACXX_BIT diff --git a/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countl_one.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countl_one.pass.cpp index 111f6f0331..a2b9cdf2d0 100644 --- a/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countl_one.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countl_one.pass.cpp @@ -39,15 +39,26 @@ template __host__ __device__ constexpr bool constexpr_test() { using nl = cuda::std::numeric_limits; - return cuda::std::countl_one(nl::max()) == nl::digits && cuda::std::countl_one(T(nl::max() - 1)) == nl::digits - 1 - && cuda::std::countl_one(T(nl::max() - 2)) == nl::digits - 2 - && cuda::std::countl_one(T(nl::max() - 3)) == nl::digits - 2 - && cuda::std::countl_one(T(nl::max() - 4)) == nl::digits - 3 - && cuda::std::countl_one(T(nl::max() - 5)) == nl::digits - 3 - && cuda::std::countl_one(T(nl::max() - 6)) == nl::digits - 3 - && cuda::std::countl_one(T(nl::max() - 7)) == nl::digits - 3 - && cuda::std::countl_one(T(nl::max() - 8)) == nl::digits - 4 - && cuda::std::countl_one(T(nl::max() - 9)) == nl::digits - 4; + + static_assert(cuda::std::countl_one(nl::max()) == nl::digits, ""); + static_assert(cuda::std::countl_one(T(nl::max() - 1)) == nl::digits - 1, ""); + static_assert(cuda::std::countl_one(T(nl::max() - 2)) == nl::digits - 2, ""); + static_assert(cuda::std::countl_one(T(nl::max() - 3)) == nl::digits - 2, ""); + static_assert(cuda::std::countl_one(T(nl::max() - 4)) == nl::digits - 3, ""); + static_assert(cuda::std::countl_one(T(nl::max() - 5)) == nl::digits - 3, ""); + static_assert(cuda::std::countl_one(T(nl::max() - 6)) == nl::digits - 3, ""); + static_assert(cuda::std::countl_one(T(nl::max() - 7)) == nl::digits - 3, ""); + static_assert(cuda::std::countl_one(T(nl::max() - 8)) == nl::digits - 4, ""); + static_assert(cuda::std::countl_one(T(nl::max() - 9)) == nl::digits - 4, ""); + + return true; +} + +template +__host__ __device__ inline void assert_countl_one(T val, int expected) +{ + volatile auto v = val; + assert(cuda::std::countl_one(v) == expected); } template @@ -57,36 +68,36 @@ __host__ __device__ void runtime_test() ASSERT_NOEXCEPT(cuda::std::countl_one(T(0))); const int dig = cuda::std::numeric_limits::digits; - assert(cuda::std::countl_one(T(~121)) == dig - 7); - assert(cuda::std::countl_one(T(~122)) == dig - 7); - assert(cuda::std::countl_one(T(~123)) == dig - 7); - assert(cuda::std::countl_one(T(~124)) == dig - 7); - assert(cuda::std::countl_one(T(~125)) == dig - 7); - assert(cuda::std::countl_one(T(~126)) == dig - 7); - assert(cuda::std::countl_one(T(~127)) == dig - 7); - assert(cuda::std::countl_one(T(~128)) == dig - 8); - assert(cuda::std::countl_one(T(~129)) == dig - 8); - assert(cuda::std::countl_one(T(~130)) == dig - 8); + assert_countl_one(T(~121), dig - 7); + assert_countl_one(T(~122), dig - 7); + assert_countl_one(T(~123), dig - 7); + assert_countl_one(T(~124), dig - 7); + assert_countl_one(T(~125), dig - 7); + assert_countl_one(T(~126), dig - 7); + assert_countl_one(T(~127), dig - 7); + assert_countl_one(T(~128), dig - 8); + assert_countl_one(T(~129), dig - 8); + assert_countl_one(T(~130), dig - 8); } int main(int, char**) { - 
static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); #ifndef _LIBCUDACXX_HAS_NO_INT128 - static_assert(constexpr_test<__uint128_t>(), ""); + constexpr_test<__uint128_t>(); #endif runtime_test(); diff --git a/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countl_zero.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countl_zero.pass.cpp index 5f2bab54d6..929d5c3d69 100644 --- a/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countl_zero.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countl_zero.pass.cpp @@ -53,6 +53,13 @@ __host__ __device__ constexpr bool constexpr_test() return true; } +template +__host__ __device__ inline void assert_countl_zero(T val, int expected) +{ + volatile auto v = val; + assert(cuda::std::countl_zero(v) == expected); +} + template __host__ __device__ void runtime_test() { @@ -60,36 +67,36 @@ __host__ __device__ void runtime_test() ASSERT_NOEXCEPT(cuda::std::countl_zero(T(0))); const int dig = cuda::std::numeric_limits::digits; - assert(cuda::std::countl_zero(T(121)) == dig - 7); - assert(cuda::std::countl_zero(T(122)) == dig - 7); - assert(cuda::std::countl_zero(T(123)) == dig - 7); - assert(cuda::std::countl_zero(T(124)) == dig - 7); - assert(cuda::std::countl_zero(T(125)) == dig - 7); - assert(cuda::std::countl_zero(T(126)) == dig - 7); - assert(cuda::std::countl_zero(T(127)) == dig - 7); - assert(cuda::std::countl_zero(T(128)) == dig - 8); - assert(cuda::std::countl_zero(T(129)) == dig - 8); - assert(cuda::std::countl_zero(T(130)) == dig - 8); + assert_countl_zero(T(121), dig - 7); + assert_countl_zero(T(122), dig - 7); + assert_countl_zero(T(123), dig - 7); + assert_countl_zero(T(124), dig - 7); + assert_countl_zero(T(125), dig - 7); + assert_countl_zero(T(126), dig - 7); + assert_countl_zero(T(127), dig - 7); + assert_countl_zero(T(128), dig - 8); + assert_countl_zero(T(129), dig - 8); + assert_countl_zero(T(130), dig - 8); } int main(int, char**) { - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); #ifndef _LIBCUDACXX_HAS_NO_INT128 - static_assert(constexpr_test<__uint128_t>(), ""); + constexpr_test<__uint128_t>(); #endif runtime_test(); diff --git 
a/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countr_one.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countr_one.pass.cpp index cf80a8a5a7..74e81bb119 100644 --- a/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countr_one.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countr_one.pass.cpp @@ -38,11 +38,26 @@ enum class E2 : unsigned char template __host__ __device__ constexpr bool constexpr_test() { - return cuda::std::countr_one(T(0)) == 0 && cuda::std::countr_one(T(1)) == 1 && cuda::std::countr_one(T(2)) == 0 - && cuda::std::countr_one(T(3)) == 2 && cuda::std::countr_one(T(4)) == 0 && cuda::std::countr_one(T(5)) == 1 - && cuda::std::countr_one(T(6)) == 0 && cuda::std::countr_one(T(7)) == 3 && cuda::std::countr_one(T(8)) == 0 - && cuda::std::countr_one(T(9)) == 1 - && cuda::std::countr_one(cuda::std::numeric_limits::max()) == cuda::std::numeric_limits::digits; + static_assert(cuda::std::countr_one(T(2)) == 0, ""); + static_assert(cuda::std::countr_one(T(3)) == 2, ""); + static_assert(cuda::std::countr_one(T(4)) == 0, ""); + static_assert(cuda::std::countr_one(T(5)) == 1, ""); + static_assert(cuda::std::countr_one(T(6)) == 0, ""); + static_assert(cuda::std::countr_one(T(7)) == 3, ""); + static_assert(cuda::std::countr_one(T(8)) == 0, ""); + static_assert(cuda::std::countr_one(T(9)) == 1, ""); + static_assert(cuda::std::countr_one(T(0)) == 0, ""); + static_assert(cuda::std::countr_one(T(1)) == 1, ""); + static_assert(cuda::std::countr_one(cuda::std::numeric_limits::max()) == cuda::std::numeric_limits::digits, ""); + + return true; +} + +template +__host__ __device__ inline void assert_countr_one(T val, int expected) +{ + volatile auto v = val; + assert(cuda::std::countr_one(v) == expected); } template @@ -51,36 +66,36 @@ __host__ __device__ void runtime_test() ASSERT_SAME_TYPE(int, decltype(cuda::std::countr_one(T(0)))); ASSERT_NOEXCEPT(cuda::std::countr_one(T(0))); - assert(cuda::std::countr_one(T(121)) == 1); - assert(cuda::std::countr_one(T(122)) == 0); - assert(cuda::std::countr_one(T(123)) == 2); - assert(cuda::std::countr_one(T(124)) == 0); - assert(cuda::std::countr_one(T(125)) == 1); - assert(cuda::std::countr_one(T(126)) == 0); - assert(cuda::std::countr_one(T(127)) == 7); - assert(cuda::std::countr_one(T(128)) == 0); - assert(cuda::std::countr_one(T(129)) == 1); - assert(cuda::std::countr_one(T(130)) == 0); + assert_countr_one(T(121), 1); + assert_countr_one(T(122), 0); + assert_countr_one(T(123), 2); + assert_countr_one(T(124), 0); + assert_countr_one(T(125), 1); + assert_countr_one(T(126), 0); + assert_countr_one(T(127), 7); + assert_countr_one(T(128), 0); + assert_countr_one(T(129), 1); + assert_countr_one(T(130), 0); } int main(int, char**) { - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); #ifndef _LIBCUDACXX_HAS_NO_INT128 - 
static_assert(constexpr_test<__uint128_t>(), ""); + constexpr_test<__uint128_t>(); #endif runtime_test(); diff --git a/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countr_zero.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countr_zero.pass.cpp index 4c4da8cead..75a552ccf9 100644 --- a/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countr_zero.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countr_zero.pass.cpp @@ -11,7 +11,7 @@ // template // constexpr int countr_zero(T x) noexcept; -// Returns: The number of consecutive 0 bits, starting from the most significant bit. +// Returns: The number of consecutive 0 bits, starting from the least significant bit. // [ Note: Returns N if x == 0. ] // // Remarks: This function shall not participate in overload resolution unless @@ -38,11 +38,26 @@ enum class E2 : unsigned char template __host__ __device__ constexpr bool constexpr_test() { - return cuda::std::countr_zero(T(0)) == cuda::std::numeric_limits::digits && cuda::std::countr_zero(T(1)) == 0 - && cuda::std::countr_zero(T(2)) == 1 && cuda::std::countr_zero(T(3)) == 0 && cuda::std::countr_zero(T(4)) == 2 - && cuda::std::countr_zero(T(5)) == 0 && cuda::std::countr_zero(T(6)) == 1 && cuda::std::countr_zero(T(7)) == 0 - && cuda::std::countr_zero(T(8)) == 3 && cuda::std::countr_zero(T(9)) == 0 - && cuda::std::countr_zero(cuda::std::numeric_limits::max()) == 0; + static_assert(cuda::std::countr_zero(T(1)) == 0, ""); + static_assert(cuda::std::countr_zero(T(2)) == 1, ""); + static_assert(cuda::std::countr_zero(T(3)) == 0, ""); + static_assert(cuda::std::countr_zero(T(4)) == 2, ""); + static_assert(cuda::std::countr_zero(T(5)) == 0, ""); + static_assert(cuda::std::countr_zero(T(6)) == 1, ""); + static_assert(cuda::std::countr_zero(T(7)) == 0, ""); + static_assert(cuda::std::countr_zero(T(8)) == 3, ""); + static_assert(cuda::std::countr_zero(T(9)) == 0, ""); + static_assert(cuda::std::countr_zero(T(0)) == cuda::std::numeric_limits::digits, ""); + static_assert(cuda::std::countr_zero(cuda::std::numeric_limits::max()) == 0, ""); + + return true; +} + +template +__host__ __device__ inline void assert_countr_zero(T val, int expected) +{ + volatile auto v = val; + assert(cuda::std::countr_zero(v) == expected); } template @@ -51,36 +66,36 @@ __host__ __device__ void runtime_test() ASSERT_SAME_TYPE(int, decltype(cuda::std::countr_zero(T(0)))); ASSERT_NOEXCEPT(cuda::std::countr_zero(T(0))); - assert(cuda::std::countr_zero(T(121)) == 0); - assert(cuda::std::countr_zero(T(122)) == 1); - assert(cuda::std::countr_zero(T(123)) == 0); - assert(cuda::std::countr_zero(T(124)) == 2); - assert(cuda::std::countr_zero(T(125)) == 0); - assert(cuda::std::countr_zero(T(126)) == 1); - assert(cuda::std::countr_zero(T(127)) == 0); - assert(cuda::std::countr_zero(T(128)) == 7); - assert(cuda::std::countr_zero(T(129)) == 0); - assert(cuda::std::countr_zero(T(130)) == 1); + assert_countr_zero(T(121), 0); + assert_countr_zero(T(122), 1); + assert_countr_zero(T(123), 0); + assert_countr_zero(T(124), 2); + assert_countr_zero(T(125), 0); + assert_countr_zero(T(126), 1); + assert_countr_zero(T(127), 0); + assert_countr_zero(T(128), 7); + assert_countr_zero(T(129), 0); + assert_countr_zero(T(130), 1); } int main(int, char**) { - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - - static_assert(constexpr_test(), ""); - 
static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); #ifndef _LIBCUDACXX_HAS_NO_INT128 - static_assert(constexpr_test<__uint128_t>(), ""); + constexpr_test<__uint128_t>(); #endif runtime_test(); diff --git a/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/popcount.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/popcount.pass.cpp index 8e70c9ae56..393b9d31bd 100644 --- a/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/popcount.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/popcount.pass.cpp @@ -42,11 +42,26 @@ enum class E2 : unsigned char template __host__ __device__ constexpr bool constexpr_test() { - return cuda::std::popcount(T(0)) == 0 && cuda::std::popcount(T(1)) == 1 && cuda::std::popcount(T(2)) == 1 - && cuda::std::popcount(T(3)) == 2 && cuda::std::popcount(T(4)) == 1 && cuda::std::popcount(T(5)) == 2 - && cuda::std::popcount(T(6)) == 2 && cuda::std::popcount(T(7)) == 3 && cuda::std::popcount(T(8)) == 1 - && cuda::std::popcount(T(9)) == 2 - && cuda::std::popcount(cuda::std::numeric_limits::max()) == cuda::std::numeric_limits::digits; + static_assert(cuda::std::popcount(T(0)) == 0, ""); + static_assert(cuda::std::popcount(T(1)) == 1, ""); + static_assert(cuda::std::popcount(T(2)) == 1, ""); + static_assert(cuda::std::popcount(T(3)) == 2, ""); + static_assert(cuda::std::popcount(T(4)) == 1, ""); + static_assert(cuda::std::popcount(T(5)) == 2, ""); + static_assert(cuda::std::popcount(T(6)) == 2, ""); + static_assert(cuda::std::popcount(T(7)) == 3, ""); + static_assert(cuda::std::popcount(T(8)) == 1, ""); + static_assert(cuda::std::popcount(T(9)) == 2, ""); + static_assert(cuda::std::popcount(cuda::std::numeric_limits::max()) == cuda::std::numeric_limits::digits, ""); + + return true; +} + +template +__host__ __device__ inline void assert_popcount(T val, int expected) +{ + volatile auto v = val; + assert(cuda::std::popcount(v) == expected); } template @@ -55,36 +70,36 @@ __host__ __device__ void runtime_test() ASSERT_SAME_TYPE(int, decltype(cuda::std::popcount(T(0)))); ASSERT_NOEXCEPT(cuda::std::popcount(T(0))); - assert(cuda::std::popcount(T(121)) == 5); - assert(cuda::std::popcount(T(122)) == 5); - assert(cuda::std::popcount(T(123)) == 6); - assert(cuda::std::popcount(T(124)) == 5); - assert(cuda::std::popcount(T(125)) == 6); - assert(cuda::std::popcount(T(126)) == 6); - assert(cuda::std::popcount(T(127)) == 7); - assert(cuda::std::popcount(T(128)) == 1); - assert(cuda::std::popcount(T(129)) == 2); - assert(cuda::std::popcount(T(130)) == 2); + assert_popcount(T(121), 5); + assert_popcount(T(122), 5); + assert_popcount(T(123), 6); + assert_popcount(T(124), 5); + assert_popcount(T(125), 6); + assert_popcount(T(126), 6); + assert_popcount(T(127), 7); + assert_popcount(T(128), 1); + assert_popcount(T(129), 2); + assert_popcount(T(130), 2); } int main(int, char**) { - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - - static_assert(constexpr_test(), ""); - 
static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); - static_assert(constexpr_test(), ""); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); + constexpr_test(); #ifndef _LIBCUDACXX_HAS_NO_INT128 - static_assert(constexpr_test<__uint128_t>(), ""); + constexpr_test<__uint128_t>(); #endif runtime_test(); From fed3ec1abe2d603e22ee12fa9a61010ae9b9b553 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 16 Aug 2024 16:41:27 +0200 Subject: [PATCH 31/33] Proclaim pair and tuple trivially relocatable (#2010) --- .../cuda/std/detail/libcxx/include/complex | 3 + thrust/testing/type_traits.cu | 59 +++++++++++++++++++ thrust/thrust/pair.h | 8 +++ thrust/thrust/tuple.h | 6 ++ .../type_traits/is_trivially_relocatable.h | 16 +++++ 5 files changed, 92 insertions(+) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/complex b/libcudacxx/include/cuda/std/detail/libcxx/include/complex index dc596bd65b..b03b7d9ee6 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/complex +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/complex @@ -1467,6 +1467,9 @@ inline namespace literals inline namespace complex_literals { # ifdef _LIBCUDACXX_HAS_COMPLEX_LONG_DOUBLE +// NOTE: if you get a warning from GCC <7 here that "literal operator suffixes not preceded by ‘_’ are reserved for +// future standardization" then we are sorry. The warning was implemented before GCC 7, but can only be disabled since +// GCC 7. See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69523 _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_INLINE_VISIBILITY constexpr complex operator""il(long double __im) { return {0.0l, __im}; diff --git a/thrust/testing/type_traits.cu b/thrust/testing/type_traits.cu index bab73c76c4..f4ba3d0896 100644 --- a/thrust/testing/type_traits.cu +++ b/thrust/testing/type_traits.cu @@ -5,8 +5,17 @@ #include #include #include +#include +#include #include +#if defined(THRUST_GCC_VERSION) && THRUST_GCC_VERSION >= 70000 +// This header pulls in an unsuppressable warning on GCC 6 +# include +#endif // defined(THRUST_GCC_VERSION) && THRUST_GCC_VERSION >= 70000 +#include +#include + #include void TestIsContiguousIterator() @@ -146,3 +155,53 @@ void TestIsCommutative() } } DECLARE_UNITTEST(TestIsCommutative); + +struct NonTriviallyCopyable +{ + NonTriviallyCopyable(const NonTriviallyCopyable&) {} +}; +THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(NonTriviallyCopyable); + +static_assert(!::cuda::std::is_trivially_copyable::value, ""); +static_assert(thrust::is_trivially_relocatable::value, ""); + +void TestTriviallyRelocatable() +{ + static_assert(thrust::is_trivially_relocatable::value, ""); +#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA + static_assert(thrust::is_trivially_relocatable<__half>::value, ""); + static_assert(thrust::is_trivially_relocatable::value, ""); + static_assert(thrust::is_trivially_relocatable::value, ""); + static_assert(thrust::is_trivially_relocatable::value, ""); + static_assert(thrust::is_trivially_relocatable::value, ""); +# ifndef _LIBCUDACXX_HAS_NO_INT128 + static_assert(thrust::is_trivially_relocatable<__int128>::value, ""); +# endif // _LIBCUDACXX_HAS_NO_INT128 +#endif // THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA +#if defined(THRUST_GCC_VERSION) && 
THRUST_GCC_VERSION >= 70000 + static_assert(thrust::is_trivially_relocatable>::value, ""); + static_assert(thrust::is_trivially_relocatable<::cuda::std::complex>::value, ""); + static_assert(thrust::is_trivially_relocatable>>::value, ""); + static_assert(thrust::is_trivially_relocatable<::cuda::std::pair>>::value, ""); + static_assert(thrust::is_trivially_relocatable, char>>::value, ""); + static_assert(thrust::is_trivially_relocatable<::cuda::std::tuple, char>>::value, + ""); +#endif // defined(THRUST_GCC_VERSION) && THRUST_GCC_VERSION >= 70000 + static_assert(thrust::is_trivially_relocatable< + ::cuda::std::tuple>>, + thrust::tuple<::cuda::std::pair>, int>>>::value, + ""); + + static_assert(!thrust::is_trivially_relocatable>::value, ""); + static_assert(!thrust::is_trivially_relocatable<::cuda::std::pair>::value, ""); + static_assert(!thrust::is_trivially_relocatable>::value, ""); + static_assert(!thrust::is_trivially_relocatable<::cuda::std::tuple>::value, ""); + + // test propagation of relocatability through pair and tuple + static_assert(thrust::is_trivially_relocatable::value, ""); + static_assert(thrust::is_trivially_relocatable>::value, ""); + static_assert(thrust::is_trivially_relocatable<::cuda::std::pair>::value, ""); + static_assert(thrust::is_trivially_relocatable>::value, ""); + static_assert(thrust::is_trivially_relocatable<::cuda::std::tuple>::value, ""); +}; +DECLARE_UNITTEST(TestTriviallyRelocatable); diff --git a/thrust/thrust/pair.h b/thrust/thrust/pair.h index 0e567a35b6..def1aeaf17 100644 --- a/thrust/thrust/pair.h +++ b/thrust/thrust/pair.h @@ -30,6 +30,9 @@ # pragma system_header #endif // no system header +#include + +#include #include THRUST_NAMESPACE_BEGIN @@ -117,6 +120,11 @@ make_pair(T1&& t1, T2&& t2) using _CUDA_VSTD::get; +template +struct proclaim_trivially_relocatable> + : ::cuda::std::conjunction, is_trivially_relocatable> +{}; + /*! \endcond */ diff --git a/thrust/thrust/tuple.h b/thrust/thrust/tuple.h index 1ff1031804..3e7cd87dae 100644 --- a/thrust/thrust/tuple.h +++ b/thrust/thrust/tuple.h @@ -39,6 +39,8 @@ # pragma system_header #endif // no system header +#include + #include #include #include @@ -232,6 +234,10 @@ inline _CCCL_HOST_DEVICE tuple tie(Ts&... ts) noexcept using _CUDA_VSTD::get; +template +struct proclaim_trivially_relocatable> : ::cuda::std::conjunction...> +{}; + /*! \endcond */ diff --git a/thrust/thrust/type_traits/is_trivially_relocatable.h b/thrust/thrust/type_traits/is_trivially_relocatable.h index 7732d7c6ab..8566a51057 100644 --- a/thrust/thrust/type_traits/is_trivially_relocatable.h +++ b/thrust/thrust/type_traits/is_trivially_relocatable.h @@ -36,6 +36,10 @@ #include #include +#include +#include +#include + #include THRUST_NAMESPACE_BEGIN @@ -285,6 +289,18 @@ THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(double3) THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(double4) #endif +THRUST_NAMESPACE_BEGIN +template +struct proclaim_trivially_relocatable<::cuda::std::pair> + : ::cuda::std::conjunction, is_trivially_relocatable> +{}; + +template +struct proclaim_trivially_relocatable<::cuda::std::tuple> + : ::cuda::std::conjunction...> +{}; +THRUST_NAMESPACE_END + /*! 
\endcond */ From 4a5dcc4f9e3ebaddcc05f2fb1b243d852f7cea99 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 16 Aug 2024 18:55:37 +0200 Subject: [PATCH 32/33] Make `cuda::std::min` constexpr in C++11 (#2249) This should fix our rmm builds --- libcudacxx/include/cuda/std/__algorithm/min.h | 8 ++------ .../std/algorithms/alg.sorting/alg.min.max/min.pass.cpp | 4 ++++ .../algorithms/alg.sorting/alg.min.max/min_comp.pass.cpp | 4 ++++ 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/libcudacxx/include/cuda/std/__algorithm/min.h b/libcudacxx/include/cuda/std/__algorithm/min.h index 5d1d826038..047d3eb294 100644 --- a/libcudacxx/include/cuda/std/__algorithm/min.h +++ b/libcudacxx/include/cuda/std/__algorithm/min.h @@ -30,20 +30,18 @@ _CCCL_PUSH_MACROS _LIBCUDACXX_BEGIN_NAMESPACE_STD template -_CCCL_NODISCARD inline _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 const _Tp& +_CCCL_NODISCARD inline _LIBCUDACXX_INLINE_VISIBILITY constexpr const _Tp& min(const _Tp& __a, const _Tp& __b, _Compare __comp) { return __comp(__b, __a) ? __b : __a; } template -_CCCL_NODISCARD inline _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 const _Tp& min(const _Tp& __a, const _Tp& __b) +_CCCL_NODISCARD inline _LIBCUDACXX_INLINE_VISIBILITY constexpr const _Tp& min(const _Tp& __a, const _Tp& __b) { return _CUDA_VSTD::min(__a, __b, __less{}); } -#ifndef _LIBCUDACXX_CXX03_LANG - template _CCCL_NODISCARD inline _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 _Tp min(initializer_list<_Tp> __t, _Compare __comp) @@ -57,8 +55,6 @@ _CCCL_NODISCARD inline _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 _Tp m return *_CUDA_VSTD::min_element(__t.begin(), __t.end(), __less{}); } -#endif // _LIBCUDACXX_CXX03_LANG - _LIBCUDACXX_END_NAMESPACE_STD _CCCL_POP_MACROS diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/min.pass.cpp b/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/min.pass.cpp index 9f66fc5468..8d35bf42af 100644 --- a/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/min.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/min.pass.cpp @@ -54,6 +54,10 @@ int main(int, char**) test(); #if TEST_STD_VER >= 2014 static_assert(test(), ""); +#else // TEST_STD_VER >= 2014 + constexpr int x = 0; + constexpr int y = 1; + static_assert(&cuda::std::min(x, y) == &x, ""); #endif // TEST_STD_VER >= 2014 return 0; diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/min_comp.pass.cpp b/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/min_comp.pass.cpp index aac001f744..b08c1948a2 100644 --- a/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/min_comp.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.min.max/min_comp.pass.cpp @@ -56,6 +56,10 @@ int main(int, char**) test(); #if TEST_STD_VER >= 2014 static_assert(test(), ""); +#else // TEST_STD_VER >= 2014 + constexpr int x = 0; + constexpr int y = 1; + static_assert(&cuda::std::min(x, y, cuda::std::greater()) == &y, ""); #endif // TEST_STD_VER >= 2014 return 0; From ba9e9bbc20dca1ac49b5da6b4d1716d85b4f495e Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 16 Aug 2024 19:33:47 +0200 Subject: [PATCH 33/33] Add `CCCL_DISABLE_NVTX` macro (#2173) Fixes: #2172 --- cub/cub/detail/nvtx.cuh | 15 +++++++++++---- cub/test/test_nvtx_disabled.cu | 19 +++++++++++++++++++ 2 files changed, 30 insertions(+), 4 deletions(-) create mode 100644 cub/test/test_nvtx_disabled.cu 
diff --git a/cub/cub/detail/nvtx.cuh b/cub/cub/detail/nvtx.cuh index d570df3adc..a8422263fa 100644 --- a/cub/cub/detail/nvtx.cuh +++ b/cub/cub/detail/nvtx.cuh @@ -37,11 +37,16 @@ # pragma system_header #endif // no system header +#ifdef DOXYGEN_SHOULD_SKIP_THIS // Only parse this during doxygen passes: +//! When this macro is defined, no NVTX ranges are emitted by CCCL +# define CCCL_DISABLE_NVTX +#endif // DOXYGEN_SHOULD_SKIP_THIS + // Enable the functionality of this header if: // * The NVTX3 C API is available in CTK -// * NVTX is not explicitly disabled +// * NVTX is not explicitly disabled (via CCCL_DISABLE_NVTX or NVTX_DISABLE) // * C++14 is availabl for cuda::std::optional -#if __has_include( ) && !defined(NVTX_DISABLE) && _CCCL_STD_VER >= 2014 +#if __has_include( ) && !defined(CCCL_DISABLE_NVTX) && !defined(NVTX_DISABLE) && _CCCL_STD_VER >= 2014 // Include our NVTX3 C++ wrapper if not available from the CTK # if __has_include() // TODO(bgruber): replace by a check for the first CTK version shipping the header # include @@ -96,7 +101,9 @@ CUB_NAMESPACE_END # define CUB_DETAIL_NVTX_RANGE_SCOPE_IF(condition, name) # define CUB_DETAIL_NVTX_RANGE_SCOPE(name) # endif // NVTX3_CPP_DEFINITIONS_V1_0 -#else // __has_include( ) && !defined(NVTX_DISABLE) && _CCCL_STD_VER >= 2014 +#else // __has_include( ) && !defined(CCCL_DISABLE_NVTX) && !defined(NVTX_DISABLE) && _CCCL_STD_VER + // >= 2014 # define CUB_DETAIL_NVTX_RANGE_SCOPE_IF(condition, name) # define CUB_DETAIL_NVTX_RANGE_SCOPE(name) -#endif // __has_include( ) && !defined(NVTX_DISABLE) && _CCCL_STD_VER >= 2014 +#endif // __has_include( ) && !defined(CCCL_DISABLE_NVTX) && !defined(NVTX_DISABLE) && _CCCL_STD_VER + // >= 2014 diff --git a/cub/test/test_nvtx_disabled.cu b/cub/test/test_nvtx_disabled.cu new file mode 100644 index 0000000000..c6eba196b1 --- /dev/null +++ b/cub/test/test_nvtx_disabled.cu @@ -0,0 +1,19 @@ +#define CUB_DETAIL_BEFORE_NVTX_RANGE_SCOPE(name) static_assert(false, ""); +#define CCCL_DISABLE_NVTX + +#include + +#include + +#include + +#if defined(CCCL_DISABLE_NVTX) && defined(NVTX_VERSION) +# error "NVTX was included somewhere even though it is turned off via CCCL_DISABLE_NVTX" +#endif // defined(CCCL_DISABLE_NVTX) && defined(NVTX_VERSION) + +int main() +{ + thrust::counting_iterator it{0}; + cub::DeviceFor::ForEach(it, it + 16, ::cuda::std::negate{}); + cudaDeviceSynchronize(); +}
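
Usage sketch (illustrative only, not part of the patch series; the include paths and the `int` element type are assumptions based on the new `cub/test/test_nvtx_disabled.cu` test above): a consuming translation unit can turn off every NVTX range emitted by CCCL by defining `CCCL_DISABLE_NVTX` before the first CCCL include, or equivalently by passing `-DCCCL_DISABLE_NVTX` on the compiler command line:

// Hypothetical consumer TU (compiled with nvcc as a .cu file).
// CCCL_DISABLE_NVTX must be defined before any CCCL header is included.
#define CCCL_DISABLE_NVTX

#include <cub/device/device_for.cuh>            // assumed path for cub::DeviceFor::ForEach
#include <thrust/iterator/counting_iterator.h>  // assumed path for thrust::counting_iterator
#include <cuda/std/functional>                  // assumed path for cuda::std::negate

int main()
{
  thrust::counting_iterator<int> it{0};
  // With CCCL_DISABLE_NVTX defined, CUB_DETAIL_NVTX_RANGE_SCOPE expands to nothing,
  // so this algorithm invocation emits no NVTX range and pulls in no NVTX headers.
  cub::DeviceFor::ForEach(it, it + 16, ::cuda::std::negate<int>{});
  cudaDeviceSynchronize();
  return 0;
}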